In [None]:
!pip install gensim

import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [None]:
# Load the movie metadata table; the path is relative to the notebook's working directory.
df = pd.read_csv("dataset.csv")

In [None]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [None]:
# Inspect dtypes and non-null counts (used to spot columns with missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


In [None]:
# Build one free-text "evidence" string per movie from genre, overview, rating and language.
# - `.astype(str)` converts vote_average element-wise; the built-in `str(series)` would instead
#   produce the printed repr of the *whole* Series and append that same blob to every row.
# - `.fillna('')` keeps movies with a missing genre/overview: concatenating a string with NaN
#   yields NaN, which would otherwise wipe out the entire evidence value for those rows.
df["evidence"] = (
    df['genre'].fillna('')
    + " " + df['overview'].fillna('')
    + " " + df['vote_average'].astype(str)
    + " " + df['original_language']
)

In [None]:
# Check that the new `evidence` column was added.
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,evidence
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731,"Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280,"Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811,"Drama,Crime In the continuing saga of the Corl..."


In [None]:
# Keep only the identifier, the title and the combined text used for modelling.
# `.copy()` makes this an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df = df[['id','title','evidence']].copy()

In [None]:
# Confirm that only id, title and evidence remain.
df.head()

Unnamed: 0,id,title,evidence
0,278,The Shawshank Redemption,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,"Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,"Drama,Crime In the continuing saga of the Corl..."


In [None]:
# Tokenise the evidence text into a list of lowercase alphabetic words per movie.
# `assign` returns a new frame instead of writing into a possibly-sliced one, which
# avoids SettingWithCopyWarning.
df = df.assign(
    tokens=(
        df['evidence']
        .str.lower()                                # make lowercase
        .str.replace("['’]", '', regex=True)        # drop apostrophes: "schindler's" -> "schindlers"
        # Turn every other run of non-letters into a separator. Deleting them outright
        # would fuse neighbours, e.g. "drama,crime" -> "dramacrime".
        .str.replace('[^a-z]+', ' ', regex=True)
        .str.split()                                # split into word list
    )
)


In [None]:
# Keep only rows where tokenisation produced a list (e.g. a NaN evidence value stays NaN
# through the `.str` pipeline and is dropped here). `.copy()` detaches the result so that
# columns can be added to it later without SettingWithCopyWarning.
df = df[df['tokens'].apply(lambda x: isinstance(x, list))].copy()

# Train word embeddings on the per-movie token lists.
model = Word2Vec(
    sentences=df['tokens'].tolist(),  # plain list of token lists; can be iterated repeatedly
    vector_size=100,                  # dimensionality of each word vector
    window=5,                         # context words considered on each side
    min_count=1,                      # keep every word, even those seen once
    workers=4,
    # Fixed seed so re-running the notebook starts from the same initial weights.
    # NOTE: gensim only guarantees bit-for-bit reproducibility with workers=1 (and a fixed
    # PYTHONHASHSEED); with several workers thread scheduling still adds small jitter.
    seed=42,
)

In [None]:
def get_average(tokens,model):
  """Return the mean word vector of `tokens` under a trained embedding model.

  Args:
    tokens: iterable of word strings for one document.
    model: object exposing `model.wv`, which supports `token in model.wv`,
      `model.wv[token]` and a `vector_size` attribute (e.g. a gensim Word2Vec model).

  Returns:
    A 1-D float64 numpy array of length `model.wv.vector_size`: the element-wise
    mean of the vectors of all tokens known to the model, or an all-zero vector
    when none of the tokens are in the vocabulary (including empty input).
  """
  # Tokens missing from the vocabulary are skipped rather than counted as zeros,
  # so they do not drag the average towards the origin.
  vectors = [model.wv[token] for token in tokens if token in model.wv]

  if not vectors:
    # Size the fallback from the model itself instead of a hard-coded 100, so the
    # function keeps working if the embedding dimensionality is changed.
    return np.zeros(model.wv.vector_size)

  # Accumulate in float64 to match the precision of the zero-vector fallback.
  return np.mean(vectors, axis=0, dtype=np.float64)

# One averaged embedding per movie: `model` is forwarded to get_average as its second argument.
df['movie_vector'] = df['tokens'].apply(get_average, args=(model,))

In [None]:
def recommend(title , df , top_n=5):
  """Return the `top_n` movies whose vectors are most similar to that of `title`.

  Args:
    title: exact movie title to look up in `df['title']`.
    df: DataFrame with a `title` column and a `movie_vector` column holding one
      equal-length 1-D numpy array per row. It is not modified.
    top_n: number of recommendations to return.

  Returns:
    A DataFrame with columns `title` and `similarities` (cosine similarity to the
    query movie), sorted from most to least similar and keeping the original index.
    Rows that share the query title are excluded. If `title` is not present, the
    string "<title> not in dataset" is returned instead.
  """
  is_query = df['title'] == title
  if not is_query.any():
    return f"{title} not in dataset"

  all_vectors = np.vstack(df['movie_vector'].values)

  # If several rows share the title, the first one is used as the query vector.
  query_vec = all_vectors[is_query.to_numpy().argmax()].reshape((1,-1))

  similarities = cosine_similarity(query_vec, all_vectors)[0]

  # Attach the scores to a fresh frame via `assign`; writing df['similarities'] here
  # would add or overwrite a column on the caller's DataFrame on every call.
  keep = ~is_query.to_numpy()
  recommendation = (
      df.loc[keep, ['title']]
      .assign(similarities=similarities[keep])
      .sort_values(by = 'similarities' , ascending = False)
      .head(top_n)
  )
  return recommendation

In [None]:
# Example query: the five titles most similar to "Iron Man".
# NOTE(review): in the saved output every match scores ~0.998 and the titles look unrelated
# to the query, which suggests the averaged vectors are nearly collinear — confirm the
# evidence text and tokenisation are producing distinctive features before trusting the ranking.
recommend("Iron Man" , df)

Unnamed: 0,title,similarities
4773,The First Great Train Robbery,0.998583
442,To Be or Not to Be,0.998443
2458,The Haunting,0.998372
2811,Once Upon a Deadpool,0.998341
9379,The Man Who Killed Hitler and Then the Bigfoot,0.998339
