# Importing Libraries

In [45]:
import numpy as numpy
import pandas as pd

# Importing Dataset

In [46]:
movies=pd.read_csv('./dataset/tmdb_5000_movies.csv')
credits=pd.read_csv('./dataset/tmdb_5000_credits.csv')

In [47]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [48]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


# Preprocessing

In [49]:
# merging dataframes
movies = movies.merge(credits,on='title')

In [50]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [51]:
# only selecting genres,id,keywords,title,overview,cast,crew
# to check any value with count use movies['param'].value_counts()
# todo: incorporating release_date

movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [52]:
# collecting data to handle null errors
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [53]:
# droping the null values
movies.dropna(inplace=True)

In [54]:
# checking for duplicate data
movies.duplicated().sum()

0

In [55]:
# to make genre and keywords into ['abc','def'] format
import ast

def preprocess_keywords_and_genres(obj):
    values=[]
    for dic in ast.literal_eval(obj): # converting values from string of list to list
        values.append(dic['name'])
    return values

In [56]:
movies['genres'] = movies['genres'].apply(preprocess_keywords_and_genres)

In [57]:
movies['keywords'] = movies['keywords'].apply(preprocess_keywords_and_genres)

In [58]:
# to get the first 3 actors from the cast

def preprocess_cast(obj):
    values=[]
    counter=0
    for dic in ast.literal_eval(obj): # converting values from string of list to list
        if counter!=3: values.append(dic['name']); counter+=1
        else: break
    return values

In [59]:
movies['cast'] = movies['cast'].apply(preprocess_cast)

In [60]:
# FETCHING ONLY DIRECTOR JOB ROLE FROM CREW

def preprocess_crew(obj):
    values=[]
    counter=0
    for dic in ast.literal_eval(obj): # converting values from string of list to list
        if dic['job']=='Director': values.append(dic['name']); break
    return values

In [61]:
movies['crew'] = movies['crew'].apply(preprocess_crew)

In [62]:
# converting overview from string to list for easy concatenation
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [63]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [70]:
# removing spaces in the names of cast crew and other strings to avoid wrong interpretation
movies.iloc[:,3:] = movies.iloc[:,3:].apply(lambda col: col.apply(lambda x: [i.replace(" ","") for i in x]))

In [72]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [75]:
new_df = movies.drop(columns=['overview','genres','keywords','cast','crew'])

In [77]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

In [78]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

# Training Word2Vec Model

In [86]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


In [87]:
def lemmatize(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

In [88]:
new_df['tags'] = new_df['tags'].apply(lemmatize)

In [89]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
nltk.download('punkt') 


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [91]:
tokenized_tags = new_df['tags'].apply(word_tokenize)

In [92]:
# Train Word2Vec model
model = Word2Vec(sentences=tokenized_tags, vector_size=100, window=5, min_count=1, workers=4)

In [93]:
# Function to get the vector representation of a text
def get_vector(text):
    tokens = word_tokenize(text)
    vector = sum(model.wv[word] for word in tokens if word in model.wv)
    return vector

In [94]:
# Apply the function to get vectors for each tag
word2vec_vectors = new_df['tags'].apply(get_vector).to_list()

# Convert the list of vectors to a numpy array
vector = np.array(word2vec_vectors)

In [96]:
from sklearn.metrics.pairwise import cosine_similarity

In [97]:
similarity = cosine_similarity(vector)

In [100]:
def recommend(movie):
    movie_idx = new_df[new_df['title']==movie].index[0] # new_df[new_df['title']=='Avatar'].index[0] is used to find index of movie
    distances = similarity[movie_idx]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6] # list(enumerate(distances))-> to sort along with original movie index, key=lambda x:x[1]-> on the basis of second parameter, and then selectinf index 1 to 6
    
    for Movie in movies_list:
        print(new_df.iloc[Movie[0]]['title'])
    

In [102]:
recommend('Gandhi')

Apollo 13
Gremlins 2: The New Batch
Superman IV: The Quest for Peace
The Forbidden Kingdom
Bridge of Spies


In [103]:
import pickle

In [104]:
pickle.dump(new_df,open('movie_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))