Installing libraries

In [None]:
# Libraries 
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install nltk
!pip install streamlit
!pip install pickle


Importing Libraries

In [2]:
# Importing Libraries 
import numpy as np
import pandas as pd
import ast
import nltk
import pickle
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows', None)


In [3]:
#Data
credits =pd.read_csv('../dataSet/TMDB 5000/tmdb_5000_credits.csv')
movies=pd.read_csv('../dataSet/TMDB 5000/tmdb_5000_movies.csv')

In [4]:
# Merge dataSet
df=movies.merge(credits,on='title')

# Drop columns
df=df[['movie_id','genres', 'keywords', 'title', 'overview', 'cast', 'crew']]

# Missing values
df.isnull().sum()

# Drop null values
df.dropna(inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4806 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4806 non-null   int64 
 1   genres    4806 non-null   object
 2   keywords  4806 non-null   object
 3   title     4806 non-null   object
 4   overview  4806 non-null   object
 5   cast      4806 non-null   object
 6   crew      4806 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.4+ KB


In [6]:
# Function to extract 'name' data and store as a list from keywords and genres
def Tags(obj):
    list=[]
    for i in ast.literal_eval(obj):
        list.append(i['name'])
    return list

# Function to extract  Actor  Name for first 3 Character
def Actor(obj):
    list=[]
    counter =0
    for i in ast.literal_eval(obj):
        if counter !=3:
            list.append(i['name'])
            counter+=1
        else:
            break
    return list

# Function to extract Director Name
def Director(obj):
    list=[]
    for i in ast.literal_eval(obj):
        if i['job'] =='Director':
            list.append(i['name'])
            break
    return list


df['genres']=df['genres'].apply(Tags)
df['keywords']=df['keywords'].apply(Tags)
df['cast']=df['cast'].apply(Actor)
df['crew']=df['crew'].apply(Director)

In [7]:
# Make overView a  list from String 
df['overview']=df['overview'].apply(lambda x:x.split())

In [8]:
# Remove the space from each word Example James Cameroon = JamesCameroon
df['genres']=df['genres'].apply(lambda x:[i.replace(" ","") for i in x])
df['keywords']=df['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
df['cast']=df['cast'].apply(lambda x:[i.replace(" ","") for i in x])
df['crew']=df['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [9]:
# Make one Single Columns
df['tags']=df['overview']+ df['cast'] + df['crew'] + df['keywords']

# Concatenate them
new_df=df[['movie_id','title', 'tags']]

# Convert tags from list to String
new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))

# UpperCase to LowerCase of Tags
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


In [10]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [11]:
# Convert tags to Vectors 
cv=CountVectorizer(max_features = 5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

In [12]:
# Stemming
ps=PorterStemmer()

# Function performing Stemming on each tags 
def stemming(text):
    list=[]

    for i in text.split():
        list.append(ps.stem(i))
    return " ".join(list)

# Change for all the values
new_df['tags']=new_df['tags'].apply(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stemming)


In [49]:


similarity = cosine_similarity(vectors)

def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(enumerate(similarity[index]), reverse=True, key=lambda x: x[1])

    for i in distances[1:6]:
        recommended_movie = new_df.iloc[i[0]].title
        similarity_score = i[1]
        print(f"{recommended_movie} (Cosine Similarity: {similarity_score:.3f})")






In [50]:
# Example usage
recommend('Monsters, Inc.')

The Kids Are All Right (Cosine Similarity: 0.274)
Monsters University (Cosine Similarity: 0.273)
The Harvest (La Cosecha) (Cosine Similarity: 0.262)
The White Ribbon (Cosine Similarity: 0.239)
Return to Never Land (Cosine Similarity: 0.239)


In [30]:
similarity = cosine_similarity(vectors)

def recommend2(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    
    for i in distances[1:6]:
        print(new_df.iloc[i[0]].title)

In [46]:
recommend2('Monsters, Inc.')

The Kids Are All Right
Monsters University
The Harvest (La Cosecha)
The White Ribbon
Return to Never Land


In [328]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [333]:
# Saving the DataFrame using Pickle in the specified location


file_path='/Users/kavach/Documents/Dev/Jupyter/Movie RecSys/website/movies.pkl'

with open(file_path,'wb') as file:
    pickle.dump(new_df, file)


In [335]:
# Save the file as Dict, because streamlit is giving errors

file_path='/Users/kavach/Documents/Dev/Jupyter/Movie RecSys/website/movies_dict.pkl'

with open(file_path,'wb') as file:
    pickle.dump(new_df.to_dict,file)

In [331]:
new_df['title'].shape

(4806,)

In [345]:
file_path = '/Users/kavach/Documents/Dev/Jupyter/Movie RecSys/website/similarity.pkl'

# Opening the file and pickling the object
with open(file_path, 'wb') as file:
    pickle.dump(similarity, file)