In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv("./Dataset/tmdb_5000_movies.csv")
credits = pd.read_csv("./Dataset/tmdb_5000_credits.csv")

In [3]:
movies = movies.merge(credits, on='title')

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [5]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [7]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [8]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [9]:
movies.dropna(inplace=True)

In [10]:
import ast 
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L
        
def fetch_cast(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L
        
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [11]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [12]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(fetch_cast)
movies['crew'] = movies['crew'].apply(fetch_director)

In [13]:
def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [14]:
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

In [15]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [16]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [17]:
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])

In [18]:
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."


In [19]:
new['tags'] = new['tags'].apply(lambda x: x.lower())

## Text vectorization

In [20]:
# First we will do stemming
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [21]:
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)

In [22]:
new['tags'] = new['tags'].apply(stem)

In [23]:
## Here we will do text vectorization
## Bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [24]:
vectors = cv.fit_transform(new['tags']).toarray()
vectors.shape

(4806, 5000)

In [25]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      shape=(5000,), dtype=object)

In [26]:
# now we will find cosine similarity between vectors
# cosine similarity is used to find similarity between two vectors
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [27]:
similarity.shape

(4806, 4806)

In [28]:
# this shows similarity of first movie with all other movies including itself which is 1
similarity[0]

array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
       0.        ], shape=(4806,))

In [29]:
def recommend(movie):
    movie_index = new[new['title'] == movie].index[0]
    movie_similarity_with_other_movies = similarity[movie_index]
    
    movie_similarity_list_based_on_index = list(enumerate(movie_similarity_with_other_movies))
    # here we are sorting based on similarity score
    top_five_similar_movies = sorted(movie_similarity_list_based_on_index, reverse=True, key=lambda x: x[1])[1:6]
    
    for i in top_five_similar_movies:
        print(new.iloc[i[0]].title) 
    
    

In [30]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [32]:
import pickle
pickle.dump(new.to_dict(), open('movies_dict.pkl','wb'))
pickle.dump(similarity, open('similarity.pkl','wb'))