In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the two source tables from CSV files in the working directory:
# the movie metadata and the per-movie credits.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Handy inspection calls (uncomment to explore the raw data):
# movies.info()
# movies.head()
# movies.head(1)


## Preprocessing

In [3]:
# Join the credits columns onto the movie metadata, using the title as the key.
# NOTE(review): if titles are not unique in either table, this join emits
# extra rows for the repeated titles — confirm, or join on an id column.
movies = movies.merge(credits,on='title')
# Number of columns after the merge.
movies.shape[1]

23

In [4]:
# Keep only the columns used to build the content tags (out of the 23 merged
# columns), then restrict the working set to the first 1761 rows.
# `.iloc[:1761]` replaces the chained `movies[:][0:1761]` slice, and `.copy()`
# makes the result an independent frame so the column assignments in later
# cells do not write into a slice (SettingWithCopyWarning).
movies = movies[['movie_id','title','genres','keywords','overview','cast','crew']]
movies = movies.iloc[:1761].copy()

# movies.head(1)

In [5]:
# Count the missing values in each column.
movies.isnull().sum()

movie_id    0
title       0
genres      0
keywords    0
overview    0
cast        0
crew        0
dtype: int64

In [6]:
# Drop rows with any missing value and renumber the index 0..n-1.
# The reset matters: recommend_movies looks a title up by index label and then
# uses that label as a row position in the similarity matrix, so labels must
# equal positions. Reassigning (rather than inplace=True) also avoids mutating
# a frame that may be a slice of the merged table.
movies = movies.dropna().reset_index(drop=True)

In [7]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
movies.duplicated().sum()

0

In [8]:
# Inspect the raw `genres` value of the first row: a string that spells out a
# list of {"id": ..., "name": ...} dicts.
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [9]:
def convert(obj):
    """Extract the ``name`` of every entry in a stringified list of dicts.

    ``obj`` is a string such as ``'[{"id": 28, "name": "Action"}]'``. It is
    parsed with ``ast.literal_eval`` (literal parsing only, no code execution)
    and the ``name`` values are returned as a list, in their original order.

    Raises whatever ``ast.literal_eval`` raises for a malformed string, and
    ``KeyError`` if an entry has no ``name`` key.
    """
    # A comprehension avoids binding a local called ``list``, which would
    # shadow the built-in inside this function.
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [10]:
# Replace the raw genres string in each row with its list of genre names.
movies['genres'] = movies['genres'].apply(convert)

In [11]:
# Replace the raw keywords string in each row with its list of keyword names.
movies['keywords'] = movies['keywords'].apply(convert)

In [12]:
# Preview the first row: genres and keywords are now lists of names.
movies.head(1)

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [13]:
def fetch_director(obj):
    """Return the name of the first crew entry whose job is ``'Director'``.

    ``obj`` is a stringified list of crew dicts, each read through its ``job``
    and ``name`` keys. The result is a one-element list holding that name, or
    an empty list when no entry has the job ``'Director'``. A list (rather than
    a bare string) is returned so the value can be concatenated with the other
    list-valued columns.

    Raises whatever ``ast.literal_eval`` raises for a malformed string, and
    ``KeyError`` if an inspected entry lacks ``job`` (or the match lacks
    ``name``).
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first director is kept; later entries are not inspected.
            return [member['name']]
    return []

In [14]:
# Reduce each crew string to a list holding the director's name (empty if none).
movies['crew'] = movies['crew'].apply(fetch_director)

In [15]:
# Preview the first row: crew now holds only the director's name.
movies.head(1)

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...",[James Cameron]


In [16]:
def fetch_cast(obj, limit=5):
    """Return the names of the first ``limit`` cast entries.

    ``obj`` is a stringified list of cast dicts; it is parsed with
    ``ast.literal_eval`` and only the leading ``limit`` entries are read
    through their ``name`` key. Fewer names are returned when the list is
    shorter than ``limit``.

    Parameters
    ----------
    obj : str
        String spelling out a list of dicts that carry a ``name`` key.
    limit : int, default 5
        Maximum number of names to return (expected to be non-negative).

    Raises whatever ``ast.literal_eval`` raises for a malformed string, and
    ``KeyError`` if one of the leading entries has no ``name`` key.
    """
    cast = ast.literal_eval(obj)
    # Slicing stops at the cap instead of walking the rest of the cast list.
    return [member['name'] for member in cast[:limit]]

In [17]:
# Reduce each cast string to a list of its leading cast member names.
movies['cast'] = movies['cast'].apply(fetch_cast)

In [18]:
# Tokenise each overview into a list of whitespace-separated words so it can
# be concatenated with the other list-valued columns.
movies['overview'] = movies['overview'].apply(str.split)

In [19]:
# Preview the first row: overview is a word list, cast is a list of names.
movies.head(1)

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]


In [20]:
# Remove the spaces inside every entry (e.g. 'Science Fiction' becomes
# 'ScienceFiction') so each genre, keyword or person is a single token.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda entries: [entry.replace(" ", "") for entry in entries]
    )

In [21]:
# Preview the first row: multi-word entries are now single tokens.
movies.head(1)

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]


In [22]:
# Build the feature space: concatenate the five list-valued columns row by row
# into one list of tokens per movie, stored in the new `tags` column.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [23]:
# Keep only the identifier, the title and the combined tags. `.copy()` makes
# the result an independent frame, so the later assignments to `tags` do not
# write into a slice of the wider table (SettingWithCopyWarning).
movies = movies[['movie_id','title','tags']].copy()

In [24]:
# Preview the first row of the reduced table (movie_id, title, tags).
movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [25]:
# Collapse each list of tokens into one space-separated string.
movies['tags'] = movies['tags'].apply(" ".join)

In [26]:
# Show the full tags string of the row labelled 0.
movies.loc[0, 'tags']

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver StephenLang MichelleRodriguez JamesCameron'

In [27]:
# Lower-case every tags string, then show the row labelled 0 to check it.
movies['tags'] = movies['tags'].str.lower()
movies.loc[0, 'tags']

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez jamescameron'

In [28]:
# Preview the first row: tags is now a single lower-case string.
movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."


### Stemming

In [29]:
def stem(text):
    """Stem each whitespace-separated word of ``text`` with the module-level
    Porter stemmer ``ps`` and return the stems joined by single spaces."""
    return " ".join(ps.stem(word) for word in text.split())

In [30]:
# Replace each tags string with its stemmed version.
movies['tags'] = movies['tags'].apply(stem)

## Bag of Words

In [31]:
# Bag-of-words vectoriser: vocabulary capped at 5000 terms, with scikit-learn's
# built-in English stop-word list removed.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [32]:
# Learn the vocabulary from the tags and build the per-movie term-count matrix,
# converted to a dense array with .toarray().
vectors = cv.fit_transform(movies['tags']).toarray()

In [33]:
# Print the learned vocabulary, one term per line.
print("\n".join(cv.get_feature_names_out()))

000
007
10
11
12
13
14
15
150
16
16th
17
17th
1863
18th
18thcenturi
19
1910
1920
1930
1940
1950
1950s
1960
1970
1972
1980
1985
1995
1999
19th
19thcenturi
20
200
2001
20th
21st
21stcenturi
22nd
23
24
25
30
300
3d
40
400
47
50
500
50cent
60
60s
70
aaron
aaroneckhart
aarontaylor
abandon
abbiecornish
abduct
abigailbreslin
abil
abl
aboard
abram
abus
academi
accept
access
accid
accident
accompani
accomplish
account
accus
ace
act
action
actionhero
activ
activist
actor
actress
actual
adam
adammckay
adamsandl
adamscott
adamshankman
adapt
add
addict
adjust
admir
admit
adolesc
adopt
adoptivefath
ador
adrienbrodi
adrift
adult
adulteri
adulthood
advanc
adventur
adventure
adventures
advertis
advic
advis
affair
affect
afghanistan
afraid
africa
african
aftercreditssting
afterlif
aftermath
ag
agbaj
age
agenc
agency
agent
ago
agre
ahead
aid
aidanquinn
ail
aim
air
airborn
aircraft
airplan
airplanecrash
airport
aka
al
alabama
alanalda
alanarkin
alanrickman
alantudyk
alaska
albert
albertbrook
albertfinney


In [34]:
# Cosine similarity between every pair of rows (movies) of the count matrix.
similarity_vector = cosine_similarity(vectors)

In [35]:
# Sanity check: the matrix should be square, one row and column per movie.
similarity_vector.shape

(1761, 1761)

In [36]:
def recommend_movies(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    The title is located in the module-level ``movies`` table and its row of
    the module-level ``similarity_vector`` matrix is ranked from most to least
    similar. The queried movie itself is excluded from the ranking.

    Parameters
    ----------
    movie : str
        Exact title to look up; the first matching row is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``movies`` has this title.
    """
    # Work with the row *position*, not the index label: the similarity matrix
    # and ``.iloc`` are positional, and labels stop matching positions as soon
    # as rows have been dropped without resetting the index.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie {movie!r} not found in the movies table")
    movie_pos = int(matches[0])

    scores = similarity_vector[movie_pos]
    # Drop the query movie explicitly rather than assuming it sorts first;
    # another movie with an identical tag vector would tie with it.
    candidates = [pair for pair in enumerate(scores) if pair[0] != movie_pos]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    for position, _score in ranked:
        print(movies['title'].iloc[position])

In [37]:
# Example query: print the titles ranked most similar to 'Titanic'.
recommend_movies('Titanic')

The Notebook
Poseidon
Pirates of the Caribbean: On Stranger Tides
Master and Commander: The Far Side of the World
Rent


In [38]:

import pickle

In [39]:
# Persist the movies table for reuse outside the notebook. The context manager
# closes the file handle once the dump finishes, so the data is flushed to
# disk instead of relying on garbage collection to close the file.
with open('movies.pkl','wb') as movies_file:
    pickle.dump(movies, movies_file)

In [40]:
# Persist the similarity matrix for reuse outside the notebook. The context
# manager closes the file handle once the dump finishes, so the data is
# flushed to disk instead of relying on garbage collection to close the file.
with open('similarity_vector.pkl','wb') as similarity_file:
    pickle.dump(similarity_vector, similarity_file)