In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
# movies.head(1) ->use to show the movie database
# movies.shape  -> gives the dimension of database ie. how many rows and columns
# credits.head(1)['crew'].values -> to see a particular column
# it is important to clean the data after importing as we should use only column which would make our model accurate
# we would be using here generes(tells the type of movie), id(for ui purpose as it will give posters), keywords(keyword used), title(movie name), overview(whats movie all about), cast(which actor,actress), crew(which directors)
# to check for missing values as it can decrease the accuracy for ex-> movies.isnull().sum()
# to remove those data-> movies.dropna(inplace=True)
# to check duplicate data -> movies.duplicated().sum()
#to see first row of column genre of movies -> movies.iloc[0].genres

In [4]:
movies=movies.merge(credits,on='title')  #merging the movies database with credits database with common column as title

In [5]:
movies.shape

(4809, 23)

In [6]:
#adding required columns back to movie
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [7]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [8]:
movies.dropna(inplace=True)

In [9]:
movies.duplicated().sum()

0

In [10]:
# now we would create our tag using columns genres,keywords,overview,cast,crew from these columns we would extract main keywords and make a paragraph which would be used as tag
# to convert a string to list we use ast -> import ast -> ast.literal_eval(obj) -> this will convert the string to list
# for spliting a string into list of string -> movies['overview'].apply(lambda x:x.split())

In [11]:
import ast
def convert(obj):
    list = []
    for i in ast.literal_eval(obj):    # as here the obj or genre column data is inside single quotes which is making it string instead of list so the ast.literal_eval will convert back it to string
        list.append(i['name'])
    return list    

In [12]:
movies['genres']=movies['genres'].apply(convert) # this will convert the whole genres of movie from string to list and replace the eralier value of genres with new oneas list

In [13]:
movies['keywords']=movies['keywords'].apply(convert)

In [14]:
# to extract first 3 values from the cast 
def convert3(obj):
    list = []
    counter =0
    for i in ast.literal_eval(obj): 
        if counter!=3:
            list.append(i['name'])
            counter+=1
        else:
            break
    return list   

In [15]:
movies['cast']=movies['cast'].apply(convert3)

In [16]:
# to extract first director name from the crew 
def fetch_director(obj):
    list = []
    
    for i in ast.literal_eval(obj): 
        if i['job'] == 'Director':
            list.append(i['name'])
            break
    return list   

In [17]:
movies['crew']=movies['crew'].apply(fetch_director)

In [18]:
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [19]:
# now we have to replace the gaps between wordsas it could decrease the model efficiency for ex if the actor name is "Sam Worthington" but if i search for sam i may also get result for crew "sam Mendes" that why we should remove sapce between keywords and make it a one word ex. samMendes
# do do it we use lambda function -> movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [20]:
 movies['genres']= movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
 movies['keywords']= movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
 movies['cast']= movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
 movies['crew']= movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [21]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [22]:
#creating a column tags (and removing other columns) which would store concatination of all 5 columns{oversview,genres,keywords,cast,crew}
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [23]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [24]:
# creating new table containing necessary components only
new_df = movies[['movie_id','title','tags']]

In [25]:
#joining the whole tags list as a complete paragraph
new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))


In [26]:
# converting the whole tag to lower case 
new_df['tags'] =new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] =new_df['tags'].apply(lambda x:x.lower())


In [27]:
new_df['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [28]:
# now we have to vectorize the the tags :-
#-> we have to remove stop words(using sklearn library) in english ie. is ,the ,are .....
#-> then we have apply steming in the tags { to convert repeatative words into its stem word for ex. ['loving','loves','love']  --->  ['love','love','love']
#-> we have to first find most common 5000 words(we can choose more than that if wanted but lower the better)
#-> then we vectorize the whole tags {ie. every row of tags would be converted to no. of iteration of that word in that row of tag }
#-> for ex if earlier the row was 'james action hel .....' then now it would be [0 ,5 , ...] here numbers are the iterations of words in 5000 common word which is found in this row of tag column
# to see the common 5000 words  -> cv.get_feature_names_out()

In [29]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [30]:
def stem(text):
    y= []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)    

In [31]:
new_df['tags']=new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem)


In [32]:
from sklearn.feature_extraction.text import CountVectorizer
# creating 5000 common words for reference
cv =  CountVectorizer(max_features=5000,stop_words='english')


In [33]:
# vectorizing the each row of tags to the found iteration of common word inside tags
vectors=cv.fit_transform(new_df['tags']).toarray()

In [34]:
# all the vectors are distributed in a 2-D cartesian so we would try to find the angle between(angle distance -> cosine similarity) the lines(formed by joining the origin and vector point) of the vectors.
# if the angles between the lines are less the movies are more similar.
# if angles are more then the movies are dissimilar or less similar

In [35]:
# calculating cosine similarity or angle_distance between each movie 
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors) ## this similarity matriz would have shape of (4098*4098) as similarity of each movie is calculated with each movie ,therefore an array of array forms

In [36]:
#similarity[0]  # checking similarity of first movie with all the movies {that's why similarity of 1st movie is "1" as both movies are same}
# result of similarity[0] -> array([1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,0.        ])
# result of similarity[1]  -> array([0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,0.02615329])
# you can see for similarity[0] or 1st movie(or 1st index) has 100% similar value
# and for similarity[1] or 2nd movie(2nd index) has 100% similar value
#list(enumerate(similarity[0]))  # to store value as list of {key value} where first value sate the index of movie and second value state the angular distance of other movies
                                 # if the 2nd value is one that is the searched movie itself

#sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]  #to sort the value on based of angular distance from the searched movie

In [37]:
def recommend(movie):
    movie_index =new_df[new_df['title']==movie].index[0]
    distances = similarity[movie_index]
    movies_list =sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [40]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [41]:
import pickle

In [44]:
# used to export the table as .pkl format
pickle.dump(new_df.to_dict(),open('movies_dict.pkl','wb'))

In [45]:
pickle.dump(similarity,open('similarity.pkl','wb'))