In [59]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

Loading movies from the input as there are two files of metadata

In [2]:
movies=pd.read_csv('./input/tmdb_5000_movies.csv')
credits=pd.read_csv('./input/tmdb_5000_credits.csv')

In [3]:
movies.head(2)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


merging both the file to create a single dataframe

In [5]:
df=movies.merge(credits,on='title')
df.shape

(4809, 23)

There are multiple columns which are not an factor affecting the tag creation or we can say not an useful feature for tag creation and this is manually chosen.

#### The list of column we are keeping are : (As its a content based recommendation system) and these will be usefull for creating the tags
<ol>
<li>genres</li>
<li>id</li>
<li>keywords</li>
<li>title</li>
<li>overview</li>
<li>cast</li>
<li>crew</li>
</ol>

Extracting the columns needed from the merged dataframe

In [6]:
df=df[['movie_id','title','overview','genres','keywords','cast','crew']]
df.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


##### Checking for missing value

In [7]:
df.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [8]:
# dropping rows with null values
df.dropna(inplace=True)

In [10]:
#  cheching for the duplicates
df.duplicated().sum()

0

Extracting the genres from the genre dictionary for every row using the lambda function 

In [16]:
func =lambda  x: [i['name'] for i in ast.literal_eval(x)]
df['genres'] =df['genres'].apply(func)

Extracting the keywords from the keywords dictionary for every row using the lambda function as provided data is in the dict form

In [18]:
df['keywords'] = df['keywords'].apply(func)

Extracting the name of the casts from the cast dictionary for every row using the lambda function 

In [23]:
func2 =lambda  x: [i['name'] for i in ast.literal_eval(x)][:3]
df['cast']=df['cast'].apply(func2)

Extracting the name of the director for each movie

In [27]:
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L
df['crew']=df['crew'].apply(fetch_director)

Now going to ceate the tag for each movie.
<ul>
<li>At first creating the list from the text in overview column</li>
<li> Then some text processing like merging the words for keywords , cast,crew column because some of the nomes are like 'Tom Harfield' , 'Tom Cruise' so while maching the tags in further process 'Tom' will match in both and will increase the tendendancy to match so joining both the word will cause two completely different names.</li>
<li>Then creating a column tags in which all the features like overview,genres,keywords,cast,crew are joined and created a single long text</li>
<li>Later lowering all the text</li>
<li>After that stemming is performed in which similar word like "actions","action" were not matching but after stemming both will match. </li>
</ul>

In [28]:
df['overview']=df['overview'].apply(lambda x: x.split())

In [32]:
func3=lambda x: [i.replace(" ","") for i in x]
df['keywords']=df['keywords'].apply(func3)
df['cast']=df['cast'].apply(func3)
df['crew']=df['crew'].apply(func3)

In [34]:
df['tags']= df['overview']+df['genres']+df['keywords'] + df['cast']+df['crew']

In [35]:
new_df = df[['movie_id','title','tags']]

In [36]:
new_df['tags']=new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x: " ".join(x))


In [37]:
new_df['tags']=new_df['tags'].apply(lambda x : x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x : x.lower())


we have a problem here is in the list of features we have words like 'actions' and 'action' but we want both that to be same so we will apply stemming

In [50]:
ps=PorterStemmer()
def stemming(text):
    y = ""
    for i in text.split():
        y=y+ps.stem(i)+" "
    return y

In [51]:
new_df['tags']=new_df['tags'].apply(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stemming)


#### Performing Text Vectorization using Bag of words technique
<p>In this technique creating vector for each tag in the tag column then we try to recommend the movies with  closest vector for any movie</p>
At first using the 'CountVectorizer' library creating an 5000 dimensional vector and the way was we found out the most frequent 5000 words in the joined text of all the tags and those 5000 words are considered as vectors.
<p>vector for ecach tag is like for every ith word from the vector if there is any same  ith word in  tag of the movie then the count will be the scalar value of that dimension of the vector and the same is performed for each dimension(word matching from tags) is performed </p>
<p>By this way the vector for each tags is created</p>  

In [54]:
cv=CountVectorizer(max_features=5000,stop_words='english')

In [55]:
vectors=cv.fit_transform(new_df['tags']).toarray()

In [58]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

For higher dimension vector the euclidean distance is a curse i.e. not a reliable measure thus we will apply cosine distance basically calculating the angle difference thus we get the relationa as <b>distace inversely related to relation between two movies</b> between two vectors and find out the top five least distance vector for a movie

In [60]:
similarity =cosine_similarity(vectors)

In [66]:
def recommend(movie):
    movie_index = new_df[new_df['title']==movie].index[0]
    distances= similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True, key = lambda x: x[1] )[1:6]
    movies_list=[new_df.iloc[i[0]].title for i in movies_list]
    return movies_list     