**Modeling**

Import the necessary packages.

In [2]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
import nltk
from sklearn.metrics.pairwise import cosine_similarity
import pickle

Loading the data

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credit = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credit.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


Merging the data on the basis of the movie title.

In [5]:
movies=movies.merge(credit, on='title')

keeping required columns only:

title 

genres

id

keyword

overview

cast

crew



In [7]:
movies = movies[['movie_id', 'title','overview','genres','keywords', 'cast', 'crew']]

Checking and removing null values

In [8]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [9]:
movies.dropna(inplace=True)

creating and applying a functions to tranform the data inrequired format(i.e. list of words)

In [12]:
# to convert in to list of words
def convert(obj):
    l=[]
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l


In [14]:
# to get the list of name of first 3 cast members
def convert3(obj):
    l=[]
    count=0
    for i in ast.literal_eval(obj):
        if count!=3:
            l.append(i['name'])
            count+=1
        else:
            break
    return l


In [16]:
# to get name of director
def director(obj):
    l=[]
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            l.append(i['name'])
            return l
    return 'Not Available'

In [13]:
movies['keywords'] = movies['keywords'].apply(convert)

In [15]:
movies['cast'] = movies['cast'].apply(convert3)

In [17]:
movies['crew'] = movies['crew'].apply(director)

In [18]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [21]:
movies['genres'] = movies['genres'].apply(convert)

In [22]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


removing blank spaces from the list of words

In [23]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(' ','') for i in x])
movies['overview'] = movies['overview'].apply(lambda x:[i.replace(' ','') for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(' ','') for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(' ','') for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(' ','') for i in x])

merging  columns into a single column
thus creating tags column

In [25]:
movies['tags']=movies['keywords']+movies['cast']+movies['crew']+movies['genres']+movies['overview']

In [26]:
df=movies[['movie_id','title','tags']] 
df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[cultureclash, future, spacewar, spacecolony, ..."
1,285,Pirates of the Caribbean: At World's End,"[ocean, drugabuse, exoticisland, eastindiatrad..."
2,206647,Spectre,"[spy, basedonnovel, secretagent, sequel, mi6, ..."
3,49026,The Dark Knight Rises,"[dccomics, crimefighter, terrorist, secretiden..."
4,49529,John Carter,"[basedonnovel, mars, medallion, spacetravel, p..."


joining all the tags to form a single string

In [27]:
df['tags']=df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags']=df['tags'].apply(lambda x: " ".join(x))


In [28]:
df['tags']=df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags']=df['tags'].apply(lambda x: x.lower())


In [29]:
df['tags'][0]

'cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron action adventure fantasy sciencefiction in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

creating vectorizing object to vectroize the tags excluding stop wrods of english language
this function selects most frequent words from all the tags
max_features= no. of most frequent words to be selected

In [30]:

cv=CountVectorizer(max_features=5000,stop_words='english')

steming all the words to reduce the repetation of words
example: love== "love","loved","loving","loves"

In [33]:
ps = PorterStemmer() 
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [34]:
df['tags'] = df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stem)


this function will check in a tag if the word is present in top frequent words or not and how many times it is present
for example : if alien is in most frequent words and for a movie alien is present in 4 times then it 4 will be the value of alien in the tag

In [35]:
vectors = cv.fit_transform(df['tags']).toarray()

this process is called vectorization
each movie is a vector in 5000 dimensionality
and the vector closest to a vector is the most similar movie

In [36]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

most frequent words are :

In [37]:
cv.get_feature_names()



['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '17th',
 '18',
 '18th',
 '18thcenturi',
 '19',
 '1910',
 '1920',
 '1930',
 '1940',
 '1944',
 '1950',
 '1950s',
 '1960',
 '1960s',
 '1970',
 '1970s',
 '1971',
 '1974',
 '1976',
 '1980',
 '1985',
 '1990',
 '1999',
 '19th',
 '19thcenturi',
 '20',
 '200',
 '2003',
 '2009',
 '20th',
 '21st',
 '23',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '70',
 '80',
 'aaron',
 'aaroneckhart',
 'abandon',
 'abduct',
 'abigailbreslin',
 'abil',
 'abl',
 'aboard',
 'abov',
 'abus',
 'academ',
 'academi',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'account',
 'accus',
 'ace',
 'achiev',
 'acquaint',
 'act',
 'action',
 'actionhero',
 'activ',
 'activist',
 'activities',
 'actor',
 'actress',
 'actual',
 'ad',
 'adam',
 'adamsandl',
 'adamshankman',
 'adapt',
 'add',
 'addict',
 'adjust',
 'admir',
 'admit',
 'adolesc',
 'adopt',
 'ador',
 'adrienbrodi',
 'adult'

we determine the distance between two vectors using cosine similarity
i.e.
the cosine of the angle between two vectors
this is called cosine similarity

In [39]:
similarity = cosine_similarity(vectors)

creating the final modeled function

In [40]:
def recommend(movie):
   index=df[df['title']== movie].index[0]
   distances = similarity[index]
   movies_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
   for i in movies_list:
       print(df.iloc[i[0]].title)

testing the model

In [41]:
recommend('Batman Begins')

The Dark Knight
Batman
Batman
The Dark Knight Rises
10th & Wolf


dumpnig data to be used in app.py

In [43]:
pickle.dump(df.to_dict(),open('movie_dict.pkl','wb'))

In [None]:
pickle.dump(similarity,open('similarity.pkl','wb'))