In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credit = pd.read_csv("tmdb_5000_credits.csv")

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
movies.rename(columns={'id': 'movie_id'},  inplace=True)
movies.head(1)

Unnamed: 0,budget,genres,homepage,movie_id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [5]:
merged_movies = movies.merge(credit, on='movie_id')

In [6]:
merged_movies.head(1)

Unnamed: 0,budget,genres,homepage,movie_id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


COLUMNS TO BE CONSIDERED:
1. genres (important for tags, people tend to like movies based on particular genre)
2. movie_id (for showing posters at the end)
3. keywords (important for movie desc)
4. overview (gives overview of movie)
5. title
6. cast (only top 3 actors)
7. crew (director)

In [7]:
merged_movies = merged_movies[['genres', 'movie_id', 'keywords', 'overview', 'title_x', 'cast', 'crew']]
merged_movies.rename(columns={'title_x': 'title'}, inplace=True)
merged_movies.head(1)

Unnamed: 0,genres,movie_id,keywords,overview,title,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [8]:
# Check for null values
merged_movies.isnull().sum()

genres      0
movie_id    0
keywords    0
overview    3
title       0
cast        0
crew        0
dtype: int64

In [9]:
# Drop null values (because only 3 are there)
merged_movies.dropna(inplace=True)

In [10]:
# no. of duplicates
merged_movies.duplicated().sum()

0

In [11]:
merged_movies.iloc[0]['genres']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
merged_movies.iloc[0]['keywords']

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [13]:
# merged_movies.iloc[0]['cast']
# merged_movies.iloc[0]['crew']

In [14]:
# first we need to convert genres string to a list of dictionaries
# we can do this by:
import ast
ast.literal_eval('[{}, {}]')

[{}, {}]

In [15]:
def extract_values(obj, limit=float('inf')):
    vals = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter >= limit:
            break
        vals.append(i['name'])
        counter += 1
    
    return vals

def extract_director(obj):
    vals = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            vals.append(i['name'])
            break
    
    return vals
    
    

### Converting column to list of words present / required values

In [16]:
merged_movies['genres'] = merged_movies['genres'].apply(extract_values)

In [17]:
merged_movies['keywords'] = merged_movies['keywords'].apply(extract_values)

In [18]:
merged_movies['cast'] = merged_movies['cast'].apply(extract_values, limit=3)

In [19]:
merged_movies['crew'] = merged_movies['crew'].apply(extract_director)

In [20]:
# Converting overview to list of words
merged_movies['overview'] = merged_movies['overview'].apply(lambda x: x.split())

In [21]:
merged_movies.head(1)

Unnamed: 0,genres,movie_id,keywords,overview,title,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,"[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [22]:
merged_movies['cast'][0]

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

### Now we need to omit the space of each name/value and join them together to form a single tag.
### This will help in recognising tags which are of 2+ words. For e.g Sam Willson and Sam Worthington. It will help us identify correct tags, otherwise model will be confused about which sam we are talking about in this case
 

In [23]:
merged_movies['genres'] = merged_movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])

merged_movies['keywords'] = merged_movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])

merged_movies['cast'] = merged_movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])

merged_movies['crew'] = merged_movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])


In [24]:
merged_movies.head(1)

Unnamed: 0,genres,movie_id,keywords,overview,title,cast,crew
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


### Now we need to concatenate the columns to make a single column named "tags" which will contain all the tags pertaining to the movie.
### These tags will help us in recognising similar movies.

In [25]:
merged_movies['tags'] = merged_movies['genres'] + merged_movies['keywords'] + merged_movies['overview'] + merged_movies['cast'] + merged_movies['crew']
merged_movies.head(1)

Unnamed: 0,genres,movie_id,keywords,overview,title,cast,crew,tags
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction, c..."


In [26]:
movies_df = merged_movies[['movie_id', 'title', 'tags']]
movies_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, c..."


In [27]:
# Converting list of tags to a single string

movies_df['tags'] = movies_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['tags'] = movies_df['tags'].apply(lambda x: " ".join(x))


In [28]:
movies_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,Action Adventure Fantasy ScienceFiction cultur...


In [29]:
movies_df['tags'][0]

'Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

## Stemming

In [30]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

stemmer.stem("loving")

'love'

In [31]:
def tag_stem(s):
    l = []
    for i in s.split():
        l.append(stemmer.stem(i))
    
    return " ".join(l)

In [32]:
movies_df['tags'] = movies_df['tags'].apply(tag_stem)

movies_df.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['tags'] = movies_df['tags'].apply(tag_stem)


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventur fantasi sciencefict culturecla...


## BAG OF WORDS for vectorization of text (all the tags)

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer(max_features=5000, stop_words='english')

In [34]:
tag_vectors = bow.fit_transform(movies_df['tags']).toarray()

In [35]:
tag_vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [36]:
tag_vectors.shape

(4800, 5000)

In [37]:
# bow.get_feature_names()

## Model Building

Now we will find similarities of movies with each other. Then we will be consider top K  movies similar to the given movie for recommendation (Just like a nearest neighbor search [KNN])

We wont use euclidean distance as a measure because in higher dimensions it mis behaves as stated in the curse of dimentionality

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

movie_sim_matrix = cosine_similarity(tag_vectors)

In [39]:
movie_sim_matrix.shape

(4800, 4800)

Now, given a movie and its similarities with other movies we have to sort the moveis according to the similarity and get top similar movies (say top 5).

We need to sort in such a way that along with the similarites we get the index of the movies because we need the index to fetch other data for the recommended movies. Hence we will use enumerate function to have indexes as well.

In [40]:
# list(enumerate(movie_sim_matrix[0]))

In [41]:
# sorted(list(enumerate(movie_sim_matrix[0])), reverse=True, key=lambda x: x[1])

In [42]:
# fetch index given title
movies_df[movies_df['title'] == 'Avatar'].index[0]

0

In [43]:
def recommend_movies(title):
    mov_index = movies_df[movies_df['title'] == title].index[0]
    similarities = movie_sim_matrix[mov_index]
    sorted_similarities = sorted(list(enumerate(similarities)), reverse=True, key=lambda x: x[1])[1:6]

    for i in sorted_similarities:
        print(movies_df.iloc[i[0]]['title'])


In [44]:
recommend_movies('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


Now we will import ( pickle.dump() ) required data and model to the web app in which we gonna use the model

In [45]:
import pickle


In [46]:
movies_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventur fantasi sciencefict culturecla...


In [51]:
# dumping/writing the movies_df (dataframe with movies) dataframe to file 
# pickle.dump(movies_df.to_dict(), open('movies_dict.pkl', 'wb') ) 

In [52]:
# dumping/writing the similarity matrix of movies to file
# pickle.dump(movie_sim_matrix, open("movie_sim_matrix.pkl", 'wb'))