<h1 align="center">Movie Recommender</h1>

In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import PorterStemmer

## a) Reading the data files

In [2]:
movies = pd.read_csv("./data/tmdb_5000_movies.csv")
credits = pd.read_csv("./data/tmdb_5000_credits.csv")

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## b) Merge the data frames

In [5]:
# Merge both dataframes on the basis of title column
movies = movies.merge(credits,on="title")

movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## c) Keep only useful columns

In [6]:
# genere, movie_id, keywords, title, cast, crew, overview
movies = movies[["movie_id", "title", "overview" ,"genres", "cast", "crew", "keywords"]]

movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."


## 1. Preprocessing

### Remove rows that have empty column 

In [7]:
# check the empty column values
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
cast        0
crew        0
keywords    0
dtype: int64

In [8]:
# (implace -> remove from the existing dataframe istead of creating new one)
movies.dropna(inplace=True)

In [9]:
# check the empty column values
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
cast        0
crew        0
keywords    0
dtype: int64

### Remove the duplicate rows

In [10]:
movies.duplicated().sum()

0

In [11]:
movies = movies.drop_duplicates()

In [12]:
movies.duplicated().sum()

0

### Genres
Make a list out of the list of objects of genres

In [13]:
movies["genres"] = movies['genres'].apply(lambda genres: [genre["name"] for genre in ast.literal_eval(genres)])


In [14]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."


### Keywords

In [15]:
movies["keywords"] = movies['keywords'].apply(lambda keywords: [keyword["name"] for keyword in ast.literal_eval(keywords)])


In [16]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[culture clash, future, space war, space colon..."


### Cast

In [17]:
movies["cast"] = movies["cast"].apply(lambda casts: [cast["name"] for cast in ast.literal_eval(casts)][:3])

In [18]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[culture clash, future, space war, space colon..."


### Crew Member

Make a list of directors only

In [19]:
movies["crew"] = movies["crew"].apply(lambda crew_members: [crew_member["name"] for crew_member in ast.literal_eval(crew_members) if crew_member["job"] == "Director"])
       

In [20]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[culture clash, future, space war, space colon..."


### Overview

In [21]:
movies["overview"][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [22]:
movies["overview"] = movies["overview"].apply(lambda string: string.split())

In [23]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[culture clash, future, space war, space colon..."


### Transformation

Remove the spaces from all the elements in lists before making the tags from them

In [24]:
def remove_spaces(list):
    return [element.replace(" ", "") for element in list]

In [25]:
movies["genres"] = movies["genres"].apply(remove_spaces)
movies["cast"] = movies["cast"].apply(remove_spaces)
movies["crew"] = movies["crew"].apply(remove_spaces)
movies["keywords"] = movies["keywords"].apply(remove_spaces)

In [26]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[cultureclash, future, spacewar, spacecolony, ..."


### Make Column of tags

In [27]:
movies["tags"] = movies["overview"] + movies["genres"] + movies["cast"] + movies["crew"] + movies["keywords"]

In [28]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,cast,crew,keywords,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin..."


### Remove uselesss columns

In [29]:
movies = movies[["movie_id", "title", "tags"]]

In [30]:
movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


### Convert the tags list into string

In [31]:
movies["tags"] = movies["tags"].apply(lambda x: " ".join(x))
movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."


### Convert the tags string to lower case

In [32]:
movies["tags"] = movies["tags"].apply(lambda string: string.lower())
movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."


## 2. Vectorization

* Bag of words

### 2.1 Stemming

In [33]:
ps = PorterStemmer()

In [34]:
movies["tags"] = movies["tags"].apply(lambda text: " ".join([ps.stem(word) for word in text.split()]))

In [35]:
movies["tags"] = movies["tags"].apply(lambda text: " ".join([ps.stem(word) for word in text.split()]))

### 2.2 Vectorize

In [36]:
cv = CountVectorizer(max_features=5000, stop_words="english")

vectors = cv.fit_transform(movies["tags"]).toarray()

In [37]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

### 2.3 Cosine Similarity

In [38]:
similarity = cosine_similarity(vectors)

In [39]:
def recommend_movie(movie):
    movie_index = movies[movies["title"] == movie].index[0]
    distance = similarity[movie_index]
    
    movie_list = sorted(list(enumerate(distance)), reverse=True, key=lambda tupple: tupple[1])[1:6]
    return [movies.iloc[movie[0]].title for movie in movie_list]

recommend_movie("Batman Returns")


['The Dark Knight',
 'Batman',
 'The Dark Knight Rises',
 'Batman',
 'Batman Forever']

## Dump the calculated data

In [86]:
import pickle

In [None]:
pickle.dump(movies, open("movies.pkl", "wb"))
pickle.dump(similarity, open("similarities.pkl", "wb"))