In [2]:
import pandas as pd
import numpy as np

In [68]:
# Getting the movies data
movies = pd.read_csv("dataset/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [69]:
# Getting the movies rating by each user
ratings = pd.read_csv("dataset/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [70]:
# Getting the user details
#users = pd.read_csv("dataset/links.csv")
#users.head()

### Data Preprocessing

In [71]:
# Fixing the title and extracting year from it
movies['year'] = movies.title.str.extract("(\d\d\d\d)", expand=False)
movies.head()
movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))','')
movies['title'] = movies.title.apply(lambda d: d.strip())
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [72]:
# Splitting the genres text into list of genres
movies['genres'] = movies.genres.str.split('|')
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [73]:
# Creating a column for each genres in the movieWithGenreTable

movie_with_genre = movies.copy()
movie_with_genre

for index, row in movie_with_genre.iterrows():
    for genre in row['genres']:
        movie_with_genre.at[index, genre] = 1

movie_with_genre = movie_with_genre.fillna(0)
movie_with_genre.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
# Removing the timestamp column as it will not be useful in a simple recommendation system
ratings = ratings.drop("timestamp", axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [239]:
movies[movies['year'] == '2008'].head(60)

Unnamed: 0,movieId,title,genres,year
11899,53207,88 Minutes,"[Crime, Drama, Mystery, Thriller]",2008
12202,55603,My Mom's New Boyfriend,"[Action, Comedy, Romance, Thriller]",2008
12231,55830,Be Kind Rewind,[Comedy],2008
12359,56949,27 Dresses,"[Comedy, Romance]",2008
12383,57326,In the Name of the King: A Dungeon Siege Tale,"[Action, Adventure, Fantasy]",2008
12387,57368,Cloverfield,"[Action, Mystery, Sci-Fi, Thriller]",2008
12388,57370,Mad Money,"[Comedy, Crime, Thriller]",2008
12412,57520,One Missed Call,"[Horror, Mystery, Thriller]",2008
12413,57522,First Sunday,"[Comedy, Crime]",2008
12414,57526,Untraceable,"[Crime, Thriller]",2008


In [240]:
# Lets take a user input

userInput = [
    {"title" : "Toy Story", "rating" : 5.0},
    {"title" : "Jumanji", 'rating' : 4},
    {'title' : "Paranormal ActivitY: The Marked Ones", 'rating' :3.5},
    {'title' : "Avengers, The",'rating' : 4},
    {'title' : "Dark Knight Rises, The", 'rating' :5},
    {'title' : "Insidious Chapter 3", 'rating' :4},
    {'title' : "Avengers: Age of Ultron",'rating' : 3.5},
    {'title' : "G.I. Joe: Retaliation", 'rating' :4.5},
    {"title" : "Cowboys & Aliens", "rating" : 3},
    {"title" : "Legion", "rating" :4.2},
    {"title" : "My Name is Khan", "rating" :5},
    {"title" : "Grudge 3, The", "rating": 4.5},
    {"title" : "Jodhaa Akbar", "rating" : 3.5},
    {"title" : "10,000 BC", "rating" : 2.5},
    {"title" : "300: Rise of an Empire", "rating" : 4},
    {"title" : "Olympus Has Fallen", "rating" :3},
    {"title" : "2012", "rating" : 5},
    {"title" : "Housefull 2", "rating" :3.5},
    {"title" : "Iron Man 2", "rating" :5},
    {"title" : "17 Again", "rating" : 4},
    {"title" : "Dragonball Evolution", "rating" :4.5},
    {"title" : "Doomsday", "rating": 4}
]

inputMovies = pd.DataFrame(userInput)
inputMovies.head(15)

Unnamed: 0,title,rating
0,Toy Story,5.0
1,Jumanji,4.0
2,Paranormal ActivitY: The Marked Ones,3.5
3,"Avengers, The",4.0
4,"Dark Knight Rises, The",5.0
5,Insidious Chapter 3,4.0
6,Avengers: Age of Ultron,3.5
7,G.I. Joe: Retaliation,4.5
8,Cowboys & Aliens,3.0
9,Legion,4.2


In [231]:
# Finding if the user's movie is in the dataset

inputId = movies[movies['title'].isin(inputMovies['title'].tolist())]
userMovies = pd.merge(inputId, inputMovies)
userMovies = userMovies.drop("year", axis=1).drop('genres', axis=1)
userMovies = userMovies.reset_index(drop=True)
userMovies.head(15)

Unnamed: 0,movieId,title,rating
0,1,Toy Story,5.0
1,2,Jumanji,4.0
2,2153,"Avengers, The",4.0
3,89745,"Avengers, The",4.0
4,58111,Jodhaa Akbar,3.5
5,58293,"10,000 BC",2.5
6,66066,"Grudge 3, The",4.5
7,67867,Dragonball Evolution,4.5
8,68135,17 Again,4.0
9,72378,2012,5.0


In [232]:
# Filterning out the genres of movies that user rated
user_movies_genres = movie_with_genre[movie_with_genre['movieId'].isin(userMovies['movieId'].tolist())]
user_movies_genres = user_movies_genres.drop("title", axis=1).drop("genres", axis=1).drop("movieId", axis=1).drop("year", axis=1)
user_movies_genres = user_movies_genres.reset_index(drop=True)
user_movies_genres.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [233]:
userMovies['rating']

0     5.0
1     4.0
2     4.0
3     4.0
4     3.5
5     2.5
6     4.5
7     4.5
8     4.0
9     5.0
10    4.2
11    5.0
12    5.0
13    3.0
14    5.0
15    3.5
16    4.5
17    3.0
18    4.0
19    4.0
20    3.5
Name: rating, dtype: float64

#### User preference matrix

In [234]:
userProfile = user_movies_genres.transpose().dot(userMovies['rating'])
userProfile

Adventure             42.0
Animation              5.0
Children               9.0
Comedy                13.0
Fantasy               18.5
Romance               15.2
Drama                 20.7
Action                50.5
Crime                  5.0
Thriller              34.0
Horror                 7.5
Mystery                0.0
Sci-Fi                29.5
IMAX                  26.5
Documentary            0.0
War                    8.0
Musical                4.0
Western                5.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [235]:
movie_genre = movie_with_genre.set_index(movie_with_genre['movieId'])
movie_genre = movie_genre.drop("movieId", axis=1).drop("title", axis=1).drop("genres", axis=1).drop("year", axis=1)
movie_genre

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
151703,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
151709,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [236]:
# Multiply the movie_genre table with user Profile and divide by the sum of userProfile to get weighed average
recommendation_table = (movie_genre*userProfile).sum(axis=1)/userProfile.sum()
recommendation_table.head()

movieId
1    0.298228
2    0.236878
3    0.096115
4    0.166667
5    0.044308
dtype: float64

In [252]:
# Get movies with highest values to recommend the user
recommendation_table = recommendation_table.sort_values(ascending=False)
recommendation_table.head(10)

movieId
71999     0.717110
115479    0.646558
49593     0.626789
52722     0.622018
6934      0.622018
101076    0.622018
77561     0.622018
6365      0.622018
117646    0.609066
81132     0.605658
dtype: float64

In [251]:
# Top 10 recommended movies -- 
from datetime import datetime
recommended_movies = movies[movies['movieId'].isin(recommendation_table.head(10).keys())]
recommended_movies = recommended_movies[~recommended_movies['title'].isin(userMovies['title'].tolist())]
recommended_movies['year'] = pd.to_datetime(recommended_movies['year'])
recommended_movies[recommended_movies['year'] >= '2000']

Unnamed: 0,movieId,title,genres,year
6261,6365,"Matrix Reloaded, The","[Action, Adventure, Sci-Fi, Thriller, IMAX]",2003-01-01
6823,6934,"Matrix Revolutions, The","[Action, Adventure, Sci-Fi, Thriller, IMAX]",2003-01-01
11838,52722,Spider-Man 3,"[Action, Adventure, Sci-Fi, Thriller, IMAX]",2007-01-01
16055,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010-01-01
25218,117646,Dragonheart 2: A New Beginning,"[Action, Adventure, Comedy, Drama, Fantasy, Th...",2000-01-01
