In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Loading data

In [2]:
movie_data = pd.read_csv('movies.csv')
user_ratings = pd.read_csv('ratings.csv')
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Movie data processing (year extraction and title cleaning)

In [3]:
movie_data['year'] = movie_data.title.str.extract(r'\((\d{4})\)', expand=False)
movie_data['title'] = movie_data.title.str.replace(r'\(\d{4}\)', '', regex=True).str.strip()
movie_data.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


### Convert genres to list

In [4]:
movie_data['genres'] = movie_data.genres.str.split('|')
movie_data.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


### Creating a Genre Matrix (One-Hot Encoding)

In [5]:
movies_with_genres = movie_data.copy()
for index, row in movie_data.iterrows():
    for genre in row['genres']:
        movies_with_genres.at[index, genre] = 1
movies_with_genres = movies_with_genres.fillna(0)
movies_with_genres.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Processing of rating data

In [6]:
user_ratings = user_ratings.drop('timestamp', axis=1)
user_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


### Define user input

In [7]:
user_preferences = [
    {'title':'Grumpier Old Men', 'rating':4.5},
    {'title':'Toy Story', 'rating':3},
    {'title':'Jumanji', 'rating':2.5},
    {'title':"Pulp Fiction", 'rating':3.5},
    {'title':'Waiting to Exhale', 'rating':5}
]
rated_movies = pd.DataFrame(user_preferences)
rated_movies

Unnamed: 0,title,rating
0,Grumpier Old Men,4.5
1,Toy Story,3.0
2,Jumanji,2.5
3,Pulp Fiction,3.5
4,Waiting to Exhale,5.0


### Connecting movie information to user input

In [8]:
input_id = movie_data[movie_data['title'].isin(rated_movies['title'].tolist())]
rated_movies = pd.merge(input_id, rated_movies, on='title')
rated_movies = rated_movies.drop('genres', axis=1).drop('year', axis=1)
rated_movies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.0
1,2,Jumanji,2.5
2,3,Grumpier Old Men,4.5
3,4,Waiting to Exhale,5.0
4,296,Pulp Fiction,3.5


### Extracting a matrix of genres for user movies

In [9]:
user_rated_movies = movies_with_genres[movies_with_genres['movieId'].isin(rated_movies['movieId'].tolist())]
user_rated_movies = user_rated_movies.reset_index(drop=True)
user_genre_matrix = user_rated_movies.drop(['movieId', 'title', 'genres', 'year'], axis=1)
user_genre_matrix

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create a user profile

In [10]:
genre_preferences = user_genre_matrix.transpose().dot(rated_movies['rating'])
genre_preferences

Adventure              5.5
Animation              3.0
Children               5.5
Comedy                16.0
Fantasy                5.5
Romance                9.5
Drama                  8.5
Action                 0.0
Crime                  3.5
Thriller               3.5
Horror                 0.0
Mystery                0.0
Sci-Fi                 0.0
War                    0.0
Musical                0.0
Documentary            0.0
IMAX                   0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

### Preparing a genre matrix for all films

In [11]:
all_movies_genres = movies_with_genres.set_index('movieId')
all_movies_genres = all_movies_genres.drop(['title', 'genres', 'year'], axis=1)
all_movies_genres.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculating recommendation scores for each movie

In [12]:
recommendation_scores = ((all_movies_genres * genre_preferences).sum(axis=1)) / genre_preferences.sum()
recommendation_scores.head(10)

movieId
1     0.586777
2     0.272727
3     0.421488
4     0.561983
5     0.264463
6     0.115702
7     0.421488
8     0.181818
9     0.000000
10    0.148760
dtype: float64

### Sorting and selecting the best movies

In [13]:
recommendation_scores = recommendation_scores.sort_values(ascending=False)
recommendation_scores.head(20)

movieId
1907      0.793388
108540    0.793388
92348     0.743802
4306      0.743802
84637     0.743802
56152     0.743802
45672     0.743802
134853    0.727273
4719      0.727273
587       0.710744
970       0.710744
4956      0.710744
138702    0.702479
258       0.694215
148775    0.677686
144606    0.677686
3893      0.677686
1912      0.677686
2065      0.652893
2797      0.652893
dtype: float64

### Show information about recommended movies

In [14]:
movie_data.loc[movie_data['movieId'].isin(recommendation_scores.head(20).index)]

Unnamed: 0,movieId,title,genres,year
222,258,"Kid in King Arthur's Court, A","[Adventure, Children, Comedy, Fantasy, Romance]",1995
505,587,Ghost,"[Comedy, Drama, Fantasy, Romance, Thriller]",1990
743,970,Beat the Devil,"[Adventure, Comedy, Crime, Drama, Romance]",1953
1390,1907,Mulan,"[Adventure, Animation, Children, Comedy, Drama...",1998
1394,1912,Out of Sight,"[Comedy, Crime, Drama, Romance, Thriller]",1998
1530,2065,"Purple Rose of Cairo, The","[Comedy, Drama, Fantasy, Romance]",1985
2103,2797,Big,"[Comedy, Drama, Fantasy, Romance]",1988
2903,3893,Nurse Betty,"[Comedy, Crime, Drama, Romance, Thriller]",2000
3194,4306,Shrek,"[Adventure, Animation, Children, Comedy, Fanta...",2001
3460,4719,Osmosis Jones,"[Action, Animation, Comedy, Crime, Drama, Roma...",2001
