# Recommendation system for movies

In [1]:
# Activate auto reload features

%load_ext autoreload
%autoreload 2


## 1. Prepare data

In [2]:
import pandas as pd
import os

### Load the csv rating data

In [3]:
# Folder holding the CSV inputs, given relative to the notebook's working directory.
DATA_DIRECTORY = 'data'
CSV_PATH = os.path.join(DATA_DIRECTORY, 'ratings.csv')
# Ratings table; the preview below shows the columns userId, movieId, rating, timestamp.
rating_data = pd.read_csv(CSV_PATH)

### Preview the data

In [4]:
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
rating_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
userId       100004 non-null int64
movieId      100004 non-null int64
rating       100004 non-null float64
timestamp    100004 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


### Here you will work on preparing the data

#### For example:
1. Check for invalid values such as nulls, NaNs.
2. Decide what to do with any invalid values: delete them, or fill them with something such as the mean value?
3. Transform the data in any way needed

In [6]:
# Drop the timestamp column.
# errors='ignore' keeps this cell re-runnable: rating_data is reassigned here, so a
# second run would otherwise raise KeyError because the column is already gone.
rating_data = rating_data.drop(columns=['timestamp'], errors='ignore')

### Examine data

#### How many records?

In [7]:
len(rating_data)

100004

#### How many unique users are there?

In [8]:
len(pd.unique(rating_data['userId']))

671

#### How many unique movies are there?

In [9]:
len(pd.unique(rating_data['movieId']))

9066

### Load the CSV Movie Metadata

In [10]:
CSV_META_PATH = os.path.join(DATA_DIRECTORY, 'movies.csv')
# Use the first CSV column as the index (shown as movieId in the preview below)
# so that rows can be looked up directly by movie id.
metadata = pd.read_csv(CSV_META_PATH, index_col=0)

#### Preview Movie Metadata

In [11]:
metadata.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [12]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9125 entries, 1 to 164979
Data columns (total 2 columns):
title     9125 non-null object
genres    9125 non-null object
dtypes: object(2)
memory usage: 213.9+ KB


### Function to extract movie metadata

In [13]:
def movie_meta(movie_id):
    """Look up the title and genres of a single movie.

    Reads the module-level ``metadata`` frame with ``.at``, using ``movie_id``
    as the row label.

    Args:
        movie_id: Row label of the movie in ``metadata``.

    Returns:
        A ``(title, genres)`` tuple.
    """
    return tuple(metadata.at[movie_id, field] for field in ('title', 'genres'))

In [14]:
movie_meta(2)

('Jumanji (1995)', 'Adventure|Children|Fantasy')

#### Function to get the top_n highest-rated movies for a specific user id

In [15]:
def favorite_movies(user_id, top_n_fav, ratings=None, meta_lookup=None):
    """Return the ``top_n_fav`` highest-rated movies of one user.

    Args:
        user_id: Value matched against the ``userId`` column.
        top_n_fav: Maximum number of rows to return.
        ratings: Frame with ``userId``, ``movieId`` and ``rating`` columns.
            Defaults to the module-level ``rating_data`` as it is at call time.
        meta_lookup: Callable applied to each ``movieId`` to build the
            ``MetaData`` column. Defaults to the module-level ``movie_meta``.

    Returns:
        A new DataFrame holding the user's rows sorted by ``rating`` in
        descending order, truncated to ``top_n_fav`` rows, with an extra
        ``MetaData`` column. The input frame is left untouched.
    """
    if ratings is None:
        ratings = rating_data
    if meta_lookup is None:
        meta_lookup = movie_meta

    user_ratings = ratings.loc[ratings['userId'] == user_id]
    top_rated = user_ratings.sort_values('rating', ascending=False).head(top_n_fav)
    # .assign builds a fresh frame rather than writing a column onto a slice,
    # which sidesteps pandas' chained-assignment pitfalls.
    return top_rated.assign(MetaData=top_rated['movieId'].apply(meta_lookup))

In [16]:
favorite_movies(85, 5)

Unnamed: 0,userId,movieId,rating,MetaData
13184,85,2,5.0,"(Jumanji (1995), Adventure|Children|Fantasy)"
13217,85,255,5.0,"(Jerky Boys, The (1995), Comedy)"
13287,85,648,5.0,"(Mission: Impossible (1996), Action|Adventure|..."
13278,85,589,5.0,"(Terminator 2: Judgment Day (1991), Action|Sci..."
13262,85,480,5.0,"(Jurassic Park (1993), Action|Adventure|Sci-Fi..."


#### Most rated movies (number of ratings per movie)

In [17]:
# Number of ratings each movie received, keyed by movieId
# (despite the variable name, this counts ratings per movie, not movies per user).
movies_per_user = rating_data['movieId'].value_counts()
# Preview using the movie title/genres instead of the raw movie id
rating_data['movieId'].apply(movie_meta).value_counts().head(20)

(Forrest Gump (1994), Comedy|Drama|Romance|War)                                                       341
(Pulp Fiction (1994), Comedy|Crime|Drama|Thriller)                                                    324
(Shawshank Redemption, The (1994), Crime|Drama)                                                       311
(Silence of the Lambs, The (1991), Crime|Horror|Thriller)                                             304
(Star Wars: Episode IV - A New Hope (1977), Action|Adventure|Sci-Fi)                                  291
(Jurassic Park (1993), Action|Adventure|Sci-Fi|Thriller)                                              274
(Matrix, The (1999), Action|Sci-Fi|Thriller)                                                          259
(Toy Story (1995), Adventure|Animation|Children|Comedy|Fantasy)                                       247
(Schindler's List (1993), Drama|War)                                                                  244
(Terminator 2: Judgment Day (1991), Action|Sci

#### Users who have rated the most movies

In [18]:
# Number of ratings each user submitted, keyed by userId
# (despite the variable name, this counts ratings per user).
user_per_movie = rating_data['userId'].value_counts()
user_per_movie.head(15)

547    2391
564    1868
624    1735
15     1700
73     1610
452    1340
468    1291
380    1063
311    1019
30     1011
294     947
509     923
580     922
213     910
212     876
Name: userId, dtype: int64

#### Optional: Consider cleaning the data by removing users or movies that do not have much rating data

In [19]:
# Minimum number of ratings a movie (and, in the next cell, a user) needs in order to be kept.
REQUIRED_RATINGS = 3
# Keep only movies that have been rated at least REQUIRED_RATINGS times
rating_data = rating_data.loc[rating_data['movieId'].isin(movies_per_user[movies_per_user >= REQUIRED_RATINGS].index)]

In [20]:
# Recount ratings per user on the current rating_data: the counts held in
# user_per_movie were taken before rarely-rated movies were filtered out, so they
# can overstate how many ratings a user still has.
ratings_per_user = rating_data['userId'].value_counts()
# Keep only users that still have at least REQUIRED_RATINGS ratings
rating_data = rating_data.loc[rating_data['userId'].isin(ratings_per_user[ratings_per_user >= REQUIRED_RATINGS].index)]

## 2. Optional: try it yourself

### What movies do you like?

In [21]:
metadata.loc[metadata['title'].str.contains('Life Is')]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2324,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War
32943,Life Is Sweet (1990),Comedy|Drama
130970,George Carlin: Life Is Worth Losing (2005),Comedy
155611,Life Is Sacred (2014),Documentary


In [22]:
# Ratings for one extra, hand-entered user. Every row shares the same user id;
# the pairs below are (movieId, rating).
NEW_USER_ID = 700
NEW_USER_RATINGS = [
    (260, 3), (1196, 5),
    (1210, 4), (2628, 4),
    (2324, 5), (98124, 4),
    (99813, 5), (33794, 4),
    (115617, 5), (134853, 4),
    (5349, 5), (8636, 4),
    (2028, 5), (61132, 4),
    (68954, 5), (60069, 5),
    (788, 1), (8961, 3),
    (30749, 4), (97225, 4),
    (4306, 3), (45666, 5),
    (4816, 4), (4995, 5),
    (109487, 5), (64695, 5),
]
newdata = pd.DataFrame(
    [[NEW_USER_ID, movie_id, rating] for movie_id, rating in NEW_USER_RATINGS],
    columns=['userId', 'movieId', 'rating'],
)

In [23]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement and, with ignore_index=True, renumbers the rows the same way.
rating_data = pd.concat([rating_data, newdata], ignore_index=True)
rating_data.tail()

Unnamed: 0,userId,movieId,rating
94558,700,45666,5.0
94559,700,4816,4.0
94560,700,4995,5.0
94561,700,109487,5.0
94562,700,64695,5.0


## 3. Train model with rating data

In [24]:
from surprise import Reader, Dataset

# Take the rating scale from the data instead of relying on Reader()'s default of
# (1, 5): the preview above shows half-star ratings (e.g. 2.5), so the data may
# well contain values below 1 — deriving the bounds avoids guessing.
rating_scale = (rating_data['rating'].min(), rating_data['rating'].max())
reader = Reader(rating_scale=rating_scale)
# load_from_df expects exactly three columns in the order user, item, rating;
# selecting them explicitly keeps this cell correct if other columns are added.
rating_dataset = Dataset.load_from_df(rating_data[['userId', 'movieId', 'rating']], reader=reader)
# Train on every available rating (no held-out test split).
train_set = rating_dataset.build_full_trainset()

In [25]:
# Recommender classes from the project-local `models` module.
# NOTE(review): judging by the names and the section headings below, these are a
# user-based KNN, an item-based KNN and an SVD recommender — confirm in models.py.
from models import KnnUB, KnnIB, SVDRecommender

user_based_model = KnnUB()
movies_based_model = KnnIB()
svd_based_model = SVDRecommender()

In [26]:
user_based_model.fit(train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<models.KnnUB at 0x250ab78bc50>

In [27]:
movies_based_model.fit(train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<models.KnnIB at 0x250aa8ff588>

In [28]:
svd_based_model.fit(train_set)

<models.SVDRecommender at 0x250ab3b8780>

## 4. Test Model Recommendations

In [29]:
n = 10
test_user = 700
favorite_movies(test_user, 30)

Unnamed: 0,userId,movieId,rating,MetaData
94562,700,64695,5.0,(Sword of the Stranger (Sutorejia: Mukô hadan)...
94552,700,60069,5.0,"(WALL·E (2008), Adventure|Animation|Children|R..."
94561,700,109487,5.0,"(Interstellar (2014), Sci-Fi|IMAX)"
94560,700,4995,5.0,"(Beautiful Mind, A (2001), Drama|Romance)"
94541,700,2324,5.0,"(Life Is Beautiful (La Vita è bella) (1997), C..."
94543,700,99813,5.0,"(Batman: The Dark Knight Returns, Part 2 (2013..."
94558,700,45666,5.0,"(Nacho Libre (2006), Comedy)"
94545,700,115617,5.0,"(Big Hero 6 (2014), Action|Animation|Comedy)"
94547,700,5349,5.0,"(Spider-Man (2002), Action|Adventure|Sci-Fi|Th..."
94549,700,2028,5.0,"(Saving Private Ryan (1998), Action|Drama|War)"


### Test collaborative filtering based on user

In [30]:
# Top recommendations for test_user from the user-based model, printed as
# ((title, genres), score).
# NOTE(review): the saved output lists 11 rows although n is 10 — verify the
# top-n cut-off in the models module.
recommendations = user_based_model.get_top_n_recommendations(test_user, n=n)
for movie_id, rate_sum in recommendations:
    print(movie_meta(movie_id), rate_sum)

('Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy') 4.2
('Fargo (1996)', 'Comedy|Crime|Drama|Thriller') 4.0
("Schindler's List (1993)", 'Drama|War') 3.5999999999999996
('Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 'Mystery|Sci-Fi|Thriller') 3.2
('Jurassic Park (1993)', 'Action|Adventure|Sci-Fi|Thriller') 3.0
('Forrest Gump (1994)', 'Comedy|Drama|Romance|War') 2.8
('Rock, The (1996)', 'Action|Adventure|Thriller') 2.8
('Pulp Fiction (1994)', 'Comedy|Crime|Drama|Thriller') 2.7
('Ransom (1996)', 'Crime|Thriller') 2.6
('Clueless (1995)', 'Comedy|Romance') 2.6
("Things to Do in Denver When You're Dead (1995)", 'Crime|Drama|Romance') 2.6


### Test collaborative filtering based on movies

In [31]:
# Top recommendations for test_user from the movie-based model, printed as
# ((title, genres), score).
# NOTE(review): the saved output lists 11 rows although n is 10 — verify the
# top-n cut-off in the models module.
recommendations = movies_based_model.get_top_n_recommendations(test_user, n=n)
for movie_id, rate_sum in recommendations:
    print(movie_meta(movie_id), rate_sum)

('Judge, The (2014)', 'Drama') 9.944356441656772
('Wimbledon (2004)', 'Comedy|Romance') 9.934491755444562
('How to Train Your Dragon 2 (2014)', 'Action|Adventure|Animation') 9.892940264634786
('World Trade Center (2006)', 'Drama') 9.876156435355753
("Kiki's Delivery Service (Majo no takkyûbin) (1989)", 'Adventure|Animation|Children|Drama|Fantasy') 9.875939879297228
('Watch, The (2012)', 'Comedy|Sci-Fi') 9.87536537851153
('Fury (2014)', 'Action|Drama|War') 9.874972783493208
('Warrior (2011)', 'Drama') 9.872916337066046
('Impossible, The (Imposible, Lo) (2012)', 'Drama|Thriller') 9.870377147524998
('Spanglish (2004)', 'Comedy|Drama|Romance') 9.870040482511389
('The Fault in Our Stars (2014)', 'Drama|Romance') 9.852833257721327


### Test matrix factorization model

In [32]:
# Top recommendations for test_user from the SVD model, printed as
# ((title, genres), score).
recommendations = svd_based_model.get_top_n_recommendations(test_user, n=n)
for movie_id, rate_sum in recommendations:
    print(movie_meta(movie_id), rate_sum)

('Fargo (1996)', 'Comedy|Crime|Drama|Thriller') 5
('On the Waterfront (1954)', 'Crime|Drama') 4.975781122902713
('Raging Bull (1980)', 'Drama') 4.935805798411434
('Godfather, The (1972)', 'Crime|Drama') 4.9305419561151265
('Pulp Fiction (1994)', 'Comedy|Crime|Drama|Thriller') 4.90567548720376
('Maltese Falcon, The (1941)', 'Film-Noir|Mystery') 4.90506032591284
('Usual Suspects, The (1995)', 'Crime|Mystery|Thriller') 4.894701141109956
('Modern Times (1936)', 'Comedy|Drama|Romance') 4.883693608766647
('12 Angry Men (1957)', 'Drama') 4.859791029413953
('Casablanca (1942)', 'Drama|Romance') 4.857397982061263


### Save the models

In [33]:
from surprise import dump

# Persist each fitted model under its own file name in the working directory.
for file_name, model in (
    ('user_based', user_based_model),
    ('movie_based', movies_based_model),
    ('svd_based', svd_based_model),
):
    dump.dump(file_name, algo=model)

### Test saved model recommendations

In [34]:
# Reload the user-based model saved above and check that it reproduces the
# recommendations of the in-memory model.
# dump.load returns a pair; only its second element (the algorithm) is used here.
# NOTE(review): surprise's dump appears to be pickle-based — only load files you
# created yourself.
_, loaded_model = dump.load('user_based')
recommendations = loaded_model.get_top_n_recommendations(test_user, n=n)
for movie_id, rate_sum in recommendations:
    print(movie_meta(movie_id), rate_sum)

('Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy') 4.2
('Fargo (1996)', 'Comedy|Crime|Drama|Thriller') 4.0
("Schindler's List (1993)", 'Drama|War') 3.5999999999999996
('Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 'Mystery|Sci-Fi|Thriller') 3.2
('Jurassic Park (1993)', 'Action|Adventure|Sci-Fi|Thriller') 3.0
('Forrest Gump (1994)', 'Comedy|Drama|Romance|War') 2.8
('Rock, The (1996)', 'Action|Adventure|Thriller') 2.8
('Pulp Fiction (1994)', 'Comedy|Crime|Drama|Thriller') 2.7
('Ransom (1996)', 'Crime|Thriller') 2.6
('Clueless (1995)', 'Comedy|Romance') 2.6
("Things to Do in Denver When You're Dead (1995)", 'Crime|Drama|Romance') 2.6
