# Recommendation system for movies

In [1]:
# Activate auto reload features

%load_ext autoreload
%autoreload 2


## 1. Prepare data

In [2]:
import pandas as pd
import os

### Load the csv rating data

In [6]:
DATA_DIRECTORY = 'data'
CSV_PATH = os.path.join(DATA_DIRECTORY, 'ratings.csv')
rating_data = pd.read_csv(CSV_PATH)

### Preview the data

In [7]:
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [8]:
rating_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
userId       100004 non-null int64
movieId      100004 non-null int64
rating       100004 non-null float64
timestamp    100004 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


### Here you will work on preparing the data

#### For example:
1. Check for invalid values such as nulls, NaNs.
2. Decide what to do with any invalid value them, Delete them? fill them with something as the mean value?
3. Transform data in anyway needed

In [53]:
# Drop timestamp column
rating_data = rating_data.drop(columns=['timestamp', ])

### Examine data

#### How many records?

In [11]:
len(rating_data)

100004

#### How many unique users are there?

In [9]:
len(pd.unique(rating_data['userId']))

671

#### How many unique movies are there?

In [12]:
len(pd.unique(rating_data['movieId']))

9066

### Load the CSV Movie Metadata

In [20]:
CSV_META_PATH = os.path.join(DATA_DIRECTORY, 'movies.csv')
metadata = pd.read_csv(CSV_META_PATH, index_col=0)

#### Preview Movie Metada

In [21]:
metadata.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [22]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9125 entries, 1 to 164979
Data columns (total 2 columns):
title     9125 non-null object
genres    9125 non-null object
dtypes: object(2)
memory usage: 213.9+ KB


### Function to extract movie metadata

In [23]:
def movie_meta(movie_id):
    title = metadata.at[movie_id, 'title']
    genres = metadata.at[movie_id, 'genres']
    return title, genres,

In [25]:
movie_meta(2)

('Jumanji (1995)', 'Adventure|Children|Fantasy')

#### Function to get top_n most rated skills for an specific user id

In [64]:
def favorite_movies(user_id, top_n_fav):
    device_ratings = rating_data.loc[rating_data['userId'] == user_id]
    sorted_ratings = pd.DataFrame.sort_values(rating_data, ['rating'], ascending=[0])[:top_n_fav]
    sorted_ratings['MetaData'] = sorted_ratings['movieId'].apply(movie_meta)
    return sorted_ratings

In [37]:
favorite_movies(85, 3)

Unnamed: 0,userId,movieId,rating,timestamp,MetaData
33889,242,2929,5.0,956687566,"(Reds (1981), Drama|Romance)"
36867,265,1233,5.0,960056214,"(Boot, Das (Boat, The) (1981), Action|Drama|War)"
46251,337,1356,5.0,1447176421,"(Star Trek: First Contact (1996), Action|Adven..."


####  Most rated movies by user (Rate count)

In [39]:
movies_per_user = rating_data['movieId'].value_counts()
# Just for preview the name instead of the Skill Id
rating_data['movieId'].apply(movie_meta).value_counts().head(15)

(Forrest Gump (1994), Comedy|Drama|Romance|War)                                     341
(Pulp Fiction (1994), Comedy|Crime|Drama|Thriller)                                  324
(Shawshank Redemption, The (1994), Crime|Drama)                                     311
(Silence of the Lambs, The (1991), Crime|Horror|Thriller)                           304
(Star Wars: Episode IV - A New Hope (1977), Action|Adventure|Sci-Fi)                291
(Jurassic Park (1993), Action|Adventure|Sci-Fi|Thriller)                            274
(Matrix, The (1999), Action|Sci-Fi|Thriller)                                        259
(Toy Story (1995), Adventure|Animation|Children|Comedy|Fantasy)                     247
(Schindler's List (1993), Drama|War)                                                244
(Terminator 2: Judgment Day (1991), Action|Sci-Fi)                                  237
(Star Wars: Episode V - The Empire Strikes Back (1980), Action|Adventure|Sci-Fi)    234
(Braveheart (1995), Action|Drama

#### Users that has rated more movies

In [40]:
user_per_movie = rating_data['userId'].value_counts()
user_per_movie.head(15)

547    2391
564    1868
624    1735
15     1700
73     1610
452    1340
468    1291
380    1063
311    1019
30     1011
294     947
509     923
580     922
213     910
212     876
Name: userId, dtype: int64

#### Optional: Consider cleaning the data by removing users or movies that does not have much rate data

In [45]:
REQUIRED_RATINGS = 5
# Only Skills that have been rated more than REQUIRED_RATING times
rating_data = rating_data.loc[rating_data['movieId'].isin(movies_per_user[movies_per_user >= REQUIRED_RATINGS].index)]

In [47]:
rating_data = rating_data.loc[rating_data['userId'].isin(user_per_movie[user_per_movie >= REQUIRED_RATINGS].index)]

## 2. Train model with rating data

In [56]:
reader = Reader()
rating_dataset = Dataset.load_from_df(rating_data, reader=reader)
train_set = rating_dataset.build_full_trainset()

In [60]:
from models import KnnUB, KnnIB, SVDRecommender

user_based_model = KnnUB()
movies_base_model = KnnIB()
svd_base_model = SVDRecommender()

In [61]:
user_based_model.fit(train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<models.KnnUB at 0x19846e5d048>

In [62]:
movies_base_model.fit(train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<models.KnnIB at 0x19846e5d080>

In [63]:
svd_base_model.fit(train_set)

<models.SVDRecommender at 0x19846e5d0b8>

## 3. Test Model Recommendations

In [69]:
n = 10
test_user = 85
favorite_movies(test_user, 15)

Unnamed: 0,userId,movieId,rating,MetaData
59858,432,48780,5.0,"(Prestige, The (2006), Drama|Mystery|Sci-Fi|Th..."
46030,331,5991,5.0,"(Chicago (2002), Comedy|Crime|Drama|Musical)"
12584,78,38499,5.0,"(Angels in America (2003), Drama|Fantasy)"
12585,78,39183,5.0,"(Brokeback Mountain (2005), Drama|Romance)"
74200,518,593,5.0,"(Silence of the Lambs, The (1991), Crime|Horro..."
57335,415,110,5.0,"(Braveheart (1995), Action|Drama|War)"
89007,592,3948,5.0,"(Meet the Parents (2000), Comedy)"
46031,331,52975,5.0,"(Hairspray (2007), Comedy|Drama|Musical)"
12591,78,46723,5.0,"(Babel (2006), Drama|Thriller)"
57334,415,50,5.0,"(Usual Suspects, The (1995), Crime|Mystery|Thr..."


### Test collaborative filtering based on user recommendations

In [70]:
recommendations = user_based_model.get_top_n_recommendations(test_user, n=n)
for movie_id, rate_sum in recommendations:
        print(movie_meta(movie_id), rate_sum)

Computing the cosine similarity matrix...
Done computing similarity matrix.
('Inception (2010)', 'Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX') 3.3
('Star Wars: Episode V - The Empire Strikes Back (1980)', 'Action|Adventure|Sci-Fi') 2.4
('Bourne Identity, The (1988)', 'Action|Adventure|Drama|Mystery|Thriller') 2.0
('Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)', 'Action|Drama|Romance') 2.0
('Dark Knight, The (2008)', 'Action|Crime|Drama|IMAX') 2.0
('Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)', 'Action|Adventure|Western') 1.9
('Departed, The (2006)', 'Crime|Drama|Thriller') 1.9
('Dark Knight Rises, The (2012)', 'Action|Adventure|Crime|IMAX') 1.9
('Back to the Future (1985)', 'Adventure|Comedy|Sci-Fi') 1.9
('Gravity (2013)', 'Action|Sci-Fi|IMAX') 1.8
('Fight Club (1999)', 'Action|Crime|Drama|Thriller') 1.8
