## Collaborative Filtering Recommender

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the movie metadata table from disk and preview its first five rows.
movies = pd.read_csv(filepath_or_buffer='data/MoviesMetadata.csv')
movies.head(n=5)

Unnamed: 0,budget,id,imdb_id,original_language,original_title,overview,popularity,poster_path,release_date,revenue,...,title,vote_average,vote_count,name_genres,id_genres,name_production_countries,iso_3166_1_production_countries,name_production_companies,id_production_companies,year
0,30.0,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,373.554033,...,Toy Story,7.7,5415.0,"Animation, Comedy, Family","16, 35, 10751",United States of America,US,Pixar Animation Studios,3,1995
1,65.0,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,262.797249,...,Jumanji,6.9,2413.0,"Adventure, Fantasy, Family","12, 14, 10751",United States of America,US,"TriStar Pictures, Teitler Film, Interscope Com...","559, 2550, 10201",1995
2,0.0,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,0.0,...,Grumpier Old Men,6.5,92.0,"Romance, Comedy","10749, 35",United States of America,US,"Warner Bros., Lancaster Gate","6194, 19464",1995
3,16.0,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,81.452156,...,Waiting to Exhale,6.1,34.0,"Comedy, Drama, Romance","35, 18, 10749",United States of America,US,Twentieth Century Fox Film Corporation,306,1995
4,0.0,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,76.578911,...,Father of the Bride Part II,5.7,173.0,Comedy,35,United States of America,US,"Sandollar Productions, Touchstone Pictures","5842, 9195",1995


In [3]:
# Remove the id / ISO-code columns; their `name_*` counterparts stay in the frame.
# Rebinding (instead of `inplace=True`) together with `errors='ignore'` keeps this
# cell idempotent: re-running it after the columns are gone no longer raises KeyError.
movies = movies.drop(
    columns=['iso_3166_1_production_countries', 'id_production_companies', 'id_genres'],
    errors='ignore',
)
movies.shape

(45443, 20)

In [4]:
# Read the user ratings table from disk and report its (rows, columns) size.
ratings = pd.read_csv(filepath_or_buffer='data/ratings_small.csv')
ratings.shape

(100004, 4)

In [5]:
# Preview the first five ratings (userId, movieId, rating, timestamp).
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [11]:
# Pins pip itself to 21.3.1 in the kernel's environment (the recorded run replaced 23.1.2).
# NOTE(review): presumably a workaround so the scikit-surprise install in the next cell
# succeeds — confirm, and consider moving this to environment setup outside the notebook.
# A kernel restart may be needed afterwards, as the recorded output notes.
%pip install pip==21.3.1

Collecting pip==21.3.1
  Downloading pip-21.3.1-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-21.3.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m21.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:
%pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit-surprise-1.1.3.tar.gz (771 kB)
  Preparing metadata (setup.py) ... [?25ldone
Using legacy 'setup.py install' for scikit-surprise, since package 'wheel' is not installed.
Installing collected packages: scikit-surprise
    Running setup.py install for scikit-surprise ... [?25ldone
[?25hSuccessfully installed scikit-surprise-1.1.3
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold, cross_validate

# Seed for both the fold split and the SVD factor initialisation, so the RMSE
# values below are reproducible under Restart & Run All.
RANDOM_STATE = 42

# Reader() defaults to a 1-5 rating scale and predictions are clipped to that scale.
# Take the bounds from the data instead, so ratings outside 1-5 (e.g. half-star
# values below 1, if present) are not silently clipped.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Surprise expects exactly three columns, in the order: user id, item id, rating.
ratings_data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd = SVD(random_state=RANDOM_STATE)

# 10-fold cross-validation; returns a dict with per-fold test RMSE, fit time and test time.
cross_validate(
    svd,
    ratings_data,
    measures=['RMSE'],
    cv=KFold(n_splits=10, random_state=RANDOM_STATE, shuffle=True),
)

{'test_rmse': array([0.89601842, 0.88828855, 0.88176298, 0.89923175, 0.8893432 ,
        0.89096748, 0.89024948, 0.89823415, 0.90136982, 0.88900282]),
 'fit_time': (0.6667797565460205,
  0.6897258758544922,
  0.677588939666748,
  0.7285287380218506,
  0.6985700130462646,
  0.6830501556396484,
  0.6768419742584229,
  0.675879955291748,
  0.7046229839324951,
  0.6932270526885986),
 'test_time': (0.03310108184814453,
  0.03116321563720703,
  0.08152413368225098,
  0.03103327751159668,
  0.035234689712524414,
  0.08339476585388184,
  0.02978992462158203,
  0.02930903434753418,
  0.08063387870788574,
  0.030893802642822266)}

In [7]:
# Build a trainset from every available rating (no hold-out) and refit the
# SVD model on it; the fitted model is what the prediction cells use.
train = ratings_data.build_full_trainset()
svd.fit(trainset=train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x127d422d0>

In [8]:
# Attach a title to every rating by inner-joining ratings.movieId onto movies.id,
# keeping only the identifying columns, then order the rows by user.
# NOTE(review): this join assumes ratings.movieId and movies.id use the same id
# scheme — confirm; if the ratings come from a different catalogue the titles
# shown here will be mismatched and a link/mapping table would be needed.
user_rating = ratings.merge(movies, how='inner', left_on='movieId', right_on='id')
user_rating = user_rating.loc[:, ['userId', 'movieId', 'rating', 'original_title']]
user_ratings = user_rating.sort_values('userId')
user_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,original_title
0,1,1371,2.5,Rocky III
182,1,2294,2.0,Jay and Silent Bob Strike Back
235,1,2455,2.5,Vivement dimanche!
47,1,1405,1.0,Greed
140,1,2193,2.0,My Tutor


In [9]:
# Show every rated title for the user whose userId is 10.
user_ratings.loc[user_ratings['userId'] == 10]

Unnamed: 0,userId,movieId,rating,original_title
10666,10,1127,4.0,Princesas
21696,10,2108,3.0,The Breakfast Club
6413,10,318,4.0,The Million Dollar Hotel
21582,10,1499,3.0,Teenage Mutant Ninja Turtles III
7928,10,2841,4.0,Un long dimanche de fiançailles
10506,10,1089,3.0,Point Break
21610,10,1611,5.0,Das Wunder von Bern
21762,10,3019,4.0,Dr. Jekyll and Mr. Hyde
5761,10,592,3.0,The Conversation
17287,10,1358,5.0,A Brief History of Time


In [20]:
 # Look up the id(s) of titles matching 'The Dark Knight' exactly; the recorded
 # output shows two rows with different ids, so the title alone is not a unique key.
 movies.loc[movies['original_title'] == 'The Dark Knight', ['original_title', 'id']]

Unnamed: 0,original_title,id
12478,The Dark Knight,155
28688,The Dark Knight,72003


In [25]:
# Estimate the rating user 10 would give to item 1127 with the fitted SVD model.
# No true rating (r_ui) is passed, so the returned Prediction reports r_ui=None.
svd.predict(uid=10, iid=1127)

Prediction(uid=10, iid=1127, r_ui=None, est=3.5746443918321447, details={'was_impossible': False})

In [27]:
# Summarise the movies frame: index range, per-column non-null counts and dtypes.
# Useful for spotting columns with missing values (e.g. overview, tagline).
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45443 entries, 0 to 45442
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   budget                     45443 non-null  float64
 1   id                         45443 non-null  int64  
 2   imdb_id                    45443 non-null  object 
 3   original_language          45443 non-null  object 
 4   original_title             45443 non-null  object 
 5   overview                   44489 non-null  object 
 6   popularity                 45443 non-null  float64
 7   poster_path                45443 non-null  object 
 8   release_date               45359 non-null  object 
 9   revenue                    45443 non-null  float64
 10  runtime                    45443 non-null  float64
 11  status                     45362 non-null  object 
 12  tagline                    20407 non-null  object 
 13  title                      45443 non-null  obj