In this task, I am going to develop used based colloberative filtering to suggest movies based on the historical data for similar users

In [1]:
# Downloading the Ratings Dataset
!wget https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/movielens_data/ratings.csv

--2022-07-12 14:17:11--  https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/movielens_data/ratings.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2438266 (2.3M) [text/plain]
Saving to: ‘ratings.csv’


2022-07-12 14:17:12 (38.7 MB/s) - ‘ratings.csv’ saved [2438266/2438266]



In [2]:
# Downloading the Movies Dataset
!wget https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/movielens_data/movies.csv

--2022-07-12 14:17:12--  https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/movielens_data/movies.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 458390 (448K) [text/plain]
Saving to: ‘movies.csv’


2022-07-12 14:17:12 (12.4 MB/s) - ‘movies.csv’ saved [458390/458390]



In [3]:
# install scikit-surprise library
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 5.1 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633743 sha256=5fee88954ef9e1d252b6494a3d3043b95f3d41f49a85fcc669b28581b7a2dca5
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [4]:
# let us import the libraries

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [5]:
# load the movies dataset
movies =pd.read_csv('movies.csv')
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [6]:
# load the ratings dataset

ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [7]:
ratings = ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [8]:
# save the ratings Dataframe into csv format as surprise library reads data only from csv

ratings.to_csv('ratings_modified.csv', header=None, index=False)

In [9]:
# import Surprise library
from surprise import Dataset, Reader

In [10]:
file_path = 'ratings_modified.csv'
reader = Reader(line_format='user item rating', sep=',',rating_scale = (1,5) )

In [11]:
data = Dataset.load_from_file(file_path, reader=reader)


In [12]:
train = data.build_full_trainset()

print('The number of users:', train.n_users)
print('The number of movies:', train.n_items)


The number of users: 671
The number of movies: 9066


In [13]:
# let us build the model
from surprise import KNNWithMeans

# user based colloberative filtering
similarity_options = {'name':'pearson','user_based':True}

algo = KNNWithMeans(k=15,min_k = 5,sim_options=similarity_options,
                    verbose=True)

In [14]:
# let's train and check the accuracy of the model
from surprise.model_selection import cross_validate

results = cross_validate(algo = algo,data=data,measures=['RMSE'],
                         cv= 5, return_train_measures=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [15]:
print(results['test_rmse'])

[0.92794951 0.93148117 0.91588968 0.91311614 0.91612039]


In [16]:
print(results['test_rmse'].mean())

0.920911378394677


In [17]:
# let us train the model
algo.fit(train)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f0fef749250>

In [18]:
# let us create a dictionary with movieId and Movie title
movie_id_to_title = {}
for m_id, title in zip(movies['movieId'].values, movies['title'].values):
  movie_id_to_title[str(m_id)]=title

In [19]:
#show top 10 items
list(movie_id_to_title.items())[:10]


[('1', 'Toy Story (1995)'),
 ('2', 'Jumanji (1995)'),
 ('3', 'Grumpier Old Men (1995)'),
 ('4', 'Waiting to Exhale (1995)'),
 ('5', 'Father of the Bride Part II (1995)'),
 ('6', 'Heat (1995)'),
 ('7', 'Sabrina (1995)'),
 ('8', 'Tom and Huck (1995)'),
 ('9', 'Sudden Death (1995)'),
 ('10', 'GoldenEye (1995)')]

In [20]:
val = algo.predict(uid = '1', iid='31')
val

Prediction(uid='1', iid='31', r_ui=None, est=2.1823846561780167, details={'actual_k': 9, 'was_impossible': False})

In [21]:
print(movie_id_to_title[val[1]], val[3])

Dangerous Minds (1995) 2.1823846561780167


In [22]:
# create Top n predictions


from collections import defaultdict

def get_top_n(predictions, n=10):
  top_n = defaultdict(list)
  for uid, iid, true_r,est,_ in predictions:
    top_n[uid].append((iid, est))

  for uid, user_ratings in top_n.items():
    user_ratings.sort(key = lambda x:x[1], reverse=True)
    top_n[uid] = user_ratings[:n]
  return top_n

In [23]:
testdata = train.build_anti_testset()
predictions = algo.test(testdata)


In [24]:
top_n = get_top_n(predictions, n = 10)

In [25]:
# Lets Create a Function to Predict Movies to the Users based on the Movies Watched Previously
def UserPredictions(user_id , top_n , item_map):
    print("Predictions for User Id : " , user_id)
    user_ratings = top_n[user_id]
    for item_id , rating in user_ratings :
        print(item_map[item_id] , " : " , rating)

In [26]:
UserPredictions('3' , top_n , movie_id_to_title)

Predictions for User Id :  3
Bully (2001)  :  4.872069239069532
It Follows (2014)  :  4.852633955245333
Yojimbo (1961)  :  4.793681332869258
Big Night (1996)  :  4.66542975989837
Paradise Lost: The Child Murders at Robin Hood Hills (1996)  :  4.652305800489034
Lion in Winter, The (1968)  :  4.645741231422388
Gladiator (1992)  :  4.6392622966143575
I Shot Andy Warhol (1996)  :  4.633821841582327
Swingers (1996)  :  4.632338314852094
Pride and Prejudice (1995)  :  4.624189574923341
