In [None]:
!pip install surprise

import sys
import random
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.dataset import DatasetAutoFolds
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import operator
import requests
from zipfile import ZipFile

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505179 sha256=73e688a1ba51965af431022e2d16e8bccd4a949855f3063592f71bbfeedaf20f
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Install

In [None]:
import random

# Load a random sample of the ratings file instead of the whole thing.
# Row 0 (the header line) is always kept; every other row is kept with
# probability SAMPLE_FRACTION and skipped otherwise.
SAMPLE_FRACTION = 0.002
SAMPLE_SEED = 42

# A dedicated, seeded generator makes the sample reproducible on every run of
# this cell and independent of whatever else has consumed the global `random` state.
sample_rng = random.Random(SAMPLE_SEED)

df = pd.read_csv('/content/ratings.csv',
                 skiprows=lambda i: i > 0 and sample_rng.random() > SAMPLE_FRACTION,
                 on_bad_lines="skip")

print(len(df))

49882


In [None]:
# Keep only the first 20000 sampled rows (positional slice) to bound the model's cost.
df = df.iloc[:20000]

In [None]:
# Wrap the ratings frame in a Surprise dataset.
# The rating scale is taken from the data rather than hard-coded: the ratings
# contain half-star values, and Surprise clips estimates to this range, so a
# scale that is too narrow would make the lowest/highest ratings unpredictable.
# The scale is never narrower than the original (1, 5).
ratings = df['rating']
rating_scale = (min(1, float(ratings.min())), max(5, float(ratings.max())))

reader = Reader(rating_scale=rating_scale)
# load_from_df expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(df[['userId','movieId','rating']], reader)
print(type(data))

<class 'surprise.dataset.DatasetAutoFolds'>


In [None]:
# Split the data into training and testing sets (25% held out for testing).
# random_state is fixed so the split -- and every metric and recommendation
# derived from it below -- is reproducible across runs.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
print(type(trainset))

<class 'surprise.trainset.Trainset'>


In [None]:
import itertools

# Preview the first five training ratings.
# trainset.all_ratings() yields Surprise's *inner* ids (small sequential integers),
# not the ids from the ratings file, so translate them back to raw ids before
# printing; otherwise they are not comparable with the test-set rows below.
for inner_uid, inner_iid, rating in itertools.islice(trainset.all_ratings(), 5):
    uid = trainset.to_raw_uid(inner_uid)
    iid = trainset.to_raw_iid(inner_iid)
    print(f"User {uid} rated item {iid} with a rating of {rating}")

# Preview the first five test ratings; the testset already holds raw ids.
print()
for uid, iid, rating in testset[:5]:
    print(f"User {uid} rated item {iid} with a rating of {rating}")

# Sizes of the two splits: number of training ratings, number of test ratings.
print()
print(trainset.n_ratings,len(testset))

User 0 rated item 0 with a rating of 4.5
User 1 rated item 1 with a rating of 4.0
User 1 rated item 701 with a rating of 4.5
User 1 rated item 1147 with a rating of 2.5
User 2 rated item 2 with a rating of 1.5

User 18019 rated item 3793 with a rating of 1.0
User 1899 rated item 1911 with a rating of 3.5
User 56153 rated item 1201 with a rating of 4.0
User 54621 rated item 4226 with a rating of 3.5
User 59190 rated item 1639 with a rating of 3.0

15000 5000


In [None]:
# Train a KNNBasic model, constructed with no arguments (library defaults), on the training set.
# fit() is the cell's last expression, so the notebook also displays whatever it returns.
algo = KNNBasic()
algo.fit(trainset)

In [None]:
# Run the fitted model over the held-out test set.
# The result is sliced and iterated by the cells below, which read the
# uid, r_ui and est attributes of each element.
predictions = algo.test(testset)

In [None]:
# Show the first five held-out (user, item, rating) triples for reference.
testset_head = testset[:5]
for uid, iid, rating in testset_head:
    print(f"User {uid} rated item {iid} with a rating of {rating}")

In [None]:
# Show the first five prediction records, one per line.
predictions_head = predictions[:5]
for prediction in predictions_head:
    print(prediction)

In [None]:
# Report the root-mean-square error of the test-set predictions.
# Left as the cell's last expression, so its return value is displayed as well.
accuracy.rmse(predictions)

In [None]:
# Split the prediction records into three parallel lists in a single pass:
# the true rating, the estimated rating and the user id of each prediction.
true_ratings, est_ratings, uids = [], [], []
for pred in predictions:
    true_ratings.append(pred.r_ui)
    est_ratings.append(pred.est)
    uids.append(pred.uid)

In [None]:
# Pair every true rating with its estimate and plot their joint density.
# NOTE: this rebinds `data`, which earlier referred to the Surprise dataset.
data = pd.DataFrame({"true": true_ratings, "predicted": est_ratings})

g = sns.jointplot(x="true", y="predicted", data=data, kind="kde")
g.fig.suptitle('Test predictions', fontsize=12)
plt.show()

In [None]:
# Load the movie catalogue and build a movieId -> title lookup table.
movies = pd.read_csv('/content/movies.csv')
mapping = dict(zip(movies["movieId"], movies["title"]))

In [None]:
# De-duplicate the user ids that appear in the test predictions.
users = [*set(uids)]

In [None]:
# Recommend up to five unseen movies for each of the first 30 test users.

# Anti-testset: one row per (user, item) pair that has no rating in the
# training set; the first field of each row is the user id, the second the item id.
items = trainset.build_anti_testset()

target_users = users[0:30]

# Bucket the anti-testset rows of the target users in ONE pass over `items`,
# instead of re-scanning the whole (potentially huge) list once per user.
# Row order within each bucket is the same as in `items`.
items_by_user = {user: [] for user in target_users}
for item in items:
    bucket = items_by_user.get(item[0])
    if bucket is not None:
        bucket.append(item)

for user in target_users:
    # Users absent from the training set have an empty bucket and are skipped below.
    user_items = items_by_user[user]
    # generate recommendation
    recommendations = algo.test(user_items)
    if len(recommendations)>0:
        # Highest estimated rating first; the sort is stable, so ties keep anti-testset order.
        recommendations.sort(key=operator.attrgetter("est"), reverse=True)
        print(f"For User {user}, Recommendations:")
        for r in recommendations[0:5]:
            # Fall back to a placeholder instead of raising KeyError when a
            # rated movieId is missing from the movies catalogue.
            title = mapping.get(r.iid, f"<unknown movieId {r.iid}>")
            print(f"  {title} : Estimated rating {round(r.est,4)}")

For User 57344, Recommendations:
  Training Day (2001) : Estimated rating 3.5238
  Paddington (2014) : Estimated rating 3.5238
  Napoleon Dynamite (2004) : Estimated rating 3.5238
  Outsiders, The (1983) : Estimated rating 3.5238
  Blade Runner (1982) : Estimated rating 3.5238
For User 32778, Recommendations:
  Pretty Woman (1990) : Estimated rating 5
  Shaft (1971) : Estimated rating 4.0
  Training Day (2001) : Estimated rating 3.5238
  Paddington (2014) : Estimated rating 3.5238
  Napoleon Dynamite (2004) : Estimated rating 3.5238
For User 13, Recommendations:
  The Fate of the Furious (2017) : Estimated rating 5
  Gone Baby Gone (2007) : Estimated rating 4.0
  Training Day (2001) : Estimated rating 3.5238
  Paddington (2014) : Estimated rating 3.5238
  Napoleon Dynamite (2004) : Estimated rating 3.5238
For User 19, Recommendations:
  Ace Ventura: Pet Detective (1994) : Estimated rating 5
  Star Wars: Episode V - The Empire Strikes Back (1980) : Estimated rating 4.5
  Kill Bill: Vol.