In [1]:
try:
    import surprise # Surprise focuses on collaborative filtering, 
    # which is a popular technique for building recommender systems based on user-item interactions.
except:
    !pip install scikit-surprise
    import surprise

In [2]:
import os

from surprise import SVD        # SVD - matrix-factorization algorithm (Singular Value Decomposition)
from surprise import Dataset    # Dataset - class in surprise for loading rating data
from surprise import accuracy   # accuracy - module in surprise with accuracy metrics (rmse is used below)
from surprise import Reader     # Reader - class in surprise describing how ratings are parsed
from surprise.model_selection import train_test_split  # splits a Dataset into a trainset and a testset

import pandas as pd

In [3]:
# !wget https://github.com/ALKONDR/netology-recsys/archive/refs/heads/master.zip
# !unzip master.zip

In [4]:
# Directory holding the MovieLens CSV files. Set the MOVIELENS_DIR environment
# variable to point somewhere else instead of editing the notebook; the default
# is the original author's local path.
DATA_DIR = os.path.abspath(
    os.environ.get(
        "MOVIELENS_DIR",
        "/Users/velo1/SynologyDrive/GIT_syno/data/MovieLens _ml-latest-small",
    )
)

# The working directory is still switched, as before, in case other cells open
# files by relative path. DATA_DIR is absolute, so the reads below do not
# depend on this call.
os.chdir(path=DATA_DIR)

# Load each table by explicit path rather than relying on the current directory.
links = pd.read_csv(os.path.join(DATA_DIR, "links.csv"))
movies = pd.read_csv(os.path.join(DATA_DIR, "movies.csv"))
ratings = pd.read_csv(os.path.join(DATA_DIR, "ratings.csv"))
tags = pd.read_csv(os.path.join(DATA_DIR, "tags.csv"))

In [5]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
# Attach every rating to its movie: ratings are indexed by movieId and joined
# onto the movies table, so a movie with several ratings appears on several rows.
ratings_by_movie = ratings.set_index('movieId')
movies_with_ratings = (
    movies
    .join(ratings_by_movie, on='movieId')
    .reset_index(drop=True)   # the join repeats the movies index; renumber rows
    .dropna()                 # drop rows with any missing value (e.g. movies without ratings)
)

In [7]:
# Preview the joined table: one row per (movie, user rating) pair.
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,3.0,851866700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9.0,4.0,938629200.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13.0,5.0,1331380000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.0,997938300.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19.0,3.0,855190100.0


In [8]:
# Build the three-column (user id, item id, rating) frame.
# Movie titles, not movieIds, serve as the item ids.
dataset = pd.DataFrame(
    dict(
        uid=movies_with_ratings['userId'],
        iid=movies_with_ratings['title'],
        rating=movies_with_ratings['rating'],
    )
)

In [9]:
# Select the user, item and rating columns and give them the uid / iid / rating
# names in a single expression that returns a new frame.
# NOTE(review): titles are used as item ids, so distinct movieIds that share a
# title would be treated as one item - confirm this is intended.
dataset = movies_with_ratings[['userId', 'title', 'rating']].rename(
    columns={'userId': 'uid', 'title': 'iid'}
)

In [10]:
# IPython help: show the signature and docstring of Reader (development aid).
?Reader

[0;31mInit signature:[0m
[0mReader[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mline_format[0m[0;34m=[0m[0;34m'user item rating'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msep[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrating_scale[0m[0;34m=[0m[0;34m([0m[0;36m1[0m[0;34m,[0m [0;36m5[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mskip_lines[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
The Reader class is used to parse a file containing ratings.

Such a file is assumed to specify only one rating per line, and each line
needs to respect the following structure: ::

    user ; item ; rating ; [timestamp]

where the order of the fields and the separator (here ';') may be
arbitrarily defined (see below).  brackets indicate that the timestamp
field is optional.

For each bui

In [11]:
# Reader normally describes how to parse a ratings file that holds one rating
# per line in the form
#     user ; item ; rating ; [timestamp]
# The ratings here come from a DataFrame, so only the rating scale is supplied.
# rating_scale is the scale used for every rating in the dataset; the default is (1, 5).
rating_bounds = (0.5, 5.0)
reader = Reader(rating_scale=rating_bounds)

# load_from_df loads a dataset defined in a pandas dataframe. The dataframe must
# have three columns, corresponding to the user (raw) ids, the item (raw) ids,
# and the ratings, in this order.
data = Dataset.load_from_df(dataset, reader)

In [12]:
# Hold out 15% of the ratings as a test set; random_state is fixed so the split
# is the same on every run.
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [13]:
%%time
algo = SVD(n_factors=20, n_epochs=20)   # n_factors - The number of factors. Default is ``100``.
                                    # n_epochs - The number of iteration of the SGD procedure. Default is ``20``.
algo.fit(trainset)                    # fit - Train an algorithm on a given dataset.

CPU times: user 575 ms, sys: 4.95 ms, total: 580 ms
Wall time: 583 ms


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11c032e90>

In [14]:
# Predict a rating for every (user, item, true rating) entry of the held-out
# test set, then show the first five Prediction objects.
test_pred = algo.test(testset)
test_pred[:5]

[Prediction(uid=463.0, iid='Golden Child, The (1986)', r_ui=3.0, est=2.740079747410808, details={'was_impossible': False}),
 Prediction(uid=346.0, iid='Fantasia 2000 (1999)', r_ui=1.0, est=3.6915270055126097, details={'was_impossible': False}),
 Prediction(uid=623.0, iid='Reservoir Dogs (1992)', r_ui=5.0, est=4.446181029931582, details={'was_impossible': False}),
 Prediction(uid=5.0, iid='When Harry Met Sally... (1989)', r_ui=4.0, est=4.123606694257472, details={'was_impossible': False}),
 Prediction(uid=312.0, iid='Get Shorty (1995)', r_ui=2.0, est=3.3375316555118006, details={'was_impossible': False})]

In [15]:
# Each Prediction carries a 'details' dict with a 'was_impossible' flag. The flag
# indicates whether the model was able to generate a prediction for the given
# combination of user and item.
#
# For example, the prediction for the user with ID 463 and the movie
# "Golden Child, The (1986)" was not considered impossible by the model.
#
# If the value of 'was_impossible' were True, it would mean that the model
# encountered some issue or limitation that prevented it from generating a
# prediction for that specific user-item pair. This could occur if the user or
# item is completely unknown to the model or if there is insufficient data
# available to make a reliable prediction. In that case the estimate falls back
# to the algorithm's default prediction.

In [16]:
# Root-mean-square error of the test-set predictions (lower is better).
# verbose=True prints the value; the call also returns it as a float.
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8835


0.8835316359423798

In [17]:
# Compute the prediction for the user with ID 2 and the movie "Mortal Kombat (1995)".
# The uid is written as a float to match the raw user ids shown in the test
# predictions above (e.g. uid=463.0). No true rating is passed, so r_ui is None.
algo.predict(uid=2.0, iid='Mortal Kombat (1995)')


Prediction(uid=2.0, iid='Mortal Kombat (1995)', r_ui=None, est=2.788326216064605, details={'was_impossible': False})

In [18]:
# IPython help: show the signature and docstring of algo.predict (development aid).
?algo.predict

[0;31mSignature:[0m [0malgo[0m[0;34m.[0m[0mpredict[0m[0;34m([0m[0muid[0m[0;34m,[0m [0miid[0m[0;34m,[0m [0mr_ui[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mclip[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compute the rating prediction for given user and item.

The ``predict`` method converts raw ids to inner ids and then calls the
``estimate`` method which is defined in every derived class. If the
prediction is impossible (e.g. because the user and/or the item is
unknown), the prediction is set according to
:meth:`default_prediction()
<surprise.prediction_algorithms.algo_base.AlgoBase.default_prediction>`.

Args:
    uid: (Raw) id of the user. See :ref:`this note<raw_inner_note>`.
    iid: (Raw) id of the item. See :ref:`this note<raw_inner_note>`.
    r_ui(float): The true rating :math:`r_{ui}`. Optional, default is
        ``None``.
    clip(bool): Whether to cl