> Igor Sorochan DSU-31
## Домашнее задание по теме «Коллаборативная фильтрация»


In [1]:
import os
import numpy as np
import pandas as pd
try:
    import surprise # Surprise focuses on collaborative filtering, 
    # which is a popular technique for building recommender systems based on user-item interactions.
except:
    !pip install scikit-surprise
    import surprise

from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Reader, Dataset, SVD, SVDpp, BaselineOnly, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, NMF, SlopeOne, CoClustering
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split
from surprise import accuracy
from tqdm import tqdm

![](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*fUF83M0uHXwVfSxQ66uZ3g.png)

* In the context of recommendation systems and matrix factorization, factorization refers to the process of decomposing a matrix into two lower-rank matrices. Specifically, when applying matrix factorization to a user-item interaction matrix, such as the ratings matrix in a collaborative filtering approach, the goal is to find lower-dimensional representations of users and items that capture their latent factors or features.

* The factorization process aims to discover latent factors that explain the observed user-item interactions. Each user and item is represented as a vector of latent factors, and the dot product of these vectors estimates the missing or unknown ratings in the matrix. By factorizing the matrix, we can reduce its dimensionality and approximate the original matrix with a lower-rank approximation.

* Singular Value Decomposition (SVD) is a popular matrix factorization technique used in recommendation systems. It decomposes the user-item interaction matrix into three matrices: U (user factors), Σ (singular values or weights), and V^T (item factors). By selecting a lower number of latent factors, we can approximate the original matrix using a subset of the singular values and their corresponding factors.

* The factorization process allows the model to learn the underlying patterns and relationships in the data, capturing the user preferences and item characteristics. These latent factors can then be used to predict missing ratings or make personalized recommendations for users based on their similarities to other users or items in the latent factor space.

In [2]:
# ?Dataset.load_from_file

In [3]:
# Load the movielens-1m dataset (download it if needed).
# Dataset.load_builtin(name='ml-1m', prompt=True)

In [4]:
# users.dat has no header row; its fields are UserID::Gender::Age::Occupation::Zip-code.
# The multi-character "::" separator requires the python parsing engine.
# NOTE(review): absolute local path — the notebook only runs on a machine with this layout.
users = pd.read_csv(
    "/Users/velo1/SynologyDrive/GIT_syno/data/ml-1m/users.dat",
    sep="::",
    header=None,
    names=["userId", "Gender", "Age", "Occupation", "Zip-code"],
    engine="python",
)
users

Unnamed: 0,userId,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [5]:
# ratings.dat has no header row; the four "::"-separated fields are named here.
# NOTE(review): absolute local path — the notebook only runs on a machine with this layout.
ratings = pd.read_csv(
    "/Users/velo1/SynologyDrive/GIT_syno/data/ml-1m/ratings.dat",
    sep="::",
    header=None,
    names=["userId", "movieId", "rating", "timestamp"],
    engine="python",
)
ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [6]:
# Smallest and largest rating present in the data.
min_rating, max_rating = ratings["rating"].min(), ratings["rating"].max()
print("min_rating: ", min_rating, "max_rating: ", max_rating)

min_rating:  1 max_rating:  5


### 1. Load data from custom DataFrame 

In [7]:
# Dataset.load_from_df expects exactly three columns in the order user, item, rating
# and needs only the reader's rating_scale; file-parsing options such as line_format
# and sep apply to load_from_file and are therefore not passed here.
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x145efb910>

In [8]:
# movies.dat has no header row; its fields are MovieID::Title::Genres.
# The file is decoded as latin-1.
# NOTE(review): absolute local path — the notebook only runs on a machine with this layout.
movies = pd.read_csv(
    "/Users/velo1/SynologyDrive/GIT_syno/data/ml-1m/movies.dat",
    sep="::",
    header=None,
    names=["movieId", "Title", "Genres"],
    engine="python",
    encoding="latin-1",
)
print(movies.shape)
movies.head(3)


(3883, 3)


Unnamed: 0,movieId,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


### 2. Load data from MovieLens original dataset

In [9]:
# # Pay attention to the actual columns names in the data source file and the names in the code below!
# reader = Reader(line_format='user item rating timestamp', sep='::', rating_scale=(min_rating, max_rating))
# data = Dataset.load_from_file("/Users/velo1/SynologyDrive/GIT_syno/data/ml-1m/ratings.dat", reader=reader)
# data

#### Defining the best algorithm

In [10]:
# Candidate algorithms, each created with its default hyperparameters.
algos = [
    SVD(),
    BaselineOnly(),
    KNNBasic(),
    KNNWithMeans(),
    KNNWithZScore(),
    KNNBaseline(),
    NMF(),
    SlopeOne(),
    CoClustering(),
]

# Settings shared by every cross-validation run.
cv_options = dict(
    verbose=False,               # do not print the per-split measures
    cv=5,                        # 5-fold cross-validation
    measures=["RMSE", "MAE"],
    n_jobs=-1,                   # use all available CPU cores
    pre_dispatch="2*n_jobs",     # controls how many jobs are dispatched ahead of execution
    return_train_measures=True,  # also compute the measures on the training folds
)

results = {}       # algorithm name -> {'mean_rmse': mean test RMSE over the folds}
algo_results = {}  # algorithm name -> full dict returned by cross_validate
for algo in tqdm(algos, desc="Running algorithms"):
    algo_name = type(algo).__name__
    print(f"Algorithm {algo_name} is running...")
    cv_scores = cross_validate(algo, data, **cv_options)
    algo_results[algo_name] = cv_scores
    mean_rmse = cv_scores["test_rmse"].mean()
    results[algo_name] = {"mean_rmse": mean_rmse}

# Plain-text summary, in the order the algorithms were run.
print("\nResults:")
for name, summary in results.items():
    print(f"Algorithm: {name}", f"Mean RMSE: {summary['mean_rmse']}", "-------", sep="\n")

Running algorithms:   0%|          | 0/9 [00:00<?, ?it/s]

Algorithm SVD is running...


Running algorithms:  11%|█         | 1/9 [00:24<03:17, 24.74s/it]

Algorithm BaselineOnly is running...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Running algorithms:  22%|██▏       | 2/9 [00:41<02:21, 20.16s/it]

Algorithm KNNBasic is running...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


Running algorithms:  33%|███▎      | 3/9 [07:59<21:05, 210.89s/it]

Algorithm KNNWithMeans is running...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


Running algorithms:  44%|████▍     | 4/9 [15:58<26:22, 316.55s/it]

Algorithm KNNWithZScore is running...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


Running algorithms:  56%|█████▌    | 5/9 [23:57<25:01, 375.41s/it]

Algorithm KNNBaseline is running...
Estimating biases using als...
Computing the msd similarity matrix...
Estimating biases using als...
Computing the msd similarity matrix...
Estimating biases using als...
Computing the msd similarity matrix...
Estimating biases using als...
Computing the msd similarity matrix...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


Running algorithms:  67%|██████▋   | 6/9 [32:05<20:40, 413.41s/it]

Algorithm NMF is running...


Running algorithms:  78%|███████▊  | 7/9 [32:35<09:36, 288.28s/it]

Algorithm SlopeOne is running...


Running algorithms:  89%|████████▉ | 8/9 [36:23<04:29, 269.06s/it]

Algorithm CoClustering is running...


Running algorithms: 100%|██████████| 9/9 [36:52<00:00, 245.85s/it]


Results:
Algorithm: SVD
Mean RMSE: 0.8738138290292629
-------
Algorithm: BaselineOnly
Mean RMSE: 0.908669611756055
-------
Algorithm: KNNBasic
Mean RMSE: 0.9227669044457905
-------
Algorithm: KNNWithMeans
Mean RMSE: 0.9292157039869107
-------
Algorithm: KNNWithZScore
Mean RMSE: 0.9303767954613864
-------
Algorithm: KNNBaseline
Mean RMSE: 0.8949845224608657
-------
Algorithm: NMF
Mean RMSE: 0.9162409553178584
-------
Algorithm: SlopeOne
Mean RMSE: 0.9066256907624094
-------
Algorithm: CoClustering
Mean RMSE: 0.9165630008332254
-------





In [11]:
# One row per algorithm, lowest mean RMSE first, colour-coded by value.
(
    pd.DataFrame(results)
    .transpose()
    .sort_values("mean_rmse")
    .style.background_gradient(cmap="coolwarm")
)

Unnamed: 0,mean_rmse
SVD,0.873814
KNNBaseline,0.894985
SlopeOne,0.906626
BaselineOnly,0.90867
NMF,0.916241
CoClustering,0.916563
KNNBasic,0.922767
KNNWithMeans,0.929216
KNNWithZScore,0.930377


The singular value decomposition (SVD) algorithm achieves the lowest mean RMSE (0.8738) of the nine algorithms compared on this dataset.  
It is a matrix factorization technique that is usually very effective for recommender systems.  
It is a collaborative filtering algorithm that decomposes the user-item matrix by keeping the most important latent features that capture the majority of the item ratings.  
It is a popular algorithm for recommender systems because it can deal with the sparsity of the user-item matrix by extracting the most important latent features related to the users and items.  

Let's try to fine tune the SVD algorithm to get the best results.

In [12]:
# Search space for SVD. Every hyperparameter has a single candidate value, so the
# "grid" contains exactly one combination, scored with 5-fold cross-validation.
# Further candidates that were tried and left disabled:
#   "n_epochs": [100], "lr_all": [0.002], "reg_all": [0.4]
param_grid = dict(
    n_factors=[39],
    random_state=[42],
    init_mean=[0.1],
    init_std_dev=[0.047],
    verbose=[False],
)
gs_svd = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=["rmse", "mae"],
    cv=5,
    return_train_measures=False,
    n_jobs=-1,
    joblib_verbose=100,
    refit=True,  # gs_svd.predict() is called later in the notebook
    pre_dispatch="2*n_jobs",
)

gs_svd.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    7.8s remaining:   11.7s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    8.8s remaining:    5.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.7s finished


In [13]:
# Report the best cross-validated RMSE of the SVD search and the parameters behind it.
print("Best RMSE score is", gs_svd.best_score["rmse"])
print(
    "Best combination of parameters that gave the best RMSE score:",
    gs_svd.best_params["rmse"],
    sep="\n",
)

Best RMSE score is 0.8630002483898362
Best combination of parameters that gave the best RMSE score:
{'n_factors': 39, 'random_state': 42, 'init_mean': 0.1, 'init_std_dev': 0.047, 'verbose': False}


In [14]:
# Cross-validate SVDpp with its default hyperparameters.
res = cross_validate(
    SVDpp(),
    data,
    measures=["RMSE"],            # MAE is not computed in this run
    cv=5,                         # 5-fold cross-validation
    n_jobs=-1,                    # use all available CPU cores
    pre_dispatch="2*n_jobs",      # controls how many jobs are dispatched ahead of execution
    return_train_measures=False,  # skip the measures on the training folds
    verbose=True,                 # print the per-fold table
)

Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8626  0.8631  0.8645  0.8609  0.8622  0.8626  0.0012  
Fit time          379.25  374.29  374.50  374.94  376.85  375.97  1.88    
Test time         70.59   70.05   68.16   66.86   69.93   69.12   1.39    


The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.  
Even without hyperparameter tuning, SVDpp is better than SVD on the MovieLens dataset.  

Let's try to fine tune the SVDpp algorithm to get the best results.


In [15]:
# Search space for SVDpp. Every hyperparameter has a single candidate value, so the
# "grid" contains exactly one combination, scored with 5-fold cross-validation.
# Further candidates that were tried and left disabled:
#   "n_epochs": [100], "lr_all": [0.002], "reg_all": [0.4]
param_grid = dict(
    n_factors=[39],
    random_state=[42],
    init_mean=[0.1],
    init_std_dev=[0.045],
    verbose=[False],
)
gs_svdpp = GridSearchCV(
    algo_class=SVDpp,
    param_grid=param_grid,
    measures=["rmse", "mae"],
    cv=5,
    return_train_measures=False,
    n_jobs=-1,
    joblib_verbose=100,
    refit=True,  # gs_svdpp is passed to give_recommendation later, which calls .test()
    pre_dispatch="2*n_jobs",
)

gs_svdpp.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 12.6min remaining: 19.0min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 12.7min remaining:  8.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 12.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 12.7min finished


## I bet we can get better results with the SVDpp algorithm.  
## Could we really get RMSE under 0.86?  
## Take a look ...

In [20]:
# Best cross-validated RMSE found by the SVDpp search.
print("Best RMSE score is", gs_svdpp.best_score["rmse"])


Best RMSE score is 0.8593545531308449


In [21]:
# Parameters behind the best RMSE of the SVDpp search.
print(
    "Best combination of parameters that gave the best RMSE score:",
    gs_svdpp.best_params["rmse"],
    sep="\n",
)

Best combination of parameters that gave the best RMSE score:
{'n_factors': 39, 'random_state': 42, 'init_mean': 0.1, 'init_std_dev': 0.045, 'verbose': False}


### OK. Fine!   Let's make some recommendations

In [17]:
def give_recommendation(model, user_id, ratings, movies, users, n_items):
    """Print the ``n_items`` best-predicted movies that ``user_id`` has not rated.

    Every movie listed in ``movies`` without a row for ``user_id`` in ``ratings``
    is scored with ``model.test``; the highest predicted ratings are printed
    together with the movie titles.

    Args:
        model: Fitted object exposing ``test(testset)``; each prediction it
            returns must provide an ``est`` attribute.
        user_id: Value matched against the ``userId`` columns.
        ratings: DataFrame with ``userId`` and ``movieId`` columns.
        movies: DataFrame with ``movieId`` and ``Title`` columns.
        users: DataFrame with ``userId``, ``Age`` and ``Gender`` columns.
        n_items: Maximum number of recommendations to print.

    Returns:
        None. The recommendations are written to stdout.

    Raises:
        IndexError: If ``user_id`` does not occur in ``users``. The check runs
            before any prediction is computed.
    """
    # Resolve the user once, up front, so an unknown id fails fast and clearly
    # instead of after the whole catalogue has been scored.
    user_rows = users.loc[users["userId"] == user_id]
    if user_rows.empty:
        raise IndexError(f"user_id {user_id} not found in users")
    u_age = user_rows["Age"].values[0]
    u_gender = user_rows["Gender"].values[0]

    # Candidates: all known movie ids minus the ones this user already rated.
    movie_ids = movies["movieId"].unique()
    movie_ids_user = ratings.loc[ratings["userId"] == user_id, "movieId"]
    movie_ids_to_pred = np.setdiff1d(movie_ids, movie_ids_user)

    # The third field is a placeholder rating (only the (user, item, rating)
    # triple shape matters here); just the estimate `est` is read back.
    test_set = [[user_id, movie_id, 4] for movie_id in movie_ids_to_pred]
    predictions = model.test(test_set)
    pred_ratings = np.array([pred.est for pred in predictions])

    print(f"Top {n_items} item recommendations for user_id {user_id} (age:{u_age}, gender:{u_gender}):")
    print(f"{'-' * 40}")

    # movieId -> Title lookup built once; the first title wins for a duplicated
    # id, which is what taking `.values[0]` of a filtered frame would return.
    titles = movies.drop_duplicates("movieId").set_index("movieId")["Title"]

    # argsort is ascending, so sort the negated scores to rank best-first.
    # A stable sort keeps equal scores in ascending movieId order.
    index_max = (-pred_ratings).argsort(kind="stable")[:n_items]
    for i in index_max:
        movie_id = movie_ids_to_pred[i]
        print(f'{titles.loc[movie_id]:<65}:{pred_ratings[i]:.2f}')


### Let's define recommendations for a particular user.

In [18]:
# Target user and the number of top-ranked movies to list.
userID = 23
n_items = 10

# Produce the recommendations with the SVDpp grid-search object.
give_recommendation(
    model=gs_svdpp,
    user_id=userID,
    ratings=ratings,
    movies=movies,
    users=users,
    n_items=n_items,
)

Top 10 item recommendations for user_id 23 (age:35, gender:M):
----------------------------------------
Go (1999)                                                        :4.29
Raiders of the Lost Ark (1981)                                   :4.28
Young Frankenstein (1974)                                        :4.21
High Fidelity (2000)                                             :4.17
South Park: Bigger, Longer and Uncut (1999)                      :4.13
Monty Python and the Holy Grail (1974)                           :4.08
Best in Show (2000)                                              :4.06
Who Framed Roger Rabbit? (1988)                                  :4.06
American Werewolf in London, An (1981)                           :4.06
Toy Story 2 (1999)                                               :4.05


### Let's predict a rating for a particular user and a particular movie.

In [19]:
# Target of the single prediction: one user and one movie.
userID = 23
movie_to_predict = 1

# Title and user attributes shown alongside the prediction.
title = movies.loc[movies["movieId"] == movie_to_predict, "Title"].values[0]
is_target_user = users["userId"] == userID
u_age = users.loc[is_target_user, "Age"].values[0]
u_gender = users.loc[is_target_user, "Gender"].values[0]

# NOTE(review): this predicts with gs_svd, while the recommendation cell above
# passes gs_svdpp — confirm which model is intended here.
prediction = gs_svd.predict(userID, movie_to_predict)
print(f"Predicted rating by user_id:{userID} (age:{u_age}, gender:{u_gender}):")
print(f"for movie '{title}' is {prediction.est:.2f}")

Predicted rating by user_id:23 (age:35, gender:M):
for movie 'Toy Story (1995)' is 3.86
