> Igor Sorochan DSU-31
## Домашнее задание по теме «Коллаборативная фильтрация»


In [1]:
import os
import numpy as np
import pandas as pd
try:
    import surprise # Surprise focuses on collaborative filtering, 
    # which is a popular technique for building recommender systems based on user-item interactions.
except:
    !pip install scikit-surprise
    import surprise

from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Reader, Dataset, SVD, SVDpp, BaselineOnly, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, NMF, SlopeOne, CoClustering
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split
from surprise import accuracy
from tqdm import tqdm

![](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*fUF83M0uHXwVfSxQ66uZ3g.png)

* In the context of recommendation systems and matrix factorization, factorization refers to the process of decomposing a matrix into two lower-rank matrices. Specifically, when applying matrix factorization to a user-item interaction matrix, such as the ratings matrix in a collaborative filtering approach, the goal is to find lower-dimensional representations of users and items that capture their latent factors or features.

* The factorization process aims to discover latent factors that explain the observed user-item interactions. Each user and item is represented as a vector of latent factors, and the dot product of these vectors estimates the missing or unknown ratings in the matrix. By factorizing the matrix, we can reduce its dimensionality and approximate the original matrix with a lower-rank approximation.

* Singular Value Decomposition (SVD) is a popular matrix factorization technique used in recommendation systems. It decomposes the user-item interaction matrix into three matrices: U (user factors), Σ (singular values or weights), and V^T (item factors). By selecting a lower number of latent factors, we can approximate the original matrix using a subset of the singular values and their corresponding factors.

* The factorization process allows the model to learn the underlying patterns and relationships in the data, capturing the user preferences and item characteristics. These latent factors can then be used to predict missing ratings or make personalized recommendations for users based on their similarities to other users or items in the latent factor space.

In [2]:
# ?Dataset.load_from_file

In [3]:
# Load the movielens-1m dataset (download it if needed).
# Dataset.load_builtin(name='ml-1m', prompt=True)

In [4]:
# UserID::Gender::Age::Occupation::Zip-code
users = pd.read_csv("/Users/velo1/SynologyDrive/GIT_syno/data/ml-1m/users.dat", sep="::", 
    header=None, names=["userId", "Gender", "Age", "Occupation","Zip-code"], engine="python")
users.shape

(6040, 5)

In [5]:
ratings = pd.read_csv("/Users/velo1/SynologyDrive/GIT_syno/data/ml-1m/ratings.dat", sep="::", 
    header=None, names=["userId", "movieId", "rating", "timestamp"], engine="python")
ratings.shape


(1000209, 4)

In [6]:
min_rating = ratings.rating.min()
max_rating = ratings.rating.max()
print("min_rating: ", min_rating, "max_rating: ", max_rating)

min_rating:  1 max_rating:  5


In [7]:
# MovieID::Title::Genres
movies = pd.read_csv("/Users/velo1/SynologyDrive/GIT_syno/data/ml-1m/movies.dat", sep="::",
    header=None, names=["movieId", "Title", "Genres"], engine="python", encoding="latin-1")
print(movies.shape)
movies.head(3)


(3883, 3)


Unnamed: 0,movieId,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [8]:
reader = Reader(line_format='user item rating timestamp', sep='::', rating_scale=(min_rating, max_rating))
# reader = Reader(line_format=reader_cols, sep='::', rating_scale=(1, 5))
data = Dataset.load_from_file("/Users/velo1/SynologyDrive/GIT_syno/data/ml-1m/ratings.dat", reader=reader)
data

<surprise.dataset.DatasetAutoFolds at 0x13cee5d90>

#### Defining the best algorithm

In [101]:
algos = [
    SVD(),
    BaselineOnly(),
    KNNBasic(),
    KNNWithMeans(),
    KNNWithZScore(),
    KNNBaseline(),
    NMF(),
    SlopeOne(),
    CoClustering(),
]
results = {}
algo_results = {}
for algo in tqdm(algos, desc="Running algorithms"):
    algo_name = algo.__class__.__name__
    print(f"Algorithm {algo_name} is running...")
    algo_results[algo_name] = cross_validate(
        algo,  #  Algorithm predicting the baseline estimate for given user and item.
        data,  # data to be used for cross-validation
        verbose=False,  # print the performance metric
        cv=5,  # 5-fold cross validation
        measures=["RMSE", "MAE"],
        n_jobs=-1,  # use all available CPU cores for parallel processing.
        pre_dispatch="2*n_jobs",  # twice the number of available CPU cores.
        return_train_measures=True,  # return the train error measures.
    )

    mean_rmse = algo_results[algo_name]['test_rmse'].mean()
    results[algo_name] = {'mean_rmse': mean_rmse}

print("\nResults:")
for algo, result in results.items():
    print(f"Algorithm: {algo}")
    print(f"Mean RMSE: {result['mean_rmse']}")
    print("-------")

Running algorithms:   0%|          | 0/9 [00:00<?, ?it/s]

Algorithm SVD is running...


Running algorithms:  11%|█         | 1/9 [00:23<03:08, 23.61s/it]

Algorithm BaselineOnly is running...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Running algorithms:  22%|██▏       | 2/9 [00:40<02:19, 19.88s/it]

Algorithm KNNBasic is running...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


Running algorithms:  33%|███▎      | 3/9 [08:55<23:40, 236.80s/it]

Algorithm KNNWithMeans is running...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


Running algorithms:  44%|████▍     | 4/9 [16:51<27:35, 331.17s/it]

Algorithm KNNWithZScore is running...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


Running algorithms:  56%|█████▌    | 5/9 [25:32<26:38, 399.57s/it]

Algorithm KNNBaseline is running...
Estimating biases using als...
Computing the msd similarity matrix...
Estimating biases using als...
Computing the msd similarity matrix...
Estimating biases using als...
Computing the msd similarity matrix...
Estimating biases using als...
Computing the msd similarity matrix...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


Running algorithms:  67%|██████▋   | 6/9 [34:12<22:01, 440.58s/it]

Algorithm NMF is running...


Running algorithms:  78%|███████▊  | 7/9 [34:46<10:14, 307.44s/it]

Algorithm SlopeOne is running...


Running algorithms:  89%|████████▉ | 8/9 [38:51<04:47, 287.61s/it]

Algorithm CoClustering is running...


Running algorithms: 100%|██████████| 9/9 [39:18<00:00, 262.07s/it]


Results:
Algorithm: SVD
Mean RMSE: 0.8737926799855067
-------
Algorithm: BaselineOnly
Mean RMSE: 0.9086413089573007
-------
Algorithm: KNNBasic
Mean RMSE: 0.9227537773868983
-------
Algorithm: KNNWithMeans
Mean RMSE: 0.9293471668748319
-------
Algorithm: KNNWithZScore
Mean RMSE: 0.930665043371406
-------
Algorithm: KNNBaseline
Mean RMSE: 0.8950032989274754
-------
Algorithm: NMF
Mean RMSE: 0.9162796975027663
-------
Algorithm: SlopeOne
Mean RMSE: 0.9067378582381183
-------
Algorithm: CoClustering
Mean RMSE: 0.9156466007593915
-------





In [185]:
pd.DataFrame(results).T.sort_values("mean_rmse").style.background_gradient(cmap="coolwarm")

NameError: name 'results' is not defined

Singular value decomposition (SVD) algorithm is the best algorithm for the given dataset.  
It is a matrix factorization technique that is usually very effective for recommender systems.  
It is a collaborative filtering algorithm that decomposes the user-item matrix by keeping the most important latent features that capture the majority of the item ratings.  
It is a popular algorithm for recommender systems because it can deal with the sparsity of the user-item matrix by extracting the most important latent features related to the users and items.  

Let's try to fine tune the SVD algorithm to get the best results.

In [9]:
param_grid = {
    # "n_epochs": [100],
    # "lr_all": [0.002],
    # "reg_all": [0.4],
    "n_factors": [39],
    "random_state": [42],
    "init_mean": [0.1],
    "init_std_dev": [0.045, 0.047],
    "verbose": [False],
}
gs_svd = GridSearchCV(algo_class=SVD, param_grid=param_grid, measures=["rmse", "mae"], cv=5, return_train_measures=False,
                  n_jobs=-1, joblib_verbose=100, refit=True, pre_dispatch='2*n_jobs',)

gs_svd.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:    8.7s remaining:   35.0s
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    9.8s remaining:   22.8s
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   10.1s remaining:   15.2s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   10.9s remaining:   10.9s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   11.7s remaining:    7.8s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   12.5s remaining:    5.4s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:   13.3s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.1s finished


In [10]:
# best RMSE score
print(gs_svd.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs_svd.best_params["rmse"])

0.862735062956751
{'n_factors': 39, 'random_state': 42, 'init_mean': 0.1, 'init_std_dev': 0.047, 'verbose': False}


In [115]:
res = cross_validate(
    SVDpp(),                    # SVD is singular value decomposition (SVD) algorithm.
    data,                       # data to be used for cross-validation         
    verbose=True,               # print the performance metric
    cv=5,                       # 5-fold cross validation
    measures=["RMSE", "MAE"],
    n_jobs=-1,                  # use all available CPU cores for parallel processing.
    pre_dispatch="2*n_jobs",    # twice the number of available CPU cores.
    return_train_measures=False, # return the train error measures.
)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8607  0.8627  0.8626  0.8621  0.8584  0.8613  0.0016  
MAE (testset)     0.6714  0.6721  0.6727  0.6719  0.6691  0.6715  0.0012  
RMSE (trainset)   0.7581  0.7573  0.7590  0.7570  0.7573  0.7577  0.0007  
MAE (trainset)    0.5939  0.5932  0.5944  0.5930  0.5932  0.5936  0.0005  
Fit time          300.36  298.51  299.11  297.87  300.04  299.18  0.93    
Test time         58.61   60.41   58.85   59.09   61.06   59.60   0.96    


The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.  
Even without hyperparameter tuning, SVDpp is better than SVD on the MovieLens dataset.  

Let's try to fine tune the SVDpp algorithm to get the best results.


In [28]:
param_grid = {
    # "n_epochs": [100],
    # "lr_all": [0.002],
    # "reg_all": [0.4],
    "n_factors": [39],
    "random_state": [42],
    "init_mean": [0.1],
    "init_std_dev": [0.045],
    "verbose": [False],
}
gs_svdpp = GridSearchCV(algo_class=SVDpp, param_grid=param_grid, measures=["rmse", "mae"], cv=5, return_train_measures=False,
                  n_jobs=-1, joblib_verbose=100, refit=True, pre_dispatch='2*n_jobs',)

gs_svdpp.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


785.77s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
785.79s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
785.80s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
785.81s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
785.81s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
785.82s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
785.83s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
785.84s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
785.85s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
785.86s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
785.86s - pydevd: Sending message related to process being r

In [117]:
# best RMSE score
print(gs_svdpp.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs_svdpp.best_params["rmse"])

0.8588097586602299
{'n_factors': 39, 'random_state': 42, 'init_mean': 0.1, 'init_std_dev': 0.045, 'verbose': False}


First, we have to find a list of the movies that a particular user has not seen.

In [26]:
def give_recommendation(model, user_id, ratings, movies, n_items):
   # Get a list of all movie IDs from dataset
   movie_ids = movies["movieId"].unique()
 
   # Get a list of all movie IDs that have been watched by user
   movie_ids_user = ratings.loc[ratings["userId"] == user_id, "movieId"]
#    movie_ids_user = ratings[ratings["userId"] == user_id]["movieId"]

    # Get a list off all movie IDS that that have NOT been watched by user
    # returns an array containing the values in movie_ids that are not present in movie_ids_user.
   movie_ids_to_pred = np.setdiff1d(movie_ids, movie_ids_user) # setdiff1d: Find the set difference of two arrays.
 
   # Apply a rating of 4 to all interactions (only to match the Surprise dataset format)
   test_set = [[user_id, movie_id, 0] for movie_id in movie_ids_to_pred]
 
   # Predict the ratings and generate recommendations
   predictions = model.test(test_set)
   pred_ratings = np.array([pred.est for pred in predictions])
   print(f"Top {n_items} item recommendations for user {user_id}:")
   print(f"{'-' * 40}")
   # Rank top-n movies based on the predicted ratings
   # -pred_ratings negates all the predicted ratings. This is done because the argsort() function sorts the array 
   # in ascending order by default, but we want to find the items with the highest ratings, 
   # so we need to sort them in descending order.
   # [:n_items] slices the sorted indices to select only the first n_items indices, which represent the items 
   # with the highest predicted ratings.
   index_max = (-pred_ratings).argsort()[:n_items] 
   for i in index_max:
       movie_id = movie_ids_to_pred[i]
       print(f'{movies[movies["movieId"]==movie_id]["Title"].values[0]:<65}:{pred_ratings[i]:.2f}')
 


In [27]:
# define which user ID that we want to give recommendation
userID = 4169
# define how many top-n movies that we want to recommend
n_items = 10
# generate recommendation using the model that we have trained
give_recommendation(gs_svd,userID,ratings,movies,n_items)

Top 10 item recommendations for user 4169:
----------------------------------------
Toy Story (1995)                                                 :3.58
Twice Upon a Yesterday (1998)                                    :3.58
Loss of Sexual Innocence, The (1999)                             :3.58
Eternity and a Day (Mia eoniotita ke mia mera ) (1998)           :3.58
It Conquered the World (1956)                                    :3.58
Flying Saucer, The (1950)                                        :3.58
Howling II: Your Sister Is a Werewolf (1985)                     :3.58
Curse of Frankenstein, The (1957)                                :3.58
Dracula (1958)                                                   :3.58
Mummy's Ghost, The (1944)                                        :3.58


In [None]:
gs.predict(4169, 1)

In [23]:
movie_ids_user = ratings[ratings["userId"] == 4169]["movieId"]
# movies[movies["movieId"].isin(movie_ids_user)]["Title"]
movie_ids_user

695642    3789
695643     571
695644     574
695645     575
695646     577
          ... 
697951    3784
697952    3785
697953    2047
697954    3788
697955    2049
Name: movieId, Length: 2314, dtype: int64

In [18]:
movie_ids_user = ratings.loc[ratings["userId"] == 4169, "movieId"]
movie_ids_user

695642    3789
695643     571
695644     574
695645     575
695646     577
          ... 
697951    3784
697952    3785
697953    2047
697954    3788
697955    2049
Name: movieId, Length: 2314, dtype: int64

In [223]:
ratings[ratings["userId"] == 4169].sort_values(by="rating", ascending=False)

Unnamed: 0,userId,movieId,rating,timestamp
695642,4169,3789,5,965333672
697686,4169,2732,5,971579309
696096,4169,265,5,971582264
697334,4169,475,5,975803853
697335,4169,477,5,973310837
...,...,...,...,...
696591,4169,1924,1,971580320
696110,4169,1884,1,976588147
695927,4169,2450,1,971581624
697817,4169,519,1,978663954


In [219]:
ratings.groupby("userId")["rating"].count().sort_values(ascending=False).head(10)

userId
4169    2314
1680    1850
4277    1743
1941    1595
1181    1521
889     1518
3618    1344
2063    1323
1150    1302
1015    1286
Name: rating, dtype: int64

In [231]:
movies.shape

(3883, 3)