<a href="https://colab.research.google.com/github/Hristo2076/RecSys/blob/main/RecSys_SURPRISE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Пакет SURPRISE:**

1. используйте данные MovieLens 1M,
2. можно использовать любые модели из пакета,
3. получите RMSE на тестовом сете 0,87 и ниже.

In [None]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF

from surprise import SlopeOne, CoClustering
from surprise import accuracy
from surprise.model_selection import train_test_split

In [None]:
# Load the movie metadata table from the working directory.
movies = pd.read_csv('movies.csv')
# Keep only the first 500,000 ratings. nrows stops parsing at that point
# instead of reading the whole file and slicing the result afterwards.
ratings = pd.read_csv('ratings.csv', nrows=500000)

In [None]:
# Attach movie metadata to every rating (join on movieId), then keep only
# the rows that have no missing values. Built as one chain so re-running the
# cell always starts from the untouched `movies` / `ratings` frames.
movies_with_ratings = (
    movies
    .merge(ratings, on='movieId')
    .reset_index(drop=True)
    .dropna()
)
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,4.0,1113766000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,5.0,948885800.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.5,1442169000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,4.0,1370810000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,22,4.0,1237623000.0


In [None]:
# Sanity check: (rows, columns) of the merged ratings/movies frame.
movies_with_ratings.shape

(500000, 6)

In [None]:
# Three-column frame in (user, item, rating) order for Dataset.load_from_df.
# Items are identified by their title string rather than by movieId.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [None]:
# Declare the rating range (0.5 to 5.0) and wrap the frame as a Surprise dataset.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [None]:
# Hold out 20% of the ratings for evaluation; the fixed random_state keeps
# the split repeatable between runs.
trainset, testset = train_test_split(data, test_size=0.2, random_state=1)

In [None]:
# User-based neighbourhood model: similarities are computed between users
# with the cosine measure, and k=50 is passed as the neighbour count.
user_cosine = {'name': 'cosine', 'user_based': True}
algo = KNNWithMeans(k=50, sim_options=user_cosine)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f787766b9d0>

In [None]:
# Predict every held-out rating, then report the RMSE of those predictions.
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.9010


0.901041745499666

In [None]:
# Estimate the rating user 2 would give this film; item ids are title strings
# because the dataset was built with `title` as the item column.
algo.predict(uid=2, iid='Fight Club (1999)')

Prediction(uid=2, iid='Fight Club (1999)', r_ui=None, est=4.306373599711982, details={'actual_k': 50, 'was_impossible': False})

In [None]:
# Benchmark each Surprise algorithm with its default hyper-parameters using
# 5-fold cross-validation. One summary row per algorithm is collected in
# `benchmark`: the fold-averaged metrics plus the algorithm's class name.
benchmark = []

algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
    print("Starting: " ,str(algorithm))

    results = cross_validate(algorithm, data, measures=['RMSE'], cv=5, verbose=False)

    # Average every reported metric across the folds.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # Series.append is deprecated and was removed in pandas 2.0; pd.concat is
    # the supported replacement. type(...).__name__ gives the class name
    # directly instead of parsing it out of the object's repr string.
    tmp = pd.concat([tmp, pd.Series([type(algorithm).__name__], index=['Algorithm'])])
    benchmark.append(tmp)
    print("Done: " ,str(algorithm), "\n\n")

print ('\n\tDONE\n')

Attempting:  [<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f7873d30eb0>, <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7f7873d33700>, <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7f7873d33610>, <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7f7873d33640>, <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x7f7873d33670>, <surprise.prediction_algorithms.knns.KNNBaseline object at 0x7f7873d32140>, <surprise.prediction_algorithms.knns.KNNBasic object at 0x7f7873d32ef0>, <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7f7873d31030>, <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7f7873d31090>, <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x7f7873d32920>, <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x7f7873d32ec0>] 



Starting:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f7873d

  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f7873d30eb0> 


Starting:  <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7f7873d33700>


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7f7873d33700> 


Starting:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7f7873d33610>


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7f7873d33610> 


Starting:  <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7f7873d33640>


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7f7873d33640> 


Starting:  <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x7f7873d33670>


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x7f7873d33670> 


Starting:  <surprise.prediction_algorithms.knns.KNNBaseline object at 0x7f7873d32140>
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.knns.KNNBaseline object at 0x7f7873d32140> 


Starting:  <surprise.prediction_algorithms.knns.KNNBasic object at 0x7f7873d32ef0>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.knns.KNNBasic object at 0x7f7873d32ef0> 


Starting:  <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7f7873d31030>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7f7873d31030> 


Starting:  <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7f7873d31090>
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7f7873d31090> 


Starting:  <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x7f7873d32920>
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Done:  <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x7f7873d32920> 


Starting:  <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x7f7873d32ec0>
Done:  <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x7f7873d32ec0> 



	DONE



  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


In [None]:
# One row per algorithm, ordered from lowest to highest cross-validated RMSE.
surprise_results = (
    pd.DataFrame(benchmark)
    .set_index('Algorithm')
    .sort_values(by='test_rmse')
)

surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.851476,615.860782,65.236866
SVD,0.865239,9.749089,1.316486
KNNBaseline,0.875904,7.383131,28.045306
BaselineOnly,0.883924,2.698687,1.299993
SlopeOne,0.895093,30.072393,47.37772
KNNWithMeans,0.898221,6.333335,28.664313
KNNWithZScore,0.89848,6.278623,29.59177
NMF,0.907966,15.353377,1.487821
CoClustering,0.931438,14.136545,1.437618
KNNBasic,0.933817,5.669361,28.174969
