In [32]:
from surprise import KNNWithMeans, KNNBasic, SVD, NMF, SVDpp, NormalPredictor
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise.model_selection import GridSearchCV

import pandas as pd

In [2]:
# Read the movie table and the ratings table from the lecture-1 folder.
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('../lecture-1/movies.csv', '../lecture-1/ratings.csv')
)

In [3]:
# Peek at the first five rating rows.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Attach every rating to its movie row (DataFrame.join is a left join on
# movieId by default), then discard rows containing any missing value --
# e.g. movies that received no ratings at all.
# Written as one chained expression rather than dropna(inplace=True), so the
# name is bound exactly once and never mutated afterwards.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [5]:
# Distinct titles of the movies rated by user 2.
user_mask = movies_with_ratings['userId'] == 2.0
movies_with_ratings.loc[user_mask, 'title'].unique()

array(['Shawshank Redemption, The (1994)', 'Tommy Boy (1995)',
       'Good Will Hunting (1997)', 'Gladiator (2000)',
       'Kill Bill: Vol. 1 (2003)', 'Collateral (2004)',
       'Talladega Nights: The Ballad of Ricky Bobby (2006)',
       'Departed, The (2006)', 'Dark Knight, The (2008)',
       'Step Brothers (2008)', 'Inglourious Basterds (2009)',
       'Zombieland (2009)', 'Shutter Island (2010)',
       'Exit Through the Gift Shop (2010)', 'Inception (2010)',
       'Town, The (2010)', 'Inside Job (2010)',
       'Louis C.K.: Hilarious (2010)', 'Warrior (2011)',
       'Dark Knight Rises, The (2012)',
       'Girl with the Dragon Tattoo, The (2011)',
       'Django Unchained (2012)', 'Wolf of Wall Street, The (2013)',
       'Interstellar (2014)', 'Whiplash (2014)', 'The Drop (2014)',
       'Ex Machina (2015)', 'Mad Max: Fury Road (2015)',
       'The Jinx: The Life and Deaths of Robert Durst (2015)'],
      dtype=object)

In [6]:
# Three-column frame (user id, item id, rating) that is later handed to
# Dataset.load_from_df; the movie title serves as the item id.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [7]:
# Show the first five rows of the (uid, iid, rating) frame.
dataset.iloc[:5]

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [8]:
# Lowest rating present in the data (lower bound of the rating scale).
ratings['rating'].min()

0.5

In [9]:
# Highest rating present in the data (upper bound of the rating scale).
ratings['rating'].max()

5.0

In [10]:
# Rating bounds equal the min / max observed in ratings.rating.
rating_bounds = (0.5, 5.0)
reader = Reader(rating_scale=rating_bounds)

# Wrap the (uid, iid, rating) frame as a Surprise dataset.
data = Dataset.load_from_df(dataset, reader)

In [11]:
# Hold out 15% of the ratings for testing.
# The split is seeded: without random_state every kernel restart produces a
# different train/test partition, so the RMSE values reported for the models
# in this notebook could not be reproduced or compared between runs.
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [14]:
# User-based kNN-with-means using the pearson_baseline similarity and up to
# 50 neighbours.
sim_options = {'name': 'pearson_baseline', 'user_based': True}
algo = KNNWithMeans(k=50, sim_options=sim_options)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1d8305782e8>

In [15]:
# Predict every held-out (user, item) pair with the fitted kNN model.
test_pred = algo.test(testset, verbose=False)

In [16]:
# RMSE of the kNN predictions on the test split (rmse prints by default).
accuracy.rmse(test_pred)

RMSE: 0.8924


0.8923569997897458

In [17]:
# Estimated rating of user 2 for a single title.
algo.predict(2, 'Fight Club (1999)')

Prediction(uid=2, iid='Fight Club (1999)', r_ui=None, est=4.480345079621131, details={'actual_k': 50, 'was_impossible': False})

## Домашнее задание

In [25]:
# SVD with default hyper-parameters: fit on the train split, then score the
# held-out predictions with RMSE.
algo1 = SVD().fit(trainset)
test_pred1 = algo1.test(testset)
accuracy.rmse(test_pred1)

RMSE: 0.8670


0.8670404945635107

In [24]:
# Try 5-fold cross-validation with SVD.
# A fresh SVD() is passed instead of algo1: cross_validate fits the given
# instance on each fold, which would silently replace the model that algo1
# learned from trainset.
cross_validate(SVD(), data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8751  0.8694  0.8747  0.8761  0.8747  0.8740  0.0024  
Fit time          13.38   11.31   11.51   11.76   12.10   12.01   0.73    
Test time         0.27    0.29    0.50    0.28    0.50    0.37    0.11    


{'test_rmse': array([0.87513183, 0.86936928, 0.87466725, 0.87610564, 0.87470908]),
 'fit_time': (13.375323295593262,
  11.312082290649414,
  11.507849931716919,
  11.756431579589844,
  12.098047018051147),
 'test_time': (0.26527953147888184,
  0.28616929054260254,
  0.49651336669921875,
  0.2815995216369629,
  0.497769832611084)}

In [20]:
# Try a grid search over a few SVD hyper-parameters (3-fold CV, RMSE).
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

# Best RMSE score found over the grid.
print(gs.best_score['rmse'])

0.8937495820210533


In [21]:
# Take the best grid-search configuration, fit it on the train split and
# score it on the same held-out test split as the other models.
algo2 = gs.best_estimator['rmse']
test_pred2 = algo2.fit(trainset).test(testset)
accuracy.rmse(test_pred2)

RMSE: 0.8856


0.8855905317230283

In [26]:
# It is a little surprising that plain SVD without parameters,
# cross-validation, etc. gives a slightly smaller error.
# NOTE(review): the grid search only tried n_epochs in {5, 10} and reg_all in
# {0.4, 0.6}; presumably SVD's default settings lie outside that grid, which
# would explain the result -- confirm against the Surprise SVD documentation
# and widen the grid if so.

In [31]:
# SVD++ with default hyper-parameters, evaluated like the models above.
algo3 = SVDpp().fit(trainset)
test_pred3 = algo3.test(testset)
accuracy.rmse(test_pred3)

RMSE: 0.8513


0.8512827437765051

# SVD++ позволил ещё снизить ошибку, однако это заняло около 15 минут на компьютере с Core i7 и 20 ГБ оперативной памяти, что, наверное, не очень хорошо для "урезанного" датасета.

In [33]:
# NormalPredictor, evaluated on the same train/test split for comparison.
algo4 = NormalPredictor().fit(trainset)
test_pred4 = algo4.test(testset)
accuracy.rmse(test_pred4)

RMSE: 1.4299


1.4299307080533372