In [18]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

import pandas as pd

In [2]:
# Read the two input tables. Both paths are relative, so they are resolved
# against the current working directory of the kernel.
ratings = pd.read_csv(filepath_or_buffer='ratings.csv')
movies = pd.read_csv(filepath_or_buffer='movies.csv')

In [3]:
# Preview the first five rating rows to check the columns that were loaded.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [4]:
# Number of non-null values in every column of the ratings table.
ratings.count(axis='index')

userId       27753444
movieId      27753444
rating       27753444
timestamp    27753444
dtype: int64

In [5]:
# Keep only the first 1,000,000 rating rows: fitting the k-NN model on the
# full table fails with a MemoryError (see the note after the fit cell).
ratings_1 = ratings.head(1_000_000)

In [6]:
# Attach every rating to its movie row (join on `movieId`), renumber the rows,
# then discard rows containing missing values, e.g. movies that received no
# rating inside the truncated sample.
movies_with_ratings = (
    movies
    .join(ratings_1.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [7]:
# Distinct titles rated by user 5 (userId is stored as a float after the join).
movies_with_ratings.loc[movies_with_ratings['userId'] == 5.0, 'title'].unique()

array(['Seven (a.k.a. Se7en) (1995)', 'Usual Suspects, The (1995)',
       'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)',
       'Pulp Fiction (1994)', 'Shawshank Redemption, The (1994)',
       "Schindler's List (1993)", 'Cold Comfort Farm (1995)',
       'Trainspotting (1996)', 'Godfather, The (1972)',
       'When We Were Kings (1996)', 'Sex, Lies, and Videotape (1989)',
       "One Flew Over the Cuckoo's Nest (1975)", 'Goodfellas (1990)',
       'Full Metal Jacket (1987)', 'L.A. Confidential (1997)',
       'Big Lebowski, The (1998)', 'BASEketball (1998)',
       'American History X (1998)', "She's All That (1999)",
       'Office Space (1999)', 'American Beauty (1999)',
       'Fight Club (1999)', 'Any Given Sunday (1999)',
       'Hurricane, The (1999)', 'Requiem for a Dream (2000)',
       'Snatch (2000)', 'Memento (2000)',
       "Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",
       'Super Troopers (2001)', 'Scratch (2001)', 'One Hour Photo (2002)',


In [8]:
# Three-column frame (user, item, rating) used as input for Surprise.
# The movie title serves as the item identifier.
# NOTE(review): titles are presumably unique per movie - confirm, otherwise
# distinct movies sharing a title would be merged into one item.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid'})
)

In [9]:
# Preview the first five rows of the (uid, title, rating) frame.
dataset.head(n=5)

Unnamed: 0,uid,title,rating
0,4.0,Toy Story (1995),4.0
1,10.0,Toy Story (1995),5.0
2,14.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),4.0
4,22.0,Toy Story (1995),4.0


In [10]:
# Lowest rating value in the data; used as the lower bound of the rating scale.
ratings['rating'].min()

0.5

In [11]:
# Highest rating value in the data; used as the upper bound of the rating scale.
ratings['rating'].max()

5.0

In [12]:
# The rating scale matches the min/max observed in the ratings table (0.5 .. 5.0).
reader = Reader(rating_scale=(0.5, 5.0))
# Columns are passed in the explicit order user, item, rating.
data = Dataset.load_from_df(df=dataset[['uid', 'title', 'rating']], reader=reader)

In [13]:
# Display the DataFrame held by the Surprise dataset object as a sanity check.
data.df

Unnamed: 0,uid,title,rating
0,4.0,Toy Story (1995),4.0
1,10.0,Toy Story (1995),5.0
2,14.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),4.0
4,22.0,Toy Story (1995),4.0
...,...,...,...
1035938,2768.0,The Land of Steady Habits (2018),3.5
1035940,2069.0,Climax (2018),4.5
1036022,4871.0,Empowered (2018),2.0
1036046,4226.0,Raazi (2018),4.0


In [14]:
# Hold out 15% of the ratings for evaluation.
# A fixed seed keeps the split (and therefore the reported RMSE) reproducible
# under Restart Kernel -> Run All.
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [15]:
# User-based k-NN with mean-centred ratings; similarities between users are
# computed with the Pearson-baseline measure, using up to 50 neighbours.
algo = KNNWithMeans(
    k=50,
    sim_options=dict(name='pearson_baseline', user_based=True),
)
# Left as the last expression so the notebook shows the fitted estimator.
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1d3077d0f70>

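Если использовать неурезанный датасет, обучение на данном шаге падает со следующей ошибкой (при `user_based=True` строится квадратная матрица «пользователь × пользователь», которая не помещается в память):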


MemoryError: Unable to allocate 297. GiB for an array with shape (282253, 282253) and data type int32

In [16]:
# Predict a rating for every (user, item) pair of the held-out test set.
test_pred = algo.test(testset=testset)

In [17]:
# Root-mean-square error of the hold-out predictions; printed and returned.
accuracy.rmse(predictions=test_pred, verbose=True)

RMSE: 0.8649


0.8649143204011098

In [19]:
# 5-fold cross-validation of the k-NN model.
# A fixed seed makes the fold assignment reproducible across re-runs.
kf = KFold(n_splits=5, random_state=42)

# Dedicated loop names are used so the hold-out `trainset` / `testset` created
# by the earlier train/test split are not overwritten by the last fold.
# NOTE: `algo` is re-fitted on every fold, so after this cell it holds the
# model trained on the last fold.
for fold_trainset, fold_testset in kf.split(data):
    algo.fit(fold_trainset)
    predictions = algo.test(fold_testset)
    accuracy.rmse(predictions, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8681
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8682
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8666
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8711
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8698


In [22]:
from surprise import SVD

# Matrix-factorisation baseline. SVD initialises its factors randomly, so the
# seed is fixed to make the reported RMSE values reproducible.
algo_SVD = SVD(random_state=42)

# 5-fold cross-validation with a fixed seed for the fold assignment.
kf = KFold(n_splits=5, random_state=42)

# Dedicated loop names are used so the hold-out `trainset` / `testset` created
# by the earlier train/test split are not overwritten by the last fold.
for fold_trainset, fold_testset in kf.split(data):
    algo_SVD.fit(fold_trainset)
    predictions = algo_SVD.test(fold_testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.8529
RMSE: 0.8511
RMSE: 0.8512
RMSE: 0.8511
RMSE: 0.8491
