-
Notifications
You must be signed in to change notification settings - Fork 1k
/
split_data_for_unbiased_estimation.py
53 lines (40 loc) · 1.53 KB
/
split_data_for_unbiased_estimation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""
This module descibes how to split a dataset into two parts A and B: A is for
tuning the algorithm parameters, and B is for having an unbiased estimation of
its performances. The tuning is done by Grid Search.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import random
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV
# Load the full dataset.
data = Dataset.load_builtin('ml-100k')
raw_ratings = data.raw_ratings
# shuffle ratings if you want
random.shuffle(raw_ratings)
# A = 90% of the data, B = 10% of the data
threshold = int(.9 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]
data.raw_ratings = A_raw_ratings # data is now the set A
# Select your best algo with grid search.
print('Grid Search...')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
grid_search.fit(data)
algo = grid_search.best_estimator['rmse']
# retrain on the whole set A
trainset = data.build_full_trainset()
algo.fit(trainset)
# Compute biased accuracy on A
predictions = algo.test(trainset.build_testset())
print('Biased accuracy on A,', end=' ')
accuracy.rmse(predictions)
# Compute unbiased accuracy on B
testset = data.construct_testset(B_raw_ratings) # testset is now the set B
predictions = algo.test(testset)
print('Unbiased accuracy on B,', end=' ')
accuracy.rmse(predictions)