/
test_dataset.py
246 lines (193 loc) · 8.31 KB
/
test_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
"""
Module for testing the Dataset class.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import os
import random
import pytest
import pandas as pd
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.builtin_datasets import get_dataset_dir
# Seed Python's global RNG once at import time. Presumably this keeps the
# shuffled folds checked by the tests below reproducible from run to run
# -- TODO confirm that Surprise shuffles through the `random` module.
random.seed(1)
def test_wrong_file_name():
    """Loading folds from files that do not exist must raise ValueError."""
    # A single (train, test) pair whose two paths point at nothing.
    missing_fold = ('does_not_exist', 'does_not_either')

    with pytest.raises(ValueError):
        Dataset.load_from_folds(
            folds_files=[missing_fold],
            reader=Reader(),
            rating_scale=(1, 5),
        )
def test_build_full_trainset(toy_data):
    """Check the sizes of the trainset built from the whole toy dataset."""
    full = toy_data.build_full_trainset()

    # The per-user and per-item rating maps must agree with the counters
    # exposed by the trainset: 5 users and 2 items.
    assert (len(full.ur), len(full.ir)) == (5, 2)
    assert (full.n_users, full.n_items) == (5, 2)
def test_no_call_to_split(toy_data):
    """Check the default behaviour of folds() when split() was never called.

    As stated in the split() docstring, the data is then shuffled and split
    into 5 folds. Iterating over the folds is expected to emit a UserWarning.
    """
    with pytest.warns(UserWarning):
        n_folds = len(list(toy_data.folds()))
        assert n_folds == 5

    # Collect, for every fold, the trailing digit of the user id of the first
    # testset entry. Without shuffling these would come out as 0, 1, 2... 4
    # (in that order), so any other ordering shows the data was shuffled.
    with pytest.warns(UserWarning):
        first_users = []
        for _, fold_testset in toy_data.folds():
            raw_uid = fold_testset[0][0]
            first_users.append(int(raw_uid[-1]))

    assert first_users != list(range(5))
def test_split(toy_data):
    """Exercise split(): the shuffle flag, fold stability and n_folds bounds.

    Every call to split() / folds() is wrapped in pytest.warns because these
    calls are expected to emit a UserWarning.
    """

    def collect_testsets():
        # Materialise the testset of every fold currently defined on toy_data.
        return [fold_testset for (_, fold_testset) in toy_data.folds()]

    # shuffle=False: the users of the testsets must come out as 0, 1, 2... 4
    # (in that order), i.e. in the original order of the data.
    with pytest.warns(UserWarning):
        toy_data.split(n_folds=5, shuffle=False)
        first_users = [int(fold_testset[0][0][-1])
                       for fold_testset in collect_testsets()]
    assert first_users == list(range(5))

    # shuffle=False twice in a row: both splits must yield the same folds.
    with pytest.warns(UserWarning):
        toy_data.split(n_folds=3, shuffle=False)
        unshuffled_first = collect_testsets()
        toy_data.split(n_folds=3, shuffle=False)
        unshuffled_second = collect_testsets()
    assert unshuffled_first == unshuffled_second

    # shuffle=True: the folds must now differ from the unshuffled ones.
    with pytest.warns(UserWarning):
        toy_data.split(n_folds=3, shuffle=True)
        shuffled = collect_testsets()
    assert unshuffled_first != shuffled

    # Without a new call to split(), iterating twice gives identical folds.
    with pytest.warns(UserWarning):
        first_pass = collect_testsets()
        second_pass = collect_testsets()
    assert first_pass == second_pass

    # n_folds given positionally is honoured.
    with pytest.warns(UserWarning):
        toy_data.split(5)
        assert len(list(toy_data.folds())) == 5

    # Too big (greater than number of ratings)
    with pytest.raises(ValueError):
        toy_data.split(10)

    # Too low (must be >= 2)
    with pytest.raises(ValueError):
        toy_data.split(1)
def test_trainset_testset(toy_data_reader):
    """Check the trainset and testsets built from the custom fold files.

    The folds are read from the 'custom_train' and 'custom_test' files that
    sit next to this module. The test then inspects the trainset structures
    (ur, ir, counters, id mappings) and the testsets derived from it.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    train_path = here + '/custom_train'
    test_path = here + '/custom_test'

    data = Dataset.load_from_folds(
        folds_files=[(train_path, test_path)],
        reader=toy_data_reader,
        rating_scale=(1, 5),
    )

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # Ratings grouped by user.
    user_ratings = trainset.ur
    assert user_ratings[0] == [(0, 4)]
    assert user_ratings[1] == [(0, 4), (1, 2)]
    assert user_ratings[40] == []  # not in the trainset

    # Ratings grouped by item.
    item_ratings = trainset.ir
    assert item_ratings[0] == [(0, 4), (1, 4), (2, 1)]
    assert item_ratings[1] == [(1, 2), (2, 1), (3, 5)]
    assert item_ratings[20000] == []  # not in the trainset

    # Global counters and rating scale.
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # Raw id -> inner id, for users then for items. Unknown raw ids must be
    # rejected with a ValueError.
    for inner_uid in range(4):
        raw_uid = 'user' + str(inner_uid)
        assert trainset.to_inner_uid(raw_uid) == inner_uid
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unkown_user')

    for inner_iid in range(2):
        raw_iid = 'item' + str(inner_iid)
        assert trainset.to_inner_iid(raw_iid) == inner_iid
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unkown_item')

    # Inner id -> raw id. The reverse mappings must still be unset before the
    # first to_raw_* call and populated afterwards, so the order of the
    # statements below matters.
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for inner_uid in range(4):
        assert trainset.to_raw_uid(inner_uid) == 'user' + str(inner_uid)
    for inner_iid in range(2):
        assert trainset.to_raw_iid(inner_iid) == 'item' + str(inner_iid)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # build_testset(): the known ratings, expressed with raw ids.
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # build_anti_testset(): only the (user, item) pairs that are NOT rated in
    # the trainset, filled with the global mean.
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    mean = trainset.global_mean
    assert ('user0', 'item0', mean) not in testset
    assert ('user3', 'item1', mean) not in testset
    assert ('user0', 'item1', mean) in testset
    assert ('user3', 'item0', mean) in testset
def test_load_form_df():
    """Check that a dataset can be loaded from a pandas dataframe."""
    # Five ratings; the user ids deliberately mix ints and a string.
    df = pd.DataFrame({'itemID': [1, 1, 1, 2, 2],
                       'userID': [9, 32, 2, 45, '10000'],
                       'rating': [3, 2, 4, 3, 1]})
    ordered_columns = ['userID', 'itemID', 'rating']
    data = Dataset.load_from_df(df[ordered_columns], rating_scale=(1, 5))

    # split() and folds() must be usable on a dataframe-backed dataset.
    with pytest.warns(UserWarning):
        data.split(2)
        assert len(list(data.folds())) == 2

    # Users and items must be reachable through their raw ids.
    trainset = data.build_full_trainset()
    assert trainset.knows_user(trainset.to_inner_uid(9))
    assert trainset.knows_user(trainset.to_inner_uid('10000'))
    assert trainset.knows_item(trainset.to_inner_iid(2))

    # r(9, 1) = 3 and r(2, 1) = 4
    inner_user_9 = trainset.to_inner_uid(9)
    inner_user_2 = trainset.to_inner_uid(2)
    inner_item_1 = trainset.to_inner_iid(1)
    assert trainset.ur[inner_user_9] == [(inner_item_1, 3)]
    assert trainset.ur[inner_user_2] == [(inner_item_1, 4)]

    # With the columns in the wrong order the 'rating' column is read as the
    # user ids, so the raw user id '10000' can no longer be found.
    swapped_columns = ['rating', 'itemID', 'userID']
    data = Dataset.load_from_df(df[swapped_columns], rating_scale=(1, 5))
    trainset = data.build_full_trainset()
    with pytest.raises(ValueError):
        trainset.to_inner_uid('10000')
def test_build_anti_testset():
    """Check the fill value and the size of build_anti_testset()'s output."""
    # Nine ratings: user k rates item k with the value k.
    ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    df = pd.DataFrame({'itemID': list(ids),
                       'userID': list(ids),
                       'rating': list(ids)})
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))

    with pytest.warns(UserWarning):
        data.split(2)
        trainset, __testset = next(data.folds())

    # An explicit fill value must be used for every entry.
    for fillvalue in (0, 42., -1):
        anti = trainset.build_anti_testset(fill=fillvalue)
        assert all(rating == fillvalue for (_, _, rating) in anti)

    # fill=None falls back to the global mean of the trainset.
    anti = trainset.build_anti_testset(fill=None)
    assert all(rating == trainset.global_mean for (_, _, rating) in anti)

    # Known ratings plus anti-testset entries cover every (user, item) pair.
    expect = trainset.n_users * trainset.n_items
    assert trainset.n_ratings + len(anti) == expect
def test_get_dataset_dir():
    '''Test the get_dataset_dir() function.

    The function must return the folder named by the SURPRISE_DATA_FOLDER
    environment variable when it is set, and '~/.surprise_data/' (expanded)
    otherwise.

    The variable is saved before the test and restored in a ``finally``
    clause, so that neither a pre-existing value nor a failing assertion
    leaves the process environment altered for the tests that run afterwards.
    '''
    env_var = 'SURPRISE_DATA_FOLDER'
    saved_value = os.environ.get(env_var)  # None when the variable is unset

    try:
        os.environ[env_var] = '/tmp/surprise_data'
        assert get_dataset_dir() == '/tmp/surprise_data'

        # Fall back to default
        del os.environ[env_var]
        assert get_dataset_dir() == os.path.expanduser('~' + '/.surprise_data/')
    finally:
        # Put the environment back exactly as it was found.
        if saved_value is None:
            os.environ.pop(env_var, None)
        else:
            os.environ[env_var] = saved_value