-
Notifications
You must be signed in to change notification settings - Fork 1k
/
test_split.py
310 lines (243 loc) · 11.8 KB
/
test_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import os
from copy import copy
import numpy as np
from collections import Counter
import pytest
from six import itervalues
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import KFold
from surprise.model_selection import ShuffleSplit
from surprise.model_selection import RepeatedKFold
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise.model_selection import PredefinedKFold
from surprise.model_selection.split import get_cv
# Seed numpy's global RNG once for the whole module: several tests below
# assert that splitters with random_state=None produce *different* folds on
# successive calls, and that outcome is conditioned on this fixed seed.
np.random.seed(1)
def test_KFold(toy_data):
    """Check KFold: n_splits validation, shuffling and random_state."""

    # n_splits folds are produced.
    splitter = KFold(n_splits=5)
    assert sum(1 for _ in splitter.split(toy_data)) == 5

    # More folds than ratings is rejected.
    with pytest.raises(ValueError):
        splitter = KFold(n_splits=10)
        next(splitter.split(toy_data))
    # Fewer than 2 folds is rejected as well.
    with pytest.raises(ValueError):
        splitter = KFold(n_splits=1)
        next(splitter.split(toy_data))

    # Without shuffling, the users found in the successive testsets keep
    # their original order: 0, 1, 2, 3, 4.
    splitter = KFold(n_splits=5, shuffle=False)
    users = [int(testset[0][0][-1])
             for (_, testset) in splitter.split(toy_data)]
    assert users == list(range(5))

    # Two successive calls without shuffling give identical folds.
    splitter = KFold(n_splits=5, shuffle=False)
    folds_a = [testset for (_, testset) in splitter.split(toy_data)]
    folds_b = [testset for (_, testset) in splitter.split(toy_data)]
    assert folds_a == folds_b
    # Same folds again from a brand new unshuffled KFold instance.
    splitter = KFold(n_splits=5, shuffle=False)
    folds_a = [testset for (_, testset) in splitter.split(toy_data)]
    assert folds_a == folds_b

    # Shuffling with random_state=None yields different folds
    # (conditioned on the numpy seed set at the top of this file).
    splitter = KFold(n_splits=5, random_state=None, shuffle=True)
    folds_b = [testset for (_, testset) in splitter.split(toy_data)]
    assert folds_a != folds_b
    # Two calls to split() on that same instance also differ when
    # random_state is None.
    folds_a = [testset for (_, testset) in splitter.split(toy_data)]
    assert folds_a != folds_b

    # With shuffle=True and a fixed random_state, repeated calls on the same
    # instance are reproducible.
    splitter = KFold(n_splits=5, random_state=1, shuffle=True)
    folds_a = [testset for (_, testset) in splitter.split(toy_data)]
    folds_b = [testset for (_, testset) in splitter.split(toy_data)]
    assert folds_a == folds_b

    # KFold must never mutate the dataset's raw ratings.
    ratings_before = copy(toy_data.raw_ratings)
    splitter = KFold(n_splits=5, shuffle=True)
    next(splitter.split(toy_data))
    assert ratings_before == toy_data.raw_ratings
def test_ShuffleSplit(toy_data):
    """Check ShuffleSplit: size validation, sizing logic and random_state."""

    # n_splits must be strictly positive: rejected at construction time.
    with pytest.raises(ValueError):
        splitter = ShuffleSplit(n_splits=0)
    # An oversized test_size is rejected when splitting.
    with pytest.raises(ValueError):
        splitter = ShuffleSplit(test_size=10)
        next(splitter.split(toy_data))
    # Same for an oversized train_size.
    with pytest.raises(ValueError):
        splitter = ShuffleSplit(train_size=10)
        next(splitter.split(toy_data))
    # train + test may not exceed the number of ratings (5 here).
    with pytest.raises(ValueError):
        splitter = ShuffleSplit(test_size=3, train_size=3)
        next(splitter.split(toy_data))
    # A zero-sized trainset is invalid.
    with pytest.raises(ValueError):
        splitter = ShuffleSplit(test_size=3, train_size=0)
        next(splitter.split(toy_data))
    # A zero-sized testset is invalid as well.
    with pytest.raises(ValueError):
        splitter = ShuffleSplit(test_size=0, train_size=3)
        next(splitter.split(toy_data))

    # The two sets do not need to cover the entire dataset.
    splitter = ShuffleSplit(test_size=1, train_size=1)
    next(splitter.split(toy_data))

    # test_size as an int, train_size None -> train is the complement.
    splitter = ShuffleSplit(test_size=1)
    for (_, testset) in splitter.split(toy_data):
        assert len(testset) == 1
    for (trainset, _) in splitter.split(toy_data):
        assert trainset.n_ratings == 4

    # test_size as a float, train_size None -> 20% of 5 ratings = 1.
    splitter = ShuffleSplit(test_size=.2)
    for (_, testset) in splitter.split(toy_data):
        assert len(testset) == 1
    for (trainset, _) in splitter.split(toy_data):
        assert trainset.n_ratings == 4

    # Both sizes given as ints.
    splitter = ShuffleSplit(test_size=2, train_size=2)
    for (_, testset) in splitter.split(toy_data):
        assert len(testset) == 2
    for (trainset, _) in splitter.split(toy_data):
        assert trainset.n_ratings == 2

    # test_size None (complement), train_size as an int.
    splitter = ShuffleSplit(test_size=None, train_size=2)
    for (_, testset) in splitter.split(toy_data):
        assert len(testset) == 3
    for (trainset, _) in splitter.split(toy_data):
        assert trainset.n_ratings == 2

    # test_size None (complement), train_size as a float.
    splitter = ShuffleSplit(test_size=None, train_size=.2)
    for (_, testset) in splitter.split(toy_data):
        assert len(testset) == 4
    for (trainset, _) in splitter.split(toy_data):
        assert trainset.n_ratings == 1

    # Defaults: 5 splits, test_size=.2, train_size=None.
    splitter = ShuffleSplit()
    assert sum(1 for _ in splitter.split(toy_data)) == 5
    for (_, testset) in splitter.split(toy_data):
        assert len(testset) == 1
    for (trainset, _) in splitter.split(toy_data):
        assert trainset.n_ratings == 4

    # random_state=None: successive calls to split() give different folds
    # (conditioned on the rng of course).
    splitter = ShuffleSplit(random_state=None)
    folds_a = [testset for (_, testset) in splitter.split(toy_data)]
    folds_b = [testset for (_, testset) in splitter.split(toy_data)]
    assert folds_a != folds_b

    # A fixed random_state makes repeated calls reproducible.
    splitter = ShuffleSplit(random_state=1)
    folds_a = [testset for (_, testset) in splitter.split(toy_data)]
    folds_b = [testset for (_, testset) in splitter.split(toy_data)]
    assert folds_a == folds_b

    # With shuffle=False the splits are deterministic regardless of
    # random_state.
    splitter = ShuffleSplit(random_state=1, shuffle=False)
    folds_a = [testset for (_, testset) in splitter.split(toy_data)]
    folds_b = [testset for (_, testset) in splitter.split(toy_data)]
    assert folds_a == folds_b
def test_train_test_split(toy_data):
    """Check train_test_split(): sizing logic, random_state and shuffle.

    toy_data has 5 ratings, so sizes below are computed against n=5.
    """
    # test test_size to int and train_size to None (complement)
    trainset, testset = train_test_split(toy_data, test_size=2,
                                         train_size=None)
    assert len(testset) == 2
    assert trainset.n_ratings == 3

    # test test_size to float and train_size to None (complement):
    # 20% of 5 = 1.
    trainset, testset = train_test_split(toy_data, test_size=.2,
                                         train_size=None)
    assert len(testset) == 1
    assert trainset.n_ratings == 4

    # test test_size to int and train_size to int
    trainset, testset = train_test_split(toy_data, test_size=2, train_size=3)
    assert len(testset) == 2
    assert trainset.n_ratings == 3

    # test test_size to None (complement) and train_size to int
    trainset, testset = train_test_split(toy_data, test_size=None,
                                         train_size=2)
    assert len(testset) == 3
    assert trainset.n_ratings == 2

    # test test_size to None (complement) and train_size to float
    trainset, testset = train_test_split(toy_data, test_size=None,
                                         train_size=.2)
    assert len(testset) == 4
    assert trainset.n_ratings == 1

    # Test random_state parameter
    # If random_state is None, you get different split each time (conditioned
    # by rng of course)
    _, testset_a = train_test_split(toy_data, random_state=None)
    _, testset_b = train_test_split(toy_data, random_state=None)
    assert testset_a != testset_b

    # Repeated calls to split when random_state is set lead to the same folds
    _, testset_a = train_test_split(toy_data, random_state=1)
    _, testset_b = train_test_split(toy_data, random_state=1)
    assert testset_a == testset_b

    # Test shuffle parameter: if False then splits are the same regardless of
    # random_state.
    # Bug fix: this used to pass shuffle=None, which only behaved as intended
    # because None is falsy; pass an explicit False as the comment describes.
    _, testset_a = train_test_split(toy_data, random_state=1, shuffle=False)
    _, testset_b = train_test_split(toy_data, random_state=1, shuffle=False)
    assert testset_a == testset_b
def test_RepeatedCV(toy_data):
    """Check RepeatedKFold: fold count, inter-repetition variation and
    random_state reproducibility.

    Idiom fix: ``list(x for x in ...)`` replaced by plain list
    comprehensions (flake8-comprehensions C400).
    """
    # The iterator yields n_splits * n_repeats (train, test) pairs.
    rkf = RepeatedKFold(n_splits=3, n_repeats=2)
    assert len(list(rkf.split(toy_data))) == 3 * 2
    rkf = RepeatedKFold(n_splits=3, n_repeats=4)
    assert len(list(rkf.split(toy_data))) == 3 * 4
    rkf = RepeatedKFold(n_splits=4, n_repeats=3)
    assert len(list(rkf.split(toy_data))) == 4 * 3

    # Folds differ between the 2 repetitions, even with a fixed
    # random_state: the state controls the whole sequence, not each
    # repetition individually.
    rkf = RepeatedKFold(n_splits=3, n_repeats=2, random_state=3)
    testsets = [testset for (_, testset) in rkf.split(toy_data)]
    for i in range(3):
        assert testsets[i] != testsets[i + 3]

    # Same folds when the same cv iterator is called again on the same data
    # (random_state is set).
    rkf = RepeatedKFold(n_splits=3, n_repeats=2, random_state=3)
    testsets_a = [testset for (_, testset) in rkf.split(toy_data)]
    testsets_b = [testset for (_, testset) in rkf.split(toy_data)]
    assert testsets_a == testsets_b

    # Different folds on repeated calls when random_state is None.
    rkf = RepeatedKFold(n_splits=3, n_repeats=2, random_state=None)
    testsets_a = [testset for (_, testset) in rkf.split(toy_data)]
    testsets_b = [testset for (_, testset) in rkf.split(toy_data)]
    assert testsets_a != testsets_b
def test_LeaveOneOut(toy_data):
    """Check LeaveOneOut: failure on tiny data, random_state and
    min_n_ratings."""

    # In toy_data each user has a single rating, so no trainset can be built
    # and splitting must fail.
    loo = LeaveOneOut()
    with pytest.raises(ValueError):
        next(loo.split(toy_data))

    # Load a real dataset where users have several ratings.
    reader = Reader('ml-100k')
    data_path = (os.path.dirname(os.path.realpath(__file__)) +
                 '/u1_ml100k_test')
    data = Dataset.load_from_file(file_path=data_path, reader=reader)

    # random_state=None: successive calls to split() give different folds
    # (conditioned on the rng of course).
    loo = LeaveOneOut(random_state=None)
    folds_a = [testset for (_, testset) in loo.split(data)]
    folds_b = [testset for (_, testset) in loo.split(data)]
    assert folds_a != folds_b

    # A fixed random_state makes repeated calls reproducible.
    loo = LeaveOneOut(random_state=1)
    folds_a = [testset for (_, testset) in loo.split(data)]
    folds_b = [testset for (_, testset) in loo.split(data)]
    assert folds_a == folds_b

    # Each user contributes exactly one rating to every testset.
    loo = LeaveOneOut()
    for _, testset in loo.split(data):
        per_user = Counter(uid for (uid, _, _) in testset)
        assert all(count == 1 for count in itervalues(per_user))

    # min_n_ratings: every user kept in the trainset has at least that many
    # ratings.
    loo = LeaveOneOut(min_n_ratings=5)
    for trainset, _ in loo.split(data):
        assert all(len(ratings) >= 5 for ratings in itervalues(trainset.ur))

    loo = LeaveOneOut(min_n_ratings=10)
    for trainset, _ in loo.split(data):
        assert all(len(ratings) >= 10 for ratings in itervalues(trainset.ur))

    # A threshold higher than any user's rating count empties the trainset.
    loo = LeaveOneOut(min_n_ratings=10000)
    with pytest.raises(ValueError):
        next(loo.split(data))
def test_PredifinedKFold():
    """Check that PredefinedKFold reads the predefined fold files correctly."""
    # NOTE(review): 'Predifined' in the function name is a typo; kept as-is
    # so the test id stays stable.
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    # A single (train, test) pair is produced, and the ratings counts match
    # the files' contents.
    splitter = PredefinedKFold()
    trainset, testset = next(splitter.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3
def test_get_cv():
    """Check get_cv(): valid inputs are accepted, invalid ones raise."""
    # None, an int, and a cv iterator are all valid.
    for valid in (None, 4, KFold()):
        get_cv(valid)
    # Anything else (float, string, ...) is rejected.
    with pytest.raises(ValueError):
        get_cv(23.2)
    with pytest.raises(ValueError):
        get_cv('bad')