/
trainset.py
265 lines (204 loc) · 8.32 KB
/
trainset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
"""This module contains the Trainset class."""
import numpy as np
class Trainset:
    """A trainset contains all useful data that constitute a training set.

    It is used by the :meth:`fit()
    <surprise.prediction_algorithms.algo_base.AlgoBase.fit>` method of every
    prediction algorithm. You should not try to build such an object on your
    own but rather use the :meth:`Dataset.folds()
    <surprise.dataset.Dataset.folds>` method or the
    :meth:`DatasetAutoFolds.build_full_trainset()
    <surprise.dataset.DatasetAutoFolds.build_full_trainset>` method.

    Trainsets are different from :class:`Datasets <surprise.dataset.Dataset>`.
    You can think of a :class:`Dataset <surprise.dataset.Dataset>` as the raw
    data, and Trainsets as higher-level data where useful methods are defined.
    Also, a :class:`Dataset <surprise.dataset.Dataset>` may be comprised of
    multiple Trainsets (e.g. when doing cross validation).

    Attributes:
        ur(:obj:`defaultdict` of :obj:`list`): The users ratings. This is a
            dictionary containing lists of tuples of the form
            ``(item_inner_id, rating)``. The keys are user inner ids.
        ir(:obj:`defaultdict` of :obj:`list`): The items ratings. This is a
            dictionary containing lists of tuples of the form
            ``(user_inner_id, rating)``. The keys are item inner ids.
        n_users: Total number of users :math:`|U|`.
        n_items: Total number of items :math:`|I|`.
        n_ratings: Total number of ratings :math:`|R_{train}|`.
        rating_scale(tuple): The minimum and maximal rating of the rating
            scale.
        global_mean: The mean of all ratings :math:`\\mu`.
    """

    def __init__(
        self,
        ur,
        ir,
        n_users,
        n_items,
        n_ratings,
        rating_scale,
        raw2inner_id_users,
        raw2inner_id_items,
    ):
        """Store the rating structures and the raw-to-inner id mappings.

        Args:
            ur: Mapping from user inner id to a list of
                ``(item_inner_id, rating)`` tuples.
            ir: Mapping from item inner id to a list of
                ``(user_inner_id, rating)`` tuples.
            n_users: Total number of users.
            n_items: Total number of items.
            n_ratings: Total number of ratings.
            rating_scale(tuple): The minimum and maximal rating.
            raw2inner_id_users(dict): Mapping from user raw id to user inner
                id.
            raw2inner_id_items(dict): Mapping from item raw id to item inner
                id.
        """
        self.ur = ur
        self.ir = ir
        self.n_users = n_users
        self.n_items = n_items
        self.n_ratings = n_ratings
        self.rating_scale = rating_scale
        self._raw2inner_id_users = raw2inner_id_users
        self._raw2inner_id_items = raw2inner_id_items

        # Computed lazily on first access of the ``global_mean`` property.
        self._global_mean = None

        # inner2raw dicts could be built right now (or even before) but they
        # are not always useful so we wait until we need them.
        self._inner2raw_id_users = None
        self._inner2raw_id_items = None

    def knows_user(self, uid):
        """Indicate if the user is part of the trainset.

        A user is part of the trainset if the user has at least one rating.

        Args:
            uid(int): The (inner) user id. See :ref:`this
                note<raw_inner_note>`.

        Returns:
            ``True`` if user is part of the trainset, else ``False``.
        """
        return uid in self.ur

    def knows_item(self, iid):
        """Indicate if the item is part of the trainset.

        An item is part of the trainset if the item was rated at least once.

        Args:
            iid(int): The (inner) item id. See :ref:`this
                note<raw_inner_note>`.

        Returns:
            ``True`` if item is part of the trainset, else ``False``.
        """
        return iid in self.ir

    def to_inner_uid(self, ruid):
        """Convert a **user** raw id to an inner id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            ruid(str): The user raw id.

        Returns:
            int: The user inner id.

        Raises:
            ValueError: When user is not part of the trainset.
        """
        try:
            return self._raw2inner_id_users[ruid]
        except KeyError as err:
            # Chain explicitly so the failed lookup stays visible as the cause.
            raise ValueError(
                "User " + str(ruid) + " is not part of the trainset."
            ) from err

    def to_raw_uid(self, iuid):
        """Convert a **user** inner id to a raw id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            iuid(int): The user inner id.

        Returns:
            str: The user raw id.

        Raises:
            ValueError: When ``iuid`` is not an inner id.
        """
        # Build the reverse mapping once, on first use.
        if self._inner2raw_id_users is None:
            self._inner2raw_id_users = {
                inner: raw for (raw, inner) in self._raw2inner_id_users.items()
            }

        try:
            return self._inner2raw_id_users[iuid]
        except KeyError as err:
            raise ValueError(str(iuid) + " is not a valid inner id.") from err

    def to_inner_iid(self, riid):
        """Convert an **item** raw id to an inner id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            riid(str): The item raw id.

        Returns:
            int: The item inner id.

        Raises:
            ValueError: When item is not part of the trainset.
        """
        try:
            return self._raw2inner_id_items[riid]
        except KeyError as err:
            raise ValueError(
                "Item " + str(riid) + " is not part of the trainset."
            ) from err

    def to_raw_iid(self, iiid):
        """Convert an **item** inner id to a raw id.

        See :ref:`this note<raw_inner_note>`.

        Args:
            iiid(int): The item inner id.

        Returns:
            str: The item raw id.

        Raises:
            ValueError: When ``iiid`` is not an inner id.
        """
        # Build the reverse mapping once, on first use.
        if self._inner2raw_id_items is None:
            self._inner2raw_id_items = {
                inner: raw for (raw, inner) in self._raw2inner_id_items.items()
            }

        try:
            return self._inner2raw_id_items[iiid]
        except KeyError as err:
            raise ValueError(str(iiid) + " is not a valid inner id.") from err

    def all_ratings(self):
        """Generator function to iterate over all ratings.

        Yields:
            A tuple ``(uid, iid, rating)`` where ids are inner ids (see
            :ref:`this note <raw_inner_note>`).
        """
        for u, u_ratings in self.ur.items():
            for i, r in u_ratings:
                yield u, i, r

    def build_testset(self):
        """Return a list of ratings that can be used as a testset in the
        :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
        method.

        The ratings are all the ratings that are in the trainset, i.e. all the
        ratings returned by the :meth:`all_ratings()
        <surprise.Trainset.all_ratings>` generator. This is useful in
        cases where you want to test your algorithm on the trainset.

        Returns:
            A list of tuples ``(uid, iid, rating)`` where ids are raw ids.
        """
        return [
            (self.to_raw_uid(u), self.to_raw_iid(i), r)
            for (u, i, r) in self.all_ratings()
        ]

    def build_anti_testset(self, fill=None):
        """Return a list of ratings that can be used as a testset in the
        :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
        method.

        The ratings are all the ratings that are **not** in the trainset, i.e.
        all the ratings :math:`r_{ui}` where the user :math:`u` is known, the
        item :math:`i` is known, but the rating :math:`r_{ui}` is not in the
        trainset. As :math:`r_{ui}` is unknown, it is either replaced by the
        :code:`fill` value or assumed to be equal to the mean of all ratings
        :meth:`global_mean <surprise.Trainset.global_mean>`.

        Args:
            fill(float): The value to fill unknown ratings. If :code:`None` the
                global mean of all ratings :meth:`global_mean
                <surprise.Trainset.global_mean>` will be used.

        Returns:
            A list of tuples ``(uid, iid, fill)`` where ids are raw ids.
        """
        fill = self.global_mean if fill is None else float(fill)

        anti_testset = []
        for u in self.all_users():
            # Use .get() rather than indexing: ``ur`` is typically a
            # defaultdict, and indexing a user without ratings would insert
            # an empty entry (making knows_user() report that user as known);
            # with a plain dict it would raise KeyError instead.
            user_items = {j for (j, _) in self.ur.get(u, ())}
            unrated = [i for i in self.all_items() if i not in user_items]
            if not unrated:
                continue

            # The raw user id does not depend on the item, so resolve it once
            # per user instead of once per (user, item) pair.
            raw_uid = self.to_raw_uid(u)
            anti_testset.extend(
                (raw_uid, self.to_raw_iid(i), fill) for i in unrated
            )
        return anti_testset

    def all_users(self):
        """Return an iterable over all user inner ids.

        Returns:
            A ``range`` over ``0 .. n_users - 1``, i.e. the inner ids of
            users.
        """
        return range(self.n_users)

    def all_items(self):
        """Return an iterable over all item inner ids.

        Returns:
            A ``range`` over ``0 .. n_items - 1``, i.e. the inner ids of
            items.
        """
        return range(self.n_items)

    @property
    def global_mean(self):
        """The mean of all ratings in the trainset.

        The value is computed on first access from :meth:`all_ratings` and
        cached for subsequent accesses.
        """
        if self._global_mean is None:
            self._global_mean = np.mean([r for (_, _, r) in self.all_ratings()])

        return self._global_mean