This repository has been archived by the owner on Jan 31, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
utils.py
226 lines (194 loc) · 6.82 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
"""
Utility functions
"""
import pickle
import time
from warnings import warn
from distutils.version import LooseVersion
CONTAINERS = (list, dict, tuple)
TYPES = (int, float, str, bool, type)
MIN_VERSION = "0.20"
def check_types(obj, containers=CONTAINERS, types=TYPES):
    """
    Check that the input object is an allowed type.

    Objects can be acceptable containers or acceptable types
    themselves. Containers are checked recursively to ensure all
    contained types are valid. If object is a `pure_sklearn` type,
    its attributes are all recursively checked.

    Args:
        obj: object to validate
        containers (tuple): container types accepted as valid
        types (tuple): scalar types accepted as valid

    Raises:
        ValueError: if ``obj``, or any value nested within it, is not
            an allowed type
    """
    if isinstance(obj, containers):
        if isinstance(obj, (list, tuple)):
            for ob in obj:
                # BUG FIX: propagate caller-supplied containers/types into the
                # recursion; previously nested elements were always checked
                # against the module-level defaults.
                check_types(ob, containers=containers, types=types)
        else:
            for k, v in obj.items():
                check_types(k, containers=containers, types=types)
                check_types(v, containers=containers, types=types)
    elif isinstance(obj, types):
        pass
    elif "pure_sklearn" in str(type(obj)):
        # pure_sklearn estimators: validate every attribute recursively
        for attr in vars(obj):
            check_types(getattr(obj, attr), containers=containers, types=types)
    elif obj is None:
        pass
    else:
        raise ValueError("Object contains invalid type: {}".format(type(obj)))
def check_version(estimator, min_version=None):
    """
    Check the version of scikit-learn the estimator was fitted with.

    Args:
        estimator: fitted sklearn estimator; expected to record
            '_sklearn_version' in its pickle state
        min_version (str): optional hard minimum sklearn version

    Raises:
        Exception: if the recorded version is below ``min_version``

    Warns when the version cannot be determined, or when it is below
    the module-level ``MIN_VERSION`` soft floor.
    """
    warning_str = (
        "Estimators fitted with sklearn version < {} are not guaranteed to work".format(
            MIN_VERSION
        )
    )
    try:
        version_ = estimator.__getstate__()["_sklearn_version"]
    except Exception:
        # BUG FIX: narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt are no longer swallowed. Best effort: the
        # version was not recorded, so warn and continue.
        warn(warning_str)
        return
    # NOTE(review): distutils (LooseVersion) is removed in Python 3.12+;
    # a migration to packaging.version would require a new dependency.
    if (min_version is not None) and (
        LooseVersion(version_) < LooseVersion(min_version)
    ):
        raise Exception(
            "The sklearn version is too low for this estimator; must be >= {}".format(
                min_version
            )
        )
    elif LooseVersion(version_) < LooseVersion(MIN_VERSION):
        warn(warning_str)
def convert_type(dtype):
    """Map a (possibly numpy) datatype to its pure-python equivalent."""
    sample = dtype(0)
    # numpy scalar types expose `.item()`, which unboxes to a builtin type;
    # plain python types are returned unchanged.
    return type(sample.item()) if hasattr(sample, "item") else dtype
def check_array(X, handle_sparse="error"):
    """
    Check if array is compatible for prediction with
    `pure_sklearn` classes.

    Input 'X' should be a non-empty `list` or `sparse_list`. If 'X'
    is sparse, flexible sparse handling is applied: allow sparse with
    handle_sparse='allow', or error on sparse input with
    handle_sparse='error' (the default).

    Args:
        X: input to validate
        handle_sparse (str): 'allow' or 'error'

    Returns:
        X unchanged, when valid.

    Raises:
        ValueError: on sparse input (when not allowed), on an invalid
            'handle_sparse' value, or on empty input
        TypeError: when 'X' is not a list
    """
    if issparse(X):
        if handle_sparse == "allow":
            return X
        elif handle_sparse == "error":
            raise ValueError("Sparse input is not supported " "for this estimator")
        else:
            raise ValueError(
                "Invalid value for 'handle_sparse' "
                "input. Acceptable values are 'allow' or 'error'"
            )
    if not isinstance(X, list):
        raise TypeError("Input 'X' must be a list")
    if len(X) == 0:
        # BUG FIX: the exception was previously *returned*, not raised,
        # so empty input silently passed validation.
        raise ValueError("Input 'X' must not be empty")
    return X
def shape(X):
    """
    Return the shape of the input list, similar to numpy
    `ndarray.shape`. Handles `list` or `sparse_list` input.
    """
    dims = ndim(X)
    if dims == 1:
        return (len(X),)
    if dims == 2:
        # sparse rows are dicts, so column count comes from the
        # structure's declared size rather than the first row's length
        n_cols = X.size if issparse(X) else len(X[0])
        return (len(X), n_cols)
def ndim(X):
    """Infer the dimension of the input list: nested rows (lists or
    sparse dict rows) mean 2-D, anything else is treated as 1-D."""
    return 2 if isinstance(X[0], (list, dict)) else 1
def tosparse(A):
    """Convert a dense 2-D list into its `sparse_list` representation."""
    return sparse_list(A)
def todense(A):
    """Expand the input `sparse_list` back into a dense list of lists."""
    return A.todense()
def issparse(A):
    """Return True when the input is a `sparse_list` rather than a dense list."""
    return isinstance(A, sparse_list)
class sparse_list(list):
    """
    Pure python implementation of a 2-D sparse data structure.

    The data structure is a list of dictionaries. Each dictionary
    represents a 'row' of data. The dictionary keys correspond to the
    indices of 'columns' and the dictionary values correspond to the
    data value associated with that index. Missing keys are assumed
    to have values of 0.

    Args:
        A (list): 2-D list of lists or list of dicts
        size (int): Number of 'columns' of the data structure
        dtype (type): Data type of data values

    Examples:
        >>> A = [[0, 1, 0], [0, 1, 1]]
        >>> print(sparse_list(A))
        ... [{1: 1}, {1: 1, 2: 1}]
        >>>
        >>> B = [{3: 0.5}, {1: 0.9, 10: 0.2}]
        >>> print(sparse_list(B, size=11, dtype=float))
        ... [{3: 0.5}, {1: 0.9, 10: 0.2}]
    """

    def __init__(self, A, size=None, dtype=None):
        # BUG FIX (docs): the first doctest example previously showed
        # [{1:1}, {2:1, 3:1}], but keys are 0-based column indices, so the
        # correct result is [{1: 1}, {1: 1, 2: 1}].
        if isinstance(A[0], dict):
            # Rows are already sparse dicts: trust caller-supplied metadata.
            self.dtype = float if dtype is None else dtype
            self.size = size
            for row in A:
                self.append(row)
        else:
            # Dense input: validate, then keep only the non-zero entries.
            A = check_array(A)
            self.size = shape(A)[1]
            self.dtype = type(A[0][0])
            for row in A:
                # dict comprehension replaces dict([(i, ...) ...]) (idiom)
                self.append({i: row[i] for i in range(self.size) if row[i] != 0})

    def todense(self):
        """Convert `sparse_list` instance to a dense list of lists."""
        # typed zero so the dense output keeps the structure's dtype
        zero_val = self.dtype(0)
        return [
            [row.get(i, zero_val) for i in range(self.size)] for row in self
        ]
def performance_comparison(sklearn_estimator, pure_sklearn_estimator, X):
    """
    Profile performance characteristics between sklearn estimator and
    corresponding pure-predict estimator.

    Prints pickle size, unpickle time, and single-record predict
    latency for both estimators, plus the pure-predict / sklearn
    ratio of each metric.

    Args:
        sklearn_estimator (object)
        pure_sklearn_estimator (object)
        X (numpy ndarray): features for prediction
    """

    def _timed(func, *args):
        # Wall-clock seconds for a single call; returns (result, elapsed).
        begin = time.time()
        out = func(*args)
        return out, time.time() - begin

    ### -- profile pickled object size: sklearn vs pure-predict
    pickled = pickle.dumps(sklearn_estimator)
    pickled_ = pickle.dumps(pure_sklearn_estimator)
    print("Pickle Size sklearn: {}".format(len(pickled)))
    print("Pickle Size pure-predict: {}".format(len(pickled_)))
    print("Difference: {}".format(len(pickled_) / float(len(pickled))))

    ### -- profile unpickle time: sklearn vs pure-predict
    _, pickle_t = _timed(pickle.loads, pickled)
    print("Unpickle time sklearn: {}".format(pickle_t))
    _, pickle_t_ = _timed(pickle.loads, pickled_)
    print("Unpickle time pure-predict: {}".format(pickle_t_))
    print("Difference: {}".format(pickle_t_ / pickle_t))

    ### -- profile single record predict latency: sklearn vs pure-predict
    X_pred = X[:1]
    X_pred_ = X_pred if isinstance(X_pred, list) else X_pred.tolist()
    _, pred_t = _timed(sklearn_estimator.predict, X_pred)
    print("Predict 1 record sklearn: {}".format(pred_t))
    _, pred_t_ = _timed(pure_sklearn_estimator.predict, X_pred_)
    print("Predict 1 record pure-predict: {}".format(pred_t_))
    print("Difference: {}".format(pred_t_ / pred_t))