Skip to content

Commit

Permalink
GridSearchCV now allows return_train_measures
Browse files Browse the repository at this point in the history
  • Loading branch information
NicolasHug committed Jan 7, 2018
1 parent 58fc29e commit 112c8f9
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 13 deletions.
2 changes: 1 addition & 1 deletion TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ TODO
* Update README example before new release, as well as computation times
* grid search should allow the refit param and the test method using the best
estimator
* Grid search and cross_validate should allow return_train_score

* check conda forge
* make some filtering dataset tools, like remove users/items with less/more
Expand All @@ -15,6 +14,7 @@ TODO
Done:
-----

* Grid search and cross_validate now allow return_train_score
* Make all fit methods return self. Update docs on building custom algorithms
* Update doc of MF algo to indicate how to retrieve latent factors.
* all algorithms using random initialization now have a random_state parameter.
Expand Down
40 changes: 29 additions & 11 deletions surprise/model_selection/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ class GridSearchCV:
appropriate ``n_splits`` parameter. If ``None``, :class:`KFold
<surprise.model_selection.split.KFold>` is used with
``n_splits=5``.
return_train_measures(bool): Whether to compute performance measures on
the trainsets. If ``True``, the ``cv_results`` attribute will
also contain measures for trainsets. Default is ``False``.
n_jobs(int): The maximum number of parallel training procedures.
- If ``-1``, all CPUs are used.
Expand Down Expand Up @@ -89,13 +92,14 @@ class GridSearchCV:
'''

def __init__(self, algo_class, param_grid, measures=['rmse', 'mae'],
cv=None, n_jobs=-1, pre_dispatch='2*n_jobs',
joblib_verbose=0):
cv=None, return_train_measures=False, n_jobs=-1,
pre_dispatch='2*n_jobs', joblib_verbose=0):

self.algo_class = algo_class
self.param_grid = param_grid.copy()
self.measures = [measure.lower() for measure in measures]
self.cv = cv
self.return_train_measures = return_train_measures
self.n_jobs = n_jobs
self.pre_dispatch = pre_dispatch
self.joblib_verbose = joblib_verbose
Expand Down Expand Up @@ -130,7 +134,8 @@ def fit(self, data):

delayed_list = (
delayed(fit_and_score)(self.algo_class(**params), trainset,
testset, self.measures)
testset, self.measures,
self.return_train_measures)
for params, (trainset, testset) in product(self.param_combinations,
cv.split(data))
)
Expand All @@ -156,10 +161,15 @@ def fit(self, data):
# (n_parameters_combinations, n_splits). This way we can easily compute
# the mean and std dev over all splits or over all param comb.
test_measures = dict()
train_measures = dict()
new_shape = (len(self.param_combinations), cv.get_n_folds())
for m in self.measures:
test_measures[m] = np.asarray([d[m] for d in test_measures_dicts])
test_measures[m] = test_measures[m].reshape(new_shape)
if self.return_train_measures:
train_measures[m] = np.asarray([d[m] for d in
train_measures_dicts])
train_measures[m] = train_measures[m].reshape(new_shape)

cv_results = dict()
best_index = dict()
Expand All @@ -171,12 +181,20 @@ def fit(self, data):
for split in range(cv.get_n_folds()):
cv_results['split{0}_test_{1}'.format(split, m)] = \
test_measures[m][:, split]

# cv_results: set mean and std over all splits (testset) for each
# param comb
mean_measures = test_measures[m].mean(axis=1)
cv_results['mean_test_{}'.format(m)] = mean_measures
if self.return_train_measures:
cv_results['split{0}_train_{1}'.format(split, m)] = \
train_measures[m][:, split]

# cv_results: set mean and std over all splits (testset and
# trainset) for each param comb
mean_test_measures = test_measures[m].mean(axis=1)
cv_results['mean_test_{}'.format(m)] = mean_test_measures
cv_results['std_test_{}'.format(m)] = test_measures[m].std(axis=1)
if self.return_train_measures:
mean_train_measures = train_measures[m].mean(axis=1)
cv_results['mean_train_{}'.format(m)] = mean_train_measures
cv_results['std_train_{}'.format(m)] = \
train_measures[m].std(axis=1)

# cv_results: set rank of each param comb
indices = cv_results['mean_test_{}'.format(m)].argsort()
Expand All @@ -186,11 +204,11 @@ def fit(self, data):

# set best_index, and best_xxxx attributes
if m in ('mae', 'rmse'):
best_index[m] = mean_measures.argmin()
best_index[m] = mean_test_measures.argmin()
elif m in ('fcp', ):
best_index[m] = mean_measures.argmax()
best_index[m] = mean_test_measures.argmax()
best_params[m] = self.param_combinations[best_index[m]]
best_score[m] = mean_measures[best_index[m]]
best_score[m] = mean_test_measures[best_index[m]]
best_estimator[m] = self.algo_class(**best_params[m])

# Cv results: set fit and train times (mean, std)
Expand Down
17 changes: 16 additions & 1 deletion tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def test_cv_results():
kf = KFold(3, shuffle=True, random_state=4)
param_grid = {'n_epochs': [5], 'lr_all': [.2, .2],
'reg_all': [.4, .4], 'n_factors': [5], 'random_state': [0]}
gs = GridSearchCV(SVD, param_grid, measures=['RMSE', 'mae'], cv=kf)
gs = GridSearchCV(SVD, param_grid, measures=['RMSE', 'mae'], cv=kf,
return_train_measures=True)
gs.fit(data)

# test keys split*_test_rmse, mean and std dev.
Expand All @@ -110,6 +111,20 @@ def test_cv_results():
gs.cv_results['split1_test_rmse'],
gs.cv_results['split2_test_rmse']], axis=0))

# test keys split*_train_mae, mean and std dev.
assert gs.cv_results['split0_train_rmse'].shape == (4,) # 4 param comb.
assert gs.cv_results['split1_train_rmse'].shape == (4,) # 4 param comb.
assert gs.cv_results['split2_train_rmse'].shape == (4,) # 4 param comb.
assert gs.cv_results['mean_train_rmse'].shape == (4,) # 4 param comb.
assert np.allclose(gs.cv_results['mean_train_rmse'],
np.mean([gs.cv_results['split0_train_rmse'],
gs.cv_results['split1_train_rmse'],
gs.cv_results['split2_train_rmse']], axis=0))
assert np.allclose(gs.cv_results['std_train_rmse'],
np.std([gs.cv_results['split0_train_rmse'],
gs.cv_results['split1_train_rmse'],
gs.cv_results['split2_train_rmse']], axis=0))

# test fit and train times dimensions.
assert gs.cv_results['mean_fit_time'].shape == (4,) # 4 param comb.
assert gs.cv_results['std_fit_time'].shape == (4,) # 4 param comb.
Expand Down

0 comments on commit 112c8f9

Please sign in to comment.