Modified grid search tests to run faster
NicolasHug committed Jan 6, 2018
1 parent 9a6c673 commit a997313
Showing 3 changed files with 11 additions and 15 deletions.
CHANGELOG.md (2 additions, 0 deletions)
@@ -11,6 +11,8 @@ Enhancements
 * GridSearch is now parallel, using joblib.
 * default data directory can now be custom with env variable
   SURPRISE_DATA_FOLDER
+* Algorithms using a random initialization (e.g. SVD, NMF, CoClustering) now
+  have a random_state parameter for seeding the RNG.
 
 API Changes
 -----------
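
For context, a minimal sketch (not part of this commit) of how the new
random_state parameter is meant to be used; it assumes the library's standard
cross_validate helper and built-in ml-100k loader:

    from surprise import SVD, Dataset
    from surprise.model_selection import cross_validate

    data = Dataset.load_builtin('ml-100k')
    # Same seed -> same factor initialization -> reproducible scores.
    algo = SVD(n_factors=5, n_epochs=5, random_state=0)
    cross_validate(algo, data, measures=['RMSE'], cv=3)
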
TODO.md (1 addition, 8 deletions)
@@ -3,14 +3,6 @@ TODO
 
 
 * Update README example before new release, as well as computation times
-* all algorithms using random initialization should allow to define
-  random_state. This is paramount for having correct gridsearch results (else
-  different initializations are used for the various parameter combinations).
-  When done, change tests of these algorithms so that they all use the same
-  seed. Right now tests about different RMSE values are not relevant. Also, use
-  SVD on test file when possible for grid search tests. Right now we use knn on
-  train (test does not have enough ratings for parameters to be impactful) and
-  it's slower.
 * Make all fit methods (for algo and GridSearch) return self. Update docs on
   building custom algorithms, and on getting started -> gridsearch (add
   example?).
@@ -24,6 +16,7 @@ TODO
 Done:
 -----
 
+* all algorithms using random initialization now have a random_state parameter.
 * CV iterators:
   - Write basic CV iterators
   - evaluate -> rewrite to use CV iterators. Rename it into cross_validate.
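
The deleted TODO item above captures the motivation: without a fixed seed, two
grid search candidates with identical parameters start from different random
factors and can report different scores. A hedged sketch of the effect,
assuming the library's train_test_split helper and accuracy module:

    from surprise import SVD, Dataset, accuracy
    from surprise.model_selection import train_test_split

    data = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(data, random_state=4)

    for _ in range(2):
        # With random_state=0 both runs print the same RMSE; drop the
        # seed and the two initializations (and scores) may differ.
        algo = SVD(n_factors=5, n_epochs=5, random_state=0)
        algo.fit(trainset)
        accuracy.rmse(algo.test(testset))
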
tests/test_search.py (8 additions, 7 deletions)
@@ -11,7 +11,6 @@
 from surprise import Dataset
 from surprise import Reader
 from surprise import SVD
-from surprise import KNNBasic
 from surprise.model_selection import KFold
 from surprise.model_selection import PredefinedKFold
 from surprise.model_selection import GridSearchCV
@@ -64,13 +63,14 @@ def test_same_splits():
     check their RMSE scores are the same once averaged over the splits, which
     should be enough). We use as much parallelism as possible."""
 
-    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
+    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
     data = Dataset.load_from_file(data_file, reader=Reader('ml-100k'))
     kf = KFold(3, shuffle=True, random_state=4)
 
     # all RMSE should be the same (as param combinations are the same)
-    param_grid = {'k': [1, 1], 'min_k': [3, 3]}
-    gs = GridSearchCV(KNNBasic, param_grid, measures=['RMSE'], cv=kf,
+    param_grid = {'n_epochs': [5], 'lr_all': [.2, .2],
+                  'reg_all': [.4, .4], 'n_factors': [5], 'random_state': [0]}
+    gs = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=kf,
                       n_jobs=-1)
     gs.fit(data)
 
@@ -88,11 +88,12 @@ def test_same_splits():
 def test_cv_results():
     '''Test the cv_results attribute'''
 
-    f = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
+    f = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
     data = Dataset.load_from_file(f, Reader('ml-100k'))
     kf = KFold(3, shuffle=True, random_state=4)
-    param_grid = {'k': [1, 10], 'sim_options': {'name': ['msd', 'cosine']}}
-    gs = GridSearchCV(KNNBasic, param_grid, measures=['RMSE', 'mae'], cv=kf)
+    param_grid = {'n_epochs': [5], 'lr_all': [.2, .2],
+                  'reg_all': [.4, .4], 'n_factors': [5], 'random_state': [0]}
+    gs = GridSearchCV(SVD, param_grid, measures=['RMSE', 'mae'], cv=kf)
     gs.fit(data)
 
     # test keys split*_test_rmse, mean and std dev.
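
Since every combination in the new grid is identical and seeded, the duplicated
values expand to candidates that must all score the same, so a cv_results check
reduces to comparing mean RMSE across candidates. A hedged sketch of that
assertion, assuming cv_results uses sklearn-style keys like mean_test_rmse (the
test comment above refers to this convention):

    import numpy as np

    # gs is the fitted GridSearchCV from the test above.
    mean_rmse = np.asarray(gs.cv_results['mean_test_rmse'])
    assert np.allclose(mean_rmse, mean_rmse[0])
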
