This notebook is the canonical demonstration of the SVR regression approach used in this project. It emulates the following run of `run_experiments.py`:

    python3 run_experiments.py ../data/plants5.csv migration_m --algo SVR --cats oceanity dispersal_mode BreedSysCode Grime --drop Taxon migr_sterr_m "shift + 2SE" signif_shift signif_shift2 dispmode01 DispModeEng "shift + 2SE" --benchmark --save --na feature

In [2]:
import TraitData

In [13]:
# --- Experiment configuration ---------------------------------------------
# These constants are handed positionally to TraitData.TraitData below.
DATAFILE = "../data/plants5.csv"
RESPONSE = "migration_m"

# Variables treated as categorical; in the column listing displayed at the
# end of this cell each one appears as several `<name>_<level>` columns.
CATEGORIES = ['oceanity', 'dispersal_mode', 'BreedSysCode', 'Grime']

# Columns removed before modelling.
# NOTE(review): the command line quoted in the introduction also drops
# `dispmode01`, which is not listed here -- confirm whether that is intended.
DROPVARS = ["Taxon", "migr_sterr_m", "shift + 2SE",
            "signif_shift", "signif_shift2", "DispModeEng"]

# NA handling: 1 drops features, 0 drops samples.
DROPNA = 1

# --- Load the data and split it ---------------------------------------------
data = TraitData.TraitData(DATAFILE, RESPONSE, DROPVARS, CATEGORIES, DROPNA)

# NOTE(review): 0.30 is presumably the fraction held out as the test split --
# confirm against TraitData.train_test_split.
X, x_test, Y, y_test = data.train_test_split(0.30)


# Show the feature columns that remain after preprocessing.
data.X.columns.values

array(['Bio1_mean_nosyn', 'Bio1_std_nosyn', 'Bio1_var_nosyn',
       'Bio1_mean_inclsyn', 'Bio1_std_inclsyn', 'Bio1_var_inclsyn',
       'oceanity_ks', 'oceanity_o', 'oceanity_os', 'oceanity_sks',
       'oceanity_so', 'oceanity_sos', 'dispersal_mode_animal',
       'dispersal_mode_gravity', 'dispersal_mode_water',
       'dispersal_mode_wind', 'BreedSysCode_1.0', 'BreedSysCode_2.0',
       'BreedSysCode_3.0', 'BreedSysCode_4.0', 'Grime_c', 'Grime_cs',
       'Grime_csr', 'Grime_r', 'Grime_s', 'Grime_sr'], dtype=object)

In [27]:
from sklearn.model_selection import KFold, cross_val_score, LeaveOneOut
def kFoldCV(model, features, target, K=5):
    """
        Estimate the mean squared error of `model` with K-fold
        cross-validation (K defaults to 5).

        The scorer used is 'neg_mean_squared_error', i.e. the per-fold
        scores are negated errors; their mean is negated again so that
        the value returned is a positive MSE.
    """
    splitter = KFold(K)
    fold_scores = cross_val_score(model, features, target,
                                  cv=splitter,
                                  scoring='neg_mean_squared_error',
                                  n_jobs=1)
    return -fold_scores.mean()
def LeaveOneOutCV(model, features, target):
    """
        Estimate the mean squared error of `model` with leave-one-out
        cross-validation.

        The scorer used is "neg_mean_squared_error", so the mean of the
        per-split scores is negated to return a positive MSE.
    """
    splitter = LeaveOneOut()
    split_scores = cross_val_score(model, features, target,
                                   cv=splitter,
                                   scoring="neg_mean_squared_error")
    return -split_scores.mean()

In [18]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import scale
from numpy import logspace

# Tune an SVR with default settings over a log-spaced grid of C and gamma,
# selecting the combination with the best 5-fold cross-validated score on the
# (standardised) training split.
baseModel = SVR()

# 13 log-spaced candidates between 1e-3 and 1e3 for each hyper-parameter.
params_grid = {
    'C'     : logspace(-3, 3, 13), 
    'gamma' : logspace(-3, 3, 13) 
}

# error_score must not be 0 here: the scorer is a *negated* error, so 0 is the
# best achievable score and a parameter combination whose fit failed would be
# ranked as perfect. NaN (scikit-learn's documented default) marks the failed
# fit as unusable instead of letting it win the search.
gridSearch = GridSearchCV(baseModel,
                         param_grid = params_grid,
                         scoring="neg_mean_squared_error",
                         error_score = float("nan"),
                         n_jobs = -1,
                         cv=KFold(5))
gridSearch.fit(scale(X), Y)

# Keep the estimator built from the winning hyper-parameters and report them.
bestModel = gridSearch.best_estimator_
print("GridSearch Parameters: ", bestModel.get_params())

GridSearch Parameters:  {'kernel': 'rbf', 'shrinking': True, 'cache_size': 200, 'epsilon': 0.1, 'verbose': False, 'tol': 0.001, 'coef0': 0.0, 'degree': 3, 'gamma': 1000.0, 'max_iter': -1, 'C': 10.0}


In [22]:
# NOTE(review): cross_val_score refits a clone of bestModel on folds of the
# test split, and x_test is standardised independently of X. This therefore
# scores the tuned hyper-parameters, not the estimator fitted during the grid
# search -- confirm that this is the intended benchmark.
kfold_test_mse = kFoldCV(bestModel, scale(x_test), y_test)
print("MSE on test data (5-fold): ", kfold_test_mse)

MSE on test data:  33.9784765896


In [28]:
# NOTE(review): as with the 5-fold figure, the estimator is refit on the test
# split (one sample held out per split) and x_test is standardised
# independently of X -- confirm that this is the intended benchmark.
loo_test_mse = LeaveOneOutCV(bestModel, scale(x_test), y_test)
print("MSE on test data (LOO): ", loo_test_mse)

MSE on test data (LOO):  35.0724109303
