# Milestone 4
Grant Perkins

## Abstract
## Overview and Motivation
## Related Work
## Initial Questions
## Data
## Exploratory Data Analysis
## Model Revision
## Full Analysis


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.svm
import sklearn.metrics
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.ensemble
from sklearn.pipeline import Pipeline
import sklearn.model_selection

from dataset import MBTADataset

In [2]:
window_size = 14
dataset = MBTADataset(window_size)
train_X, train_y, test_X, test_y = dataset.make_sklearn_dataset()

1429 samples in train set
358 samples in test set


In [3]:
# make list of scalers, decompositions, and models for gridsearch
scalers = {"standard_scaler": sklearn.preprocessing.StandardScaler(),
           "minmax_scaler": sklearn.preprocessing.MinMaxScaler(),
           "normal_scaler": sklearn.preprocessing.Normalizer(),
           "none": None}

decompositions = {"pca": [sklearn.decomposition.PCA(), {"n_components": [*range(14, 70 + 1, 14)]}],
                  "none": None}

models = {"svm": [sklearn.svm.SVR(max_iter=1000), {"kernel": ["linear", "poly", "rbf", "sigmoid"]}],
          "rf": [sklearn.ensemble.RandomForestRegressor(),
                 {"n_estimators": [*range(25, 100 + 1, 25)], "criterion": ["squared_error", "poisson"]}]}

In [6]:
# make all combinations of scalers, decompositions, and models.
# do grid search on these iterations.
# find best combo with best params.

print("Beginning exhaustive search...")
best = {"score": 100, "steps": None, "parameters": None}
for scaler_name, scaler in scalers.items():
    for decomposition_name, decomposition_data in decompositions.items():
        for model_name, model_data in models.items():
            print(f"Starting scaler={scaler_name}, decomposition={decomposition_name}, model={model_name}")
            param_grid = {}
            steps = []
            # use a scaler?
            if scaler_name != "none":
                steps.append((scaler_name, scaler))
            # use a decomposition?
            if decomposition_name != "none":
                decomposition, params = decomposition_data
                # add all parameters to param grid
                for param_name, args in params.items():
                    param_grid.update({f"{decomposition_name}__{param_name}": args})
                steps.append((decomposition_name, decomposition))
            model, params = model_data
            for param_name, args in params.items():
                # add all parameters to param grid
                param_grid.update({f"{model_name}__{param_name}": args})
            steps.append((model_name, model))

            pipe = Pipeline(steps=steps)

            search = sklearn.model_selection.GridSearchCV(pipe, param_grid, n_jobs=5, scoring="neg_mean_squared_error",
                                                          error_score="raise")
            search.fit(train_X, train_y)
            print(f"Average MSE across 5 folds:{-search.best_score_:0.9f}")
            print("Parameters used:", search.best_params_)
            # using negative MSE, we try to maximize score
            if search.best_score_ < best["score"]:
                best["score"] = -search.best_score_
                best["steps"] = steps
                best["parameters"] = param_grid
                print("NEW BEST FOUND")
            print()
print("Best found after search.")
print(best)

Beginning exhaustive search...
Starting scaler=standard_scaler, decomposition=pca, model=svm
Average MSE across 5 folds:0.004651216
Parameters used: {'pca__n_components': 28, 'svm__kernel': 'rbf'}
NEW BEST FOUND

Starting scaler=standard_scaler, decomposition=pca, model=rf
Average MSE across 5 folds:0.004757600
Parameters used: {'pca__n_components': 28, 'rf__criterion': 'squared_error', 'rf__n_estimators': 25}
NEW BEST FOUND

Starting scaler=standard_scaler, decomposition=none, model=svm
Average MSE across 5 folds:0.005212253
Parameters used: {'svm__kernel': 'rbf'}
NEW BEST FOUND

Starting scaler=standard_scaler, decomposition=none, model=rf
Average MSE across 5 folds:0.003219301
Parameters used: {'rf__criterion': 'squared_error', 'rf__n_estimators': 100}
NEW BEST FOUND

Starting scaler=minmax_scaler, decomposition=pca, model=svm
Average MSE across 5 folds:0.004668959
Parameters used: {'pca__n_components': 28, 'svm__kernel': 'rbf'}
NEW BEST FOUND

Starting scaler=minmax_scaler, decompo