In [15]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import gc
import data_utils as dutils
import math
from pathlib import Path
from datetime import datetime
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import GridSearchCV, ShuffleSplit

sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")

%matplotlib inline

In [5]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def get_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_val), y_val),
                m.score(X_train, y_train), m.score(X_val, y_val)]
    return res

In [6]:
df = dutils.loadData()
X_train, y_train, X_val, y_val = dutils.getTrainTestSets(df)

In [16]:
clf = BayesianRidge(compute_score=True)
parameters = {
    "n_iter": [30, 50, 100, 200, 300],
    "alpha_1": [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    "alpha_2": [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    "lambda_1": [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    "lambda_2": [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],    
    }
cv = ShuffleSplit(test_size=0.01, n_splits=1)
grid = GridSearchCV(BayesianRidge(compute_score=True), parameters, cv=cv, verbose=1, return_train_score=True)
grid.fit(X_train, y_train)

Fitting 1 folds for each of 20480 candidates, totalling 20480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 20480 out of 20480 | elapsed:  2.1min finished


GridSearchCV(cv=ShuffleSplit(n_splits=1, random_state=None, test_size=0.01, train_size=None),
             estimator=BayesianRidge(compute_score=True),
             param_grid={'alpha_1': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001,
                                     0.01, 0.1],
                         'alpha_2': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001,
                                     0.01, 0.1],
                         'lambda_1': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001,
                                      0.01, 0.1],
                         'lambda_2': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001,
                                      0.01, 0.1],
                         'n_iter': [30, 50, 100, 200, 300]},
             return_train_score=True, verbose=1)

In [25]:
grid.best_params_

{'alpha_1': 1e-08,
 'alpha_2': 0.1,
 'lambda_1': 0.1,
 'lambda_2': 1e-08,
 'n_iter': 30}

In [20]:
preds,std = grid.best_estimator_.predict(X_val, return_std=True)

In [21]:
grid.best_estimator_.score(X_val, y_val)

0.9998175933593976

In [22]:
get_score(grid.best_estimator_)

[0.05474545913643631,
 0.03494988221706945,
 0.9999847996538863,
 0.9998175933593976]

In [24]:
preds, y_val

(array([ 95.92288006,  96.34324094,  94.63926359,  93.42402384,
         93.55860575,  92.22712107,  93.33400038,  93.48145113,
         91.22687618,  89.91445115,  91.03108406,  91.92502802,
         90.14447116,  92.62571746,  92.01708136,  91.97641011,
         94.20978877,  93.6862123 ,  94.82054703,  95.60344819,
         95.33853151,  94.53516676,  94.98980459,  96.68476328,
         96.62891034,  98.07899899,  97.45258672,  97.24784389,
         95.67853099,  91.60011057,  92.82771742,  90.69487939,
         92.36140523,  94.18228887,  96.9768268 ,  97.19759717,
         99.54254153, 101.00771928]),
 array([ 95.91,  96.37,  94.64,  93.4 ,  93.67,  92.17,  93.3 ,  93.5 ,
         91.2 ,  89.92,  91.05,  91.91,  90.11,  92.64,  92.03,  91.95,
         94.27,  93.65,  94.8 ,  95.59,  95.34,  94.54,  95.  ,  96.67,
         96.65,  98.06,  97.44,  97.23,  95.68,  91.56,  92.8 ,  90.66,
         92.26,  94.16,  96.95,  97.19,  99.5 , 101.  ]))