### Use GridSearchCV to compare Ridge() and KNeighborsRegressor() in predicting rings of Abalone dataset. Add several values to one parameter in Ridge and one in KNeighborsRegressor.

In [60]:
import pandas as pd
import numpy as np
from matplotlib import cm, pyplot as plt
%matplotlib inline
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

In [61]:
names = ['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']
df = pd.read_csv('abalone.data',header=None,names=names)
# change M,F and I categorical variables as numerical using 0,1 and 2.
replace_list = {"Sex" : {"M": 0, "F" : 1, "I": 2}}
df.replace(replace_list,inplace=True)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,2,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [62]:
pipe = Pipeline([('classifier', Ridge())])
param_grid = [
    {'classifier': [Ridge()], 'classifier__alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'classifier': [KNeighborsRegressor()], 'classifier__n_neighbors': [2, 4, 6, 8, 10, 13, 16, 19, 25, 30, 35, 40]}
]

In [63]:
X = np.array(df.drop(['Rings'], 1))
y = df['Rings']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

grid = GridSearchCV(pipe, param_grid, cv=5, return_train_score=True)
grid.fit(X_train, y_train)
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'classifier': Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001), 'classifier__alpha': 0.1}

Best cross-validation score: 0.52
Test-set score: 0.54


### The cv_results_ property of a trained GridSearchCV() object provides a lot of information about the evaluation process. Print all mean_test_scores, and the parameter combinations to which they correspond.

In [64]:
results = pd.DataFrame(grid.cv_results_)
results[['mean_test_score', 'param_classifier__alpha', 'param_classifier__n_neighbors']]

Unnamed: 0,mean_test_score,param_classifier__alpha,param_classifier__n_neighbors
0,0.523304,0.001,
1,0.523444,0.01,
2,0.524452,0.1,
3,0.522915,1.0,
4,0.460647,10.0,
5,0.323693,100.0,
6,0.401419,,2.0
7,0.48391,,4.0
8,0.502708,,6.0
9,0.513201,,8.0


### Implement Extreme Learning Machine (next slide) and find the optimal combination of parameters in predicting rings of Abalone dataset. Report test mean absolute error (on a separate test set that you prepared in the beginning); compare to RandomForestRegressor. Mean absolute error is np.abs(y_test - model.predict(X_test)).mean().

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test) 

In [90]:
from sklearn.base import BaseEstimator, TransformerMixin

#Apply non-linear function to new data `X_new = np.tanh(X)`. You need to write a custom Scikit-Learn model part for this.
class ApplyNonLinearFunction(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X[:,0] = np.tanh(X[:,0])
        return X

In [95]:
# Run GridSearchCV that optimizes k (5...1000) and a (0.001 ... 1000).
pipe = make_pipeline(
    GaussianRandomProjection(),
    ApplyNonLinearFunction(),
    Ridge()
)
param_grid = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
'gaussianrandomprojection__n_components': [5, 10, 50, 100, 200, 500, 1000]}

In [96]:
import warnings
warnings.filterwarnings('ignore')

grid = GridSearchCV(pipe, param_grid, cv=5, return_train_score=True)
grid.fit(X_train, y_train)
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'gaussianrandomprojection__n_components': 200, 'ridge__alpha': 0.001}

Best cross-validation score: 0.53
Test-set score: 0.55


In [97]:
np.abs(y_test - grid.predict(X_test)).mean()

1.5533537334107967

In [101]:
from sklearn.ensemble import RandomForestRegressor
# compare to RandomForestRegressor
# Run GridSearchCV that optimizes k (5...1000) and a (0.001 ... 1000).
pipe = make_pipeline(
    GaussianRandomProjection(),
    ApplyNonLinearFunction(),
    RandomForestRegressor()
)
param_grid = {'randomforestregressor__max_depth': [2, 4, 6, 8, 10, 20, 50],
'gaussianrandomprojection__n_components': [5, 10, 50, 100, 200, 500, 1000]}

In [102]:
import warnings
warnings.filterwarnings('ignore')

grid = GridSearchCV(pipe, param_grid, cv=5, return_train_score=True)
grid.fit(X_train, y_train)
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'gaussianrandomprojection__n_components': 1000, 'randomforestregressor__max_depth': 4}

Best cross-validation score: 0.54
Test-set score: 0.56


In [103]:
np.abs(y_test - grid.predict(X_test)).mean()

1.5076471100507645