## Generate regression problem, scaling, split the data, create evaluation function

In [1]:
import sklearn
import matplotlib
from sklearn.datasets import make_regression
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler

In [2]:
X_initial, y = make_regression(n_samples=10000, n_features=100, n_informative=10, noise=0.5)
scaler = StandardScaler()
X = scaler.fit_transform(X_initial)

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [4]:
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    evs = explained_variance_score(test_labels, predictions, multioutput='raw_values')
    mse = mean_squared_error(test_labels, predictions, multioutput='raw_values')
    R2_score = r2_score(test_labels, predictions)
    print('explained variance score = {}.'.format(evs))
    print('mse = {}.'.format(mse))
    print('r2 score = {:0.2f}%.'.format(R2_score))
    return "Evaluation ended"

# Random Forest

## Random Search of parameters for Random forest

In [5]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 60, stop = 100, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features}
print(random_grid)

{'n_estimators': [60, 64, 68, 73, 77, 82, 86, 91, 95, 100], 'max_features': ['auto', 'sqrt']}


  from numpy.core.umath_tests import inner1d


In [6]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] n_estimators=60, max_features=auto ..............................
[CV] n_estimators=60, max_features=auto ..............................
[CV] n_estimators=60, max_features=auto ..............................
[CV] n_estimators=64, max_features=auto ..............................
[CV] ............... n_estimators=60, max_features=auto, total=  31.5s
[CV] n_estimators=64, max_features=auto ..............................
[CV] ............... n_estimators=60, max_features=auto, total=  31.7s
[CV] n_estimators=64, max_features=auto ..............................
[CV] ............... n_estimators=60, max_features=auto, total=  31.7s
[CV] n_estimators=68, max_features=auto ..............................
[CV] ............... n_estimators=64, max_features=auto, total=  34.0s
[CV] n_estimators=68, max_features=auto ..............................
[CV] ............... n_estimators=64, max_features=auto, total=  38.6s
[CV] n_estimator

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.2min


[CV] ............... n_estimators=64, max_features=sqrt, total=   5.3s
[CV] n_estimators=68, max_features=sqrt ..............................
[CV] ............... n_estimators=68, max_features=sqrt, total=   6.0s
[CV] n_estimators=68, max_features=sqrt ..............................
[CV] ............... n_estimators=68, max_features=sqrt, total=   6.0s
[CV] n_estimators=73, max_features=sqrt ..............................
[CV] ............... n_estimators=68, max_features=sqrt, total=   5.8s
[CV] n_estimators=73, max_features=sqrt ..............................
[CV] ............... n_estimators=73, max_features=sqrt, total=   6.3s
[CV] n_estimators=73, max_features=sqrt ..............................
[CV] ............... n_estimators=73, max_features=sqrt, total=   6.1s
[CV] n_estimators=77, max_features=sqrt ..............................
[CV] ............... n_estimators=73, max_features=sqrt, total=   6.0s
[CV] n_estimators=77, max_features=sqrt ..............................
[CV] .

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  7.3min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'n_estimators': [60, 64, 68, 73, 77, 82, 86, 91, 95, 100], 'max_features': ['auto', 'sqrt']},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

## Get best parameters from random search

In [7]:
rf_random.best_params_

{'max_features': 'auto', 'n_estimators': 95}

## Create regressor based on the best parameters

In [8]:
regressor = RandomForestRegressor(max_features='auto', n_estimators=95)
regressor.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=95, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

# Ridge regressor

In [9]:
from sklearn.linear_model import RidgeCV
clf = RidgeCV(alphas=[0.1, 1.0, 10.0])

In [10]:
clf.fit(X_train, y_train) 

RidgeCV(alphas=[0.1, 1.0, 10.0], cv=None, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=False)

## Look best alpha

In [11]:
clf.alpha_

0.1

## Evaluate both models

In [12]:
print("RANDOM FOREST")
print(evaluate(regressor, X_test, y_test))
print()
print("RIDGE REGRESSOR")
print(evaluate(clf, X_test, y_test))

RANDOM FOREST
explained variance score = [0.86989171].
mse = [3999.4293343].
r2 score = 0.87%.
Evaluation ended

RIDGE REGRESSOR
explained variance score = [0.99999168].
mse = [0.25571225].
r2 score = 1.00%.
Evaluation ended


## Results

The winner is ridge regression with alpha 0.1.