## Pickling

 - technique that allows you to serialize and export your models (or any other data)
 - useful way to store your work after long training jobs

In [7]:
# import model, define X & y
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso

# load data
df = pd.read_csv('../Data/housing.csv')
X  = df.iloc[:, :-1]
y  = df['PRICE']

In [19]:
# initialize model & fit
lasso = Lasso()
lasso.fit(X, y)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [8]:
# this will allow us to export our model and re-use it elsewhere
# see the other untitled notebook to see how you import it
import pickle

with open('mod.pkl', 'wb') as mod:
    pickle.dump(lasso, mod)

## Grid Search 

 - way to use brute force to search for all parameters of your model
 - useful step to use after EDA and feature engineering

In [1]:
# load in GridSearch module, and RandomForest
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [2]:
# this dictionary contains values to test for each parameter, which is listed as a key
grid_params = {
    'max_features': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    'n_estimators': [10, 50, 100],
    'min_samples_leaf': [1, 5, 10, 25, 50]
}

In [3]:
# initialize, and check params
rfc = RandomForestRegressor(n_jobs=-1)
rfc

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [5]:
# arguments:  estimators to use, number of folds for cross_validation, and your dictionary w/ 
# parameters
Grid = GridSearchCV(estimator=rfc, cv=5, param_grid=grid_params)

In [8]:
# And fit -- this could take awhile!
Grid.fit(X, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_features': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], 'n_estimators': [10, 50, 100], 'min_samples_leaf': [1, 5, 10, 25, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [9]:
# this will return the parameters that gave the best results
Grid.best_params_

{'max_features': 0.4, 'min_samples_leaf': 5, 'n_estimators': 10}

In [10]:
# this returns a dictionary, with values generated for each version of the model you tested
Grid.cv_results_



{'mean_fit_time': array([0.83724227, 0.21048903, 0.22969542, 0.06480899, 0.16377511,
        0.25801964, 0.0668108 , 0.18093004, 0.20382123, 0.06317234,
        0.13835578, 0.18642378, 0.06598787, 0.15194135, 0.19569345,
        0.05262017, 0.18184648, 0.25064316, 0.0640862 , 0.18930268,
        0.24073834, 0.06876965, 0.19414353, 0.22572904, 0.06597981,
        0.17895441, 0.21836796, 0.08984704, 0.22968621, 0.29385047,
        0.07965517, 0.23717022, 0.26549172, 0.07577343, 0.20302234,
        0.22279639, 0.07743554, 0.18238649, 0.22910647, 0.06780834,
        0.15225301, 0.19405203, 0.06445837, 0.17473865, 0.22171121,
        0.07068963, 0.2049458 , 0.25469198, 0.07188663, 0.20834742,
        0.26323943, 0.07095633, 0.17486358, 0.23054523, 0.06065135,
        0.17780991, 0.2877068 , 0.06567936, 0.18698754, 0.21966581,
        0.07330737, 0.2612042 , 0.30930934, 0.07570605, 0.20277629,
        0.24231257, 0.08063164, 0.22189975, 0.26993065, 0.08141913,
        0.19868975, 0.25981455,

In [11]:
# convenient to load this into a dataframe, to better view its results
grid_results = pd.DataFrame(Grid.cv_results_)
grid_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.837242,1.532662,0.137253,0.047355,0.2,1,10,"{'max_features': 0.2, 'min_samples_leaf': 1, '...",0.664692,0.714682,...,0.374030,0.490833,81,0.975065,0.967663,0.956488,0.979908,0.969051,0.969635,0.007901
1,0.210489,0.010772,0.124109,0.007875,0.2,1,50,"{'max_features': 0.2, 'min_samples_leaf': 1, '...",0.725826,0.791020,...,0.545084,0.283051,50,0.978808,0.974994,0.970690,0.984484,0.982326,0.978260,0.004971
2,0.229695,0.009903,0.118321,0.006714,0.2,1,100,"{'max_features': 0.2, 'min_samples_leaf': 1, '...",0.690542,0.758533,...,0.535446,0.236550,52,0.979991,0.978241,0.975416,0.984952,0.981163,0.979953,0.003161
3,0.064809,0.016709,0.123606,0.008611,0.2,5,10,"{'max_features': 0.2, 'min_samples_leaf': 5, '...",0.552088,0.762288,...,0.512766,0.172448,63,0.874961,0.871851,0.782946,0.896750,0.843260,0.853954,0.039369
4,0.163775,0.027925,0.120218,0.002105,0.2,5,50,"{'max_features': 0.2, 'min_samples_leaf': 5, '...",0.672767,0.751675,...,0.492090,0.215501,71,0.881289,0.851465,0.856600,0.905478,0.877454,0.874457,0.019316
5,0.258020,0.040480,0.115124,0.002950,0.2,5,100,"{'max_features': 0.2, 'min_samples_leaf': 5, '...",0.669864,0.740851,...,0.473058,0.236209,73,0.883017,0.863584,0.838940,0.912063,0.876536,0.874828,0.023961
6,0.066811,0.007537,0.124494,0.007874,0.2,10,10,"{'max_features': 0.2, 'min_samples_leaf': 10, ...",0.630756,0.697546,...,0.494386,0.151891,70,0.798015,0.784666,0.717407,0.864836,0.735879,0.780161,0.051784
7,0.180930,0.033875,0.125031,0.007291,0.2,10,50,"{'max_features': 0.2, 'min_samples_leaf': 10, ...",0.650407,0.714605,...,0.431585,0.249814,78,0.819102,0.783502,0.765602,0.850413,0.789898,0.801703,0.029831
8,0.203821,0.014191,0.118143,0.005446,0.2,10,100,"{'max_features': 0.2, 'min_samples_leaf': 10, ...",0.612149,0.684267,...,0.467557,0.154338,74,0.801168,0.790448,0.744392,0.868705,0.804875,0.801918,0.039776
9,0.063172,0.001363,0.115274,0.001780,0.2,25,10,"{'max_features': 0.2, 'min_samples_leaf': 25, ...",0.502506,0.670466,...,0.263809,0.309177,99,0.686664,0.678674,0.545963,0.749778,0.638056,0.659827,0.067239


In [12]:
# sort the rows by their highest test score
grid_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
33,0.075773,0.007419,0.127579,0.012758,0.4,5,10,"{'max_features': 0.4, 'min_samples_leaf': 5, '...",0.759879,0.844738,...,0.673682,0.153816,1,0.911802,0.909605,0.892976,0.936575,0.915583,0.913308,0.013971
46,0.204946,0.014164,0.120121,0.007557,0.5,1,50,"{'max_features': 0.5, 'min_samples_leaf': 1, '...",0.755386,0.863149,...,0.666336,0.184254,2,0.981631,0.978837,0.979893,0.986491,0.980832,0.981537,0.002647
47,0.254692,0.012598,0.115289,0.006896,0.5,1,100,"{'max_features': 0.5, 'min_samples_leaf': 1, '...",0.763938,0.860562,...,0.659012,0.180616,3,0.984083,0.983177,0.980008,0.987489,0.983567,0.983665,0.002384
49,0.208347,0.022103,0.115135,0.005718,0.5,5,50,"{'max_features': 0.5, 'min_samples_leaf': 5, '...",0.752754,0.856948,...,0.658619,0.155379,4,0.918081,0.913259,0.898575,0.945113,0.908076,0.916621,0.015645
62,0.309309,0.045359,0.117833,0.007485,0.6,1,100,"{'max_features': 0.6, 'min_samples_leaf': 1, '...",0.769179,0.866195,...,0.658172,0.185553,5,0.985434,0.980681,0.979287,0.987170,0.981918,0.982898,0.002953
90,0.067919,0.005169,0.123010,0.004263,0.8,1,10,"{'max_features': 0.8, 'min_samples_leaf': 1, '...",0.756370,0.861020,...,0.658000,0.161479,6,0.979567,0.969565,0.971386,0.979468,0.974174,0.974832,0.004098
65,0.242313,0.006617,0.115801,0.005669,0.6,5,100,"{'max_features': 0.6, 'min_samples_leaf': 5, '...",0.759523,0.869465,...,0.657166,0.174786,7,0.923424,0.916723,0.897542,0.949119,0.916960,0.920753,0.016620
76,0.240963,0.031615,0.122963,0.009887,0.7,1,50,"{'max_features': 0.7, 'min_samples_leaf': 1, '...",0.758610,0.862853,...,0.656292,0.182968,8,0.982920,0.978571,0.978699,0.986612,0.983420,0.982045,0.003059
61,0.261204,0.030451,0.116824,0.005366,0.6,1,50,"{'max_features': 0.6, 'min_samples_leaf': 1, '...",0.772119,0.865476,...,0.653013,0.177449,9,0.984468,0.983222,0.978606,0.986031,0.980642,0.982594,0.002662
77,0.283978,0.024461,0.122646,0.008844,0.7,1,100,"{'max_features': 0.7, 'min_samples_leaf': 1, '...",0.766007,0.867278,...,0.650958,0.185087,10,0.983763,0.982046,0.978489,0.987217,0.983413,0.982986,0.002821


## Interpreting Grid Results:

 - Often very different model results will give very similar results, making it hard to tell how much of the best results were due to small chance
 - It's often useful to do data exploration on your cv results, to better understand what's driving them
 - Common things to look for:
  - general patterns for what parameter value drives model results, vs. just looking at the highest ranking version
  - useful cutoff points in parameters that give you similar to results, to best figure out how to reduce future fitting time

In [33]:
# how does the n_estimators argument effect accuracy?
grid_results.groupby('param_n_estimators')['mean_test_score'].mean()

param_n_estimators
10     0.495344
50     0.520959
100    0.518378
Name: mean_test_score, dtype: float64

In [13]:
# and max features?  -- notice how there's a sharp cutoff @ 0.4?  
# this would be a sensible way to train your tree on future versions of your data if performance
# is an issue
grid_results.groupby('param_max_features')['mean_test_score'].mean()

param_max_features
0.2    0.382335
0.3    0.471250
0.4    0.531868
0.5    0.552771
0.6    0.544758
0.7    0.539657
0.8    0.545608
Name: mean_test_score, dtype: float64

## Pipelines

 - separate module in Scikit Learn that allows you to chain together different processing functions
 - Useful for modularizing log-transforms, scaling, and other repetitive data processing methods
 - can be used with anything in scikit-learn that uses the transformer api
  - ie, anything that uses the `transform`, and `fit_transform` methods
 - also makes it easier to build models off the same dataset without changing the original
  - ie, use a scaler & function transformer w/ Lasso, so you can fit a random forest on the original dataset

In [16]:
# import scaler and the pipeline function
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [17]:
# initialize
sc = StandardScaler()

In [21]:
# load both the scaler and lasso into the make_pipeline function
pipe = make_pipeline(sc, lasso)

In [22]:
# returns information about all of its individual sub-components
pipe

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [24]:
# fit it like you normally would -- this will scale the data automatically
pipe.fit(X, y)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [25]:
# importantly -- this will invoke all the processing steps before predicting!
pipe.predict(X)

  Xt = transform.transform(Xt)


array([29.50642173, 25.29187532, 30.77520395, 30.03821235, 29.43160435,
       26.71405241, 23.68929336, 20.97261853, 13.50664087, 21.32119187,
       21.11068147, 23.26305697, 21.5745648 , 21.91653509, 21.45790963,
       21.36514617, 22.67691442, 18.87110923, 18.09304666, 19.54656433,
       14.07338121, 19.20315592, 17.46722858, 15.61460268, 17.82141528,
       16.28041677, 18.09777783, 17.63563011, 21.75440956, 22.83834076,
       13.80885512, 19.97781172, 11.93381383, 15.87061896, 16.18996087,
       22.26755867, 21.01427882, 22.39870055, 22.1647288 , 28.04540917,
       30.86598651, 28.69181712, 25.88275245, 25.25750796, 23.65051871,
       21.84161508, 20.28668284, 18.9120323 , 10.51585273, 18.55719878,
       21.99924592, 24.58035353, 28.1782064 , 24.63058019, 18.37026561,
       30.57921513, 27.12942798, 31.0659274 , 24.16478609, 22.16223876,
       19.48769642, 19.68293983, 25.44389982, 25.24832713, 27.96924784,
       28.06173263, 23.35077478, 23.02471218, 19.44977056, 22.71

In [45]:
# this means you could take your random forest, and use it like you normally would, without having
# to create a separate dataset
rfc.fit(X, y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)