In [None]:
# Chapter 12. Model Selection

In [6]:
# 12.1 Selecting the Best Models Using Exhaustive Search

import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
# load data
iris = datasets.load_iris()
features, target = iris.data, iris.target

# create logistic regression
logistic = linear_model.LogisticRegression(max_iter=500, solver='liblinear')

# create range of candidate penalty hyperparameter values
penalty = ['l1', 'l2']

# # create range of candidate regularization hyperparameter values
C = np.logspace(0, 4, 10)

# Create dictionary of hyperparameter candidates
hyperparameters = dict(C=C, penalty=penalty)

# create grid search
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

# fit grid search
best_model = gridsearch.fit(features, target)

# show the best model
print(best_model.best_estimator_)

LogisticRegression(C=np.float64(7.742636826811269), max_iter=500, penalty='l1',
                   solver='liblinear')


In [7]:
np.logspace(0, 4, 10)

array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04])

In [10]:
# view best hyperparameters
print('best penalty: ', best_model.best_estimator_.get_params()['penalty'])
print('best C: ', best_model.best_estimator_.get_params()['C'])

best penalty:  l1
best C:  7.742636826811269


In [11]:
# predict target vector
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [22]:
# 12.2 Selecting the Best Models Using Randomized Search
from scipy.stats import uniform
from sklearn import linear_model, datasets
from sklearn.model_selection import RandomizedSearchCV

iris = datasets.load_iris()
features, target = iris.data, iris.target

# create logistic regression
logistic = linear_model.LogisticRegression(max_iter=500, solver='liblinear')

# Create range of candidate regularization penalty hyperparameter values
penalty = ['l1','l2']

# Create distribution of candidate regularization hyperparameter values
C = uniform(loc=0, scale=4)
# C =uniform(loc=0, scale=4).rvs(10)

# create a hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# create randomize search
randomizedsearch = RandomizedSearchCV(logistic, hyperparameters, random_state=1, 
                                      n_iter=100, cv=5, verbose=0, n_jobs=-1)

# Fit randomized search
best_model = randomizedsearch.fit(features, target)

print(best_model.best_estimator_)

LogisticRegression(C=np.float64(1.668088018810296), max_iter=500, penalty='l1',
                   solver='liblinear')


In [23]:
print('best penalty: ', best_model.best_estimator_.get_params()['penalty'])
print('best C: ', best_model.best_estimator_.get_params()['C'])

best penalty:  l1
best C:  1.668088018810296


In [24]:
uniform(loc=0, scale=4)

<scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x73fa9549e290>

In [25]:
# Define a uniform distribution between 0 and 4, sample 10 values
uniform(loc=0, scale=4).rvs(10)

array([1.87180442, 1.81167887, 0.40267542, 2.422375  , 1.66870518,
       1.22044736, 1.58645945, 1.57654496, 0.58810845, 3.97300242])

In [26]:
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [27]:
# 12.3 Selecting the Best Models from Multiple Learning Algorithms

import numpy as np 
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# set random seed 
np.random.seed(0)
 
iris = datasets.load_iris()
features, target = iris.data, iris.target

# create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])

# Create dictionary with candidate learning algorithms and their hyperparameter
search_space =[{"classifier": [LogisticRegression(max_iter=500, solver='liblinear')],
                "classifier__penalty": ['l1','l2'],
                "classifier__C": np.logspace(0, 4 , 10)},
               {"classifier": [RandomForestClassifier()],
                "classifier__n_estimators": [10, 100, 1000],
                "classifier__max_features": [1, 2, 3]}]

# create grid search
gridsearch = GridSearchCV(pipe, search_space, cv=5, verbose=0)

best_model = gridsearch.fit(features, target)

print(best_model.best_estimator_)

Pipeline(steps=[('classifier',
                 LogisticRegression(C=np.float64(7.742636826811269),
                                    max_iter=500, penalty='l1',
                                    solver='liblinear'))])


In [28]:
# view best model
print(best_model.best_estimator_.get_params()['classifier'])

LogisticRegression(C=np.float64(7.742636826811269), max_iter=500, penalty='l1',
                   solver='liblinear')


In [29]:
# predict target vector
best_model.predict(features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [34]:
# 12.4 Selecting the Best Models When Preprocessing
import numpy as np 
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# set random seed
np.random.seed(0)

iris =datasets.load_iris()
features, target = iris.data, iris.target

# Create a preprocessing object that includes StandardScaler features and PCA
preprocess =  FeatureUnion([("std", StandardScaler()), ("pca", PCA())])

# create a pipeline
pipe = Pipeline([("preprocess", preprocess),
                 ("classifier", LogisticRegression(max_iter=1000, solver='liblinear'))])

# Create space of candidate values
search_space = [{"preprocess__pca__n_components": [1, 2, 3],
                 "classifier__penalty": ['l1','l2'],
                 "classifier__C": np.logspace(0, 4, 10)}]

# create grid search
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)

# fit grid search
best_model = clf.fit(features, target)

print(best_model.best_estimator_)

Pipeline(steps=[('preprocess',
                 FeatureUnion(transformer_list=[('std', StandardScaler()),
                                                ('pca', PCA(n_components=1))])),
                ('classifier',
                 LogisticRegression(C=np.float64(7.742636826811269),
                                    max_iter=1000, penalty='l1',
                                    solver='liblinear'))])


In [35]:
best_model.best_estimator_.get_params()['preprocess__pca__n_components']

1

In [None]:
# 12.5 Speeding Up Model Selection with Parallelization
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

iris = datasets.load_iris()
features, target = iris.data, iris.target
 
# create logistic regression
logistic = linear_model.LogisticRegression(max_iter=500, solver='liblinear')

# Create range of candidate regularization penalty hyperparameter values
penalty = ["l1", "l2"]

# Create range of candidate values for c , 1000 numbers between 10^0 to 10^4
C = np.logspace(0, 4, 1000)

# create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# create grid search
# or we can use n_jobs = 1  to use one core of computer
# gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs= 1, verbose=1)
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose=1)

# fit grid search
best_model = gridsearch.fit(features, target)

print(best_model.best_estimator_)


Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


LogisticRegression(C=np.float64(5.926151812475554), max_iter=500, penalty='l1',
                   solver='liblinear')


In [None]:
# 12.6 Speeding Up Model Selection Using Algorithm Specific Methods
from sklearn import linear_model, datasets

iris = datasets.load_iris()
features, target = iris.data, iris.target

# Create cross-validated logistic regression
# Cs as list contains the candidate hyperparameter values to select from
# CS as integer generates a list of that number of candidate values
logit = linear_model.LogisticRegressionCV(Cs=100, max_iter=500, solver='liblinear')

# train model 
logit.fit(features, target)

print(logit)

LogisticRegressionCV(Cs=100, max_iter=500, solver='liblinear')


In [None]:
# 12.7 Evaluating Performances After Model Selection
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV, cross_val_score

iris = datasets.load_iris()
features, target = iris.data, iris.target

# create a logistic regression
logistic = linear_model.LogisticRegression(max_iter=500, solver='liblinear')

# Create range of 20 candidate values for C
C = np.logspace(0, 4, 20)

# Create hyperparameter options
hyperparameters = dict(C=C)

gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose=0)

# Conduct nested cross-validation and output the average score
cross_val_score(gridsearch, features, target).mean()

np.float64(0.9733333333333334)

In [9]:
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=1)

best_model = gridsearch.fit(features, target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [10]:
scores = cross_val_score(gridsearch, features, target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
