# Classification Hyperparameter Tuning

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [5]:
import datetime
# Print the current local date and time, as a timestamp for this run of the notebook.
print(datetime.datetime.now())

2021-11-11 16:49:27.033886


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Display the value of every top-level expression in a cell, not only the last one
# (IPython's default is "last_expr"). Cells that have a bare expression in the
# middle, such as `df.head()` followed by more statements, rely on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [7]:
import sklearn
# Report the installed scikit-learn version so the recorded outputs can be tied to a release.
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.22.2.post1.


In [8]:
# Create the 'out' directory to store output images.
# exist_ok=True makes this safe to re-run and avoids the check-then-create race;
# it still raises FileExistsError if 'out' exists but is not a directory, instead
# of silently continuing and failing later when an image is saved.
import os
os.makedirs('out', exist_ok=True)

# Load Data

In [9]:
# Load the German Credit data and recode the label: 'Good' -> 1, 'Bad' -> 0.
df = pd.read_csv('https://raw.githubusercontent.com/stepthom/869_course/main/data/GermanCredit.csv')
df = df.assign(Class=df['Class'].map({'Good': 1, 'Bad': 0}))
df.head()

# The label is kept as a one-column DataFrame; the features are all remaining columns.
y = df[['Class']]
X = df.drop(columns=['Class'])

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

Unnamed: 0,Duration,Amount,InstallmentRatePercentage,ResidenceDuration,Age,NumberExistingCredits,NumberPeopleMaintenance,Telephone,ForeignWorker,Class,CheckingAccountStatus.lt.0,CheckingAccountStatus.0.to.200,CheckingAccountStatus.gt.200,CheckingAccountStatus.none,CreditHistory.NoCredit.AllPaid,CreditHistory.ThisBank.AllPaid,CreditHistory.PaidDuly,CreditHistory.Delay,CreditHistory.Critical,Purpose.NewCar,Purpose.UsedCar,Purpose.Furniture.Equipment,Purpose.Radio.Television,Purpose.DomesticAppliance,Purpose.Repairs,Purpose.Education,Purpose.Vacation,Purpose.Retraining,Purpose.Business,Purpose.Other,SavingsAccountBonds.lt.100,SavingsAccountBonds.100.to.500,SavingsAccountBonds.500.to.1000,SavingsAccountBonds.gt.1000,SavingsAccountBonds.Unknown,EmploymentDuration.lt.1,EmploymentDuration.1.to.4,EmploymentDuration.4.to.7,EmploymentDuration.gt.7,EmploymentDuration.Unemployed,Personal.Male.Divorced.Seperated,Personal.Female.NotSingle,Personal.Male.Single,Personal.Male.Married.Widowed,Personal.Female.Single,OtherDebtorsGuarantors.None,OtherDebtorsGuarantors.CoApplicant,OtherDebtorsGuarantors.Guarantor,Property.RealEstate,Property.Insurance,Property.CarOther,Property.Unknown,OtherInstallmentPlans.Bank,OtherInstallmentPlans.Stores,OtherInstallmentPlans.None,Housing.Rent,Housing.Own,Housing.ForFree,Job.UnemployedUnskilled,Job.UnskilledResident,Job.SkilledEmployee,Job.Management.SelfEmp.HighlyQualified
0,6,1169,4,4,67,2,1,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0
1,48,5951,2,2,22,1,1,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0
2,12,2096,2,3,49,1,2,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0
3,42,7882,2,4,45,1,2,1,1,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0
4,24,4870,3,4,53,2,2,1,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0


In [10]:
# Helper function to show the results of hyperparameter tuning in a compact table.

def cv_results_to_df(cv_results):
    """Summarize a search's ``cv_results_`` mapping as a DataFrame, best score first.

    Parameters
    ----------
    cv_results : mapping
        Must provide the keys ``'params'`` (an iterable of parameter dicts),
        ``'mean_test_score'`` and ``'rank_test_score'``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate and one column per parameter name, followed by
        ``mean_val_score`` (copied from ``'mean_test_score'``) and
        ``rank_val_score`` (copied from ``'rank_test_score'``). Rows are sorted
        by ``mean_val_score`` in descending order and keep their original
        candidate index.

    Only these two score columns are included; other entries of ``cv_results``
    (fit/score times, train scores, score standard deviations) are not copied.
    """
    table = pd.DataFrame(list(cv_results['params'])).assign(
        mean_val_score=cv_results['mean_test_score'],
        rank_val_score=cv_results['rank_test_score'],
    )
    return table.sort_values(by='mean_val_score', ascending=False)

# Decision Trees

In [21]:
# Baseline: entropy-criterion tree capped at depth 5, needing 5 samples to split a node.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    min_samples_split=5,
    random_state=0,
)

# Accuracy from 10-fold cross-validation over the full data set, averaged across folds.
scores = cross_val_score(clf, X, y, scoring="accuracy", cv=10)
print("Mean Accuracy: {:.4f}".format(scores.mean()))

Mean Accuracy: 0.6970


In [22]:
# Same tree as before but with the depth cap raised to 10.
clf2 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_split=5,
    random_state=0,
)

# Accuracy from 10-fold cross-validation over the full data set, averaged across folds.
scores = cross_val_score(clf2, X, y, scoring="accuracy", cv=10)
print("Mean Accuracy: {:.4f}".format(scores.mean()))

Mean Accuracy: 0.7010


In [23]:
# Depth cap of 10 again, now requiring 6 samples before a node may be split.
clf3 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_split=6,
    random_state=0,
)

# Accuracy from 10-fold cross-validation over the full data set, averaged across folds.
scores = cross_val_score(clf3, X, y, scoring="accuracy", cv=10)
print("Mean Accuracy: {:.4f}".format(scores.mean()))

Mean Accuracy: 0.7040


In [None]:
# `clf` was only handed to cross_val_score, which fits clones of the estimator, so
# `clf` itself is unfitted and has no `classes_` attribute (AttributeError on a
# fresh run). Take the sorted unique labels from the target instead; np.unique
# flattens its input, so this works whether `y` is a Series or a one-column DataFrame.
class_names = [str(label) for label in np.unique(y)]

## Hyperparameter Tuning

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid: 2 * 2 * 2 * 4 * 3 * 4 = 384 parameter combinations.
params = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    class_weight=['balanced', None],
    max_depth=[2, 5, 10, 20],
    min_samples_leaf=[1, 5, 10],
    max_features=[0.25, 0.5, 0.75, 1.0],
)

clf = DecisionTreeClassifier(random_state=42)

# Every combination is scored by macro-averaged F1 under 10-fold cross-validation
# on the training split only. The result of fit() is assigned so that the fitted
# estimator's repr is not echoed as cell output.
search = GridSearchCV(estimator=clf, param_grid=params, scoring='f1_macro', cv=10, verbose=1)
search = search.fit(X_train, y_train)

Fitting 10 folds for each of 384 candidates, totalling 3840 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 3840 out of 3840 | elapsed:   32.2s finished


In [None]:
# Predict on the held-out test set through the search object and print per-class
# precision, recall and F1.
# NOTE(review): with GridSearchCV's default refit=True, predict() should use the best
# parameter combination refit on all of X_train — confirm against the sklearn docs.
y_pred = search.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.95      0.61        59
           1       0.96      0.51      0.67       141

    accuracy                           0.64       200
   macro avg       0.70      0.73      0.64       200
weighted avg       0.81      0.64      0.65       200



In [None]:
# Parameter combination that achieved the best mean cross-validated score (f1_macro) in the grid search.
search.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': 0.5,
 'min_samples_leaf': 10,
 'splitter': 'random'}

In [None]:
# Table of every grid-search candidate, sorted by mean validation score (best first).
cv_results_to_df(search.cv_results_)

Unnamed: 0,class_weight,criterion,max_depth,max_features,min_samples_leaf,splitter,mean_val_score,rank_val_score
35,balanced,gini,5,0.50,10,random,0.654202,1
131,balanced,entropy,5,0.50,10,random,0.650364,2
129,balanced,entropy,5,0.50,5,random,0.647728,3
280,,gini,20,0.75,10,best,0.646933,4
264,,gini,20,0.25,1,best,0.646246,5
...,...,...,...,...,...,...,...,...
194,,gini,2,0.25,5,best,0.411329,373
196,,gini,2,0.25,10,best,0.411329,373
199,,gini,2,0.50,1,random,0.411329,373
292,,entropy,2,0.25,10,best,0.411329,373


### Random Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

clf = DecisionTreeClassifier(random_state=42)

# Lists are sampled uniformly; scipy distributions are sampled once per candidate.
# randint's upper bound is exclusive, so max_depth covers 2..20 and
# min_samples_leaf covers 1..10. uniform(loc=0.0, scale=1.0) draws the fraction
# of features considered at each split from [0, 1].
params = {"criterion": ["gini", "entropy"],
          "splitter": ["best", "random"],
          "class_weight": ['balanced', None],
          "max_depth": randint(2, 21),
          "min_samples_leaf": randint(1, 11),
          "max_features": uniform(0.0, 1.0)}

# random_state fixes which 1000 candidates are drawn. Without it every run samples
# a different set, so the best parameters and the reported scores are not reproducible.
search = RandomizedSearchCV(clf, param_distributions=params, n_iter=1000, scoring='f1_macro',
                            cv=10, verbose=1, random_state=42)
search = search.fit(X_train, y_train)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 10000 out of 10000 | elapsed:  1.4min finished


In [None]:
# Predict on the held-out test set through the search object and print per-class
# precision, recall and F1.
# NOTE(review): with RandomizedSearchCV's default refit=True, predict() should use the best
# sampled parameter setting refit on all of X_train — confirm against the sklearn docs.
y_pred = search.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.53      0.53        59
           1       0.80      0.80      0.80       141

    accuracy                           0.72       200
   macro avg       0.66      0.66      0.66       200
weighted avg       0.72      0.72      0.72       200



In [None]:
# Sampled parameter setting that achieved the best mean cross-validated score (f1_macro) in the random search.
search.best_params_

{'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 11,
 'max_features': 0.5712404327863612,
 'min_samples_leaf': 6,
 'splitter': 'random'}

In [None]:
# Table of every randomly sampled candidate, sorted by mean validation score (best first).
cv_results_to_df(search.cv_results_)

Unnamed: 0,class_weight,criterion,max_depth,max_features,min_samples_leaf,splitter,mean_val_score,rank_val_score
771,,entropy,11,0.571240,6,random,0.672105,1
461,,gini,20,0.450158,10,best,0.670830,2
751,,entropy,19,0.234783,9,random,0.658955,3
545,,gini,9,0.476848,5,best,0.655618,4
332,balanced,gini,13,0.478803,4,best,0.655055,5
...,...,...,...,...,...,...,...,...
98,,gini,2,0.005240,10,random,0.411329,989
221,,entropy,3,0.022226,7,best,0.410893,997
544,,gini,4,0.006804,7,random,0.410451,998
496,,entropy,3,0.025392,3,random,0.410438,999
