# We will use grid search to find the best candidate model-decision tree classifier- for our data with -gridsearchcv- which uses cross validation
## warning: Grid search is very computationally expensive - use hyperparameter space random search instead if your search space is very complex

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [5]:
titanic_df = pd.read_csv('titanic_processed.csv')

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,19.0,0,0,7.65,0,0,1
1,0,1,1,38.0,0,1,153.4625,0,0,1
2,0,3,1,17.0,0,0,8.6625,0,0,1
3,0,1,1,62.0,0,0,26.55,0,0,1
4,1,1,0,23.0,3,2,263.0,0,0,1


In [7]:
X = titanic_df.drop('Survived', axis=1)

Y = titanic_df['Survived']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [9]:
def summarize_classification(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)

    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print("Test data count: ",len(y_test))
    print("accuracy_count : " , num_acc)
    print("accuracy_score : " , acc)
    print("precision_score : " , prec)
    print("recall_score : ", recall)
    print()

In [11]:
#import GridSearchCV -Grid Search Cross Validation
from sklearn.model_selection import GridSearchCV

#make a dict with different values of your hyperparamters
parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}

#specifiy the estimator -DecisionTreeClassifier()- 
#pass your parameters -parameters- to build different models each with different hyperparamter
#we will use 3-fold cross validation to find the best model, i.e. the training set will split into three sections
#return_train_score=True will compare different models using the train_score accuracy
grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 4}

In [12]:
#make comparison between all models
#Notic the loop counts for how many model you created, in this case we create 6 models
for i in range(6):
    print('Parameters: ', grid_search.cv_results_['params'][i])

    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'max_depth': 2}
Mean Test Score:  0.7750487329434698
Rank:  5
Parameters:  {'max_depth': 4}
Mean Test Score:  0.8154831523252577
Rank:  1
Parameters:  {'max_depth': 5}
Mean Test Score:  0.806720504966119
Rank:  2
Parameters:  {'max_depth': 7}
Mean Test Score:  0.7820941241993874
Rank:  3
Parameters:  {'max_depth': 9}
Mean Test Score:  0.7627494662582383
Rank:  6
Parameters:  {'max_depth': 10}
Mean Test Score:  0.77857606980414
Rank:  4


In [14]:
#let build a model with max_depth: 4
decision_tree_model = DecisionTreeClassifier( \
    max_depth = grid_search.best_params_['max_depth']).fit(x_train, y_train)

In [15]:
y_pred = decision_tree_model.predict(x_test)

In [16]:
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy_count :  118
accuracy_score :  0.8251748251748252
precision_score :  0.8148148148148148
recall_score :  0.7457627118644068



In [17]:
# lets tune the hyperparameters for logistic regression
#we will tune among different penalizers l1, l2 and different values for c
parameters = {'penalty': ['l1', 'l2'], 
              'C': [0.1, 0.4, 0.8, 1, 2, 5]}

grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 0.8, 'penalty': 'l1'}

In [19]:
#make comparison between all models
#Notic the loop counts for how many model you created, in this case we create 12 models

for i in range(12):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7715213960827997
Rank:  12
Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7785946347349856
Rank:  11
Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.7838299452334541
Rank:  10
Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.7838392276988767
Rank:  9
Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7978928803490207
Rank:  1
Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.7908753364893716
Rank:  8
Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.7961292119186855
Rank:  3
Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.7926390049197067
Rank:  5
Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.7978928803490207
Rank:  1
Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.7926390049197067
Rank:  5
Parameters:  {'C': 5, 'penalty': 'l1'}
Mean Test Score:  0.7943933908846189
Rank:  4
Parameters:  {'C': 5, 'penalty': 'l2'}
Mean Test S

In [None]:
#Build a model wi

logistic_model = LogisticRegression(solver='liblinear', \
    penalty=grid_search.best_params_['penalty'], C=grid_search.best_params_['C']). \
    fit(x_train, y_train)