In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
titanic_df = pd.read_csv('titanic/titanic_processed.csv')

In [3]:
X = titanic_df.drop('Survived', axis=1)
Y = titanic_df["Survived"]

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [8]:
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize=True)
    acc_count = accuracy_score(y_test, y_pred, normalize=False)

    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print('Test data count: ', len(y_test))
    print('accuracy count: ', acc_count)
    print('accuracy score: ', acc)
    print('precision score: ', prec)
    print('recall score: ',recall)
    print()


In [9]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':[2,4,5,7,9,10]}

grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 7}

In [11]:
for i in range(6):
    print("parameters", grid_search.cv_results_['params'][i])
    print("Mean Test Score", grid_search.cv_results_['mean_test_score'][i])
    print("Rank", grid_search.cv_results_['rank_test_score'][i])

parameters {'max_depth': 2}
Mean Test Score 0.7890652557319223
Rank 6
parameters {'max_depth': 4}
Mean Test Score 0.8242457996843963
Rank 2
parameters {'max_depth': 5}
Mean Test Score 0.8207741576162628
Rank 4
parameters {'max_depth': 7}
Mean Test Score 0.827791701475912
Rank 1
parameters {'max_depth': 9}
Mean Test Score 0.822537826046598
Rank 3
parameters {'max_depth': 10}
Mean Test Score 0.8190012067205049
Rank 5


In [15]:
decision_tree_model = DecisionTreeClassifier( 
    max_depth = grid_search.best_params_['max_depth']
).fit(x_train, y_train)

In [16]:
y_pred = decision_tree_model.predict(x_test)

In [18]:
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy count:  103
accuracy score:  0.7202797202797203
precision score:  0.7454545454545455
recall score:  0.6119402985074627



In [21]:
parameters = {'penalty':['l1', 'l2'], 'C':[0.1,0.3,0.4,0.5,0.8,1,2,5]}

grid_search = GridSearchCV(LogisticRegression(solver='liblinear'),parameters, cv=3, return_train_score= True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 0.8, 'penalty': 'l1'}

In [24]:
for i in range(16):
    print("parameters", grid_search.cv_results_['params'][i])
    print("Mean Test Score", grid_search.cv_results_['mean_test_score'][i])
    print("Rank", grid_search.cv_results_['rank_test_score'][i])


parameters {'C': 0.1, 'penalty': 'l1'}
Mean Test Score 0.7591942820012996
Rank 16
parameters {'C': 0.1, 'penalty': 'l2'}
Mean Test Score 0.7749930381509329
Rank 15
parameters {'C': 0.3, 'penalty': 'l1'}
Mean Test Score 0.7962034716420682
Rank 10
parameters {'C': 0.3, 'penalty': 'l2'}
Mean Test Score 0.7838670750951452
Rank 14
parameters {'C': 0.4, 'penalty': 'l1'}
Mean Test Score 0.7997122435718927
Rank 8
parameters {'C': 0.4, 'penalty': 'l2'}
Mean Test Score 0.7926761347813981
Rank 12
parameters {'C': 0.5, 'penalty': 'l1'}
Mean Test Score 0.8014759120022279
Rank 7
parameters {'C': 0.5, 'penalty': 'l2'}
Mean Test Score 0.7926668523159751
Rank 13
parameters {'C': 0.8, 'penalty': 'l1'}
Mean Test Score 0.8137658962220365
Rank 1
parameters {'C': 0.8, 'penalty': 'l2'}
Mean Test Score 0.7961756242457997
Rank 11
parameters {'C': 1, 'penalty': 'l1'}
Mean Test Score 0.8120022277917015
Rank 2
parameters {'C': 1, 'penalty': 'l2'}
Mean Test Score 0.797930010210712
Rank 9
parameters {'C': 2, 'penal

In [25]:
logistic_model = LogisticRegression(solver='liblinear', 
penalty=grid_search.best_params_['penalty'], C= grid_search.best_params_['C']).fit(x_train, y_train)

In [26]:
y_pred = logistic_model.predict(x_test)
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy count:  107
accuracy score:  0.7482517482517482
precision score:  0.7818181818181819
recall score:  0.6417910447761194

