In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score 

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
titanic_df = pd.read_csv('datasets/titanic_processed.csv')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,1,1,32.0,0,0,30.5,1,0,0
1,0,2,1,54.0,1,0,26.0,0,0,1
2,0,3,1,30.5,0,0,8.05,0,0,1
3,0,3,1,1.0,4,1,39.6875,0,0,1
4,1,2,0,28.0,1,0,26.0,0,0,1


In [3]:
X = titanic_df.drop('Survived', axis=1)
Y = titanic_df['Survived']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [15]:
def summarize_classification(y_test, y_pred):
    #normalize = True gets the accuracy in terms of a fraction
    acc = accuracy_score(y_test, y_pred, normalize = True)
    #normalize = False gets the accuracy in terms of the number of correct predictions
    num_acc = accuracy_score(y_test, y_pred, normalize = False)
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': recall,
        'accuarcy_count': num_acc
    }

In [6]:
from sklearn.model_selection import GridSearchCV
parameters = {'max_depth': [2, 4, 6, 8, 10]}
grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)
grid_search.best_params_

{'max_depth': 4}

In [11]:
for i in range(5):
    print ("Parameters: ", grid_search.cv_results_['params'][i])
    print ("Mean Test Score: ", grid_search.cv_results_['mean_test_score'][i])
    print ("Rank: ", grid_search.cv_results_['rank_test_score'][i])
    print()

Parameters:  {'max_depth': 2}
Mean Test Score:  0.7504316346421609
Rank:  5

Parameters:  {'max_depth': 4}
Mean Test Score:  0.8084656084656086
Rank:  1

Parameters:  {'max_depth': 6}
Mean Test Score:  0.79963798384851
Rank:  2

Parameters:  {'max_depth': 8}
Mean Test Score:  0.7838485101642996
Rank:  3

Parameters:  {'max_depth': 10}
Mean Test Score:  0.7645038522231505
Rank:  4



In [12]:
decision_tree_model = DecisionTreeClassifier(max_depth = grid_search.best_params_['max_depth']).fit(x_train, y_train)

In [13]:
y_pred = decision_tree_model.predict(x_test)

In [16]:
summarize_classification(y_test, y_pred)

{'accuracy': 0.7832167832167832,
 'precision': 0.8085106382978723,
 'recall': 0.6333333333333333,
 'accuarcy_count': 112}

In [18]:
#Logistic Regression hyperparameter tuning
parameters = {'penalty': ['l1', 'l2'],
              'C': [0.1, 0.4, 0.8, 1, 2, 5]}
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters,cv=3, return_train_score = True)
grid_search.fit(x_train, y_train)
grid_search.best_params_

{'C': 5, 'penalty': 'l2'}

In [19]:
for i in range(12):
    print ("Parameters: ", grid_search.cv_results_['params'][i])
    print ("Mean Test Score: ", grid_search.cv_results_['mean_test_score'][i])
    print ("Rank: ", grid_search.cv_results_['rank_test_score'][i])
    print()

Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7398403415947276
Rank:  12

Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7503945047804695
Rank:  11

Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.7768124013738049
Rank:  5

Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.7662396732572171
Rank:  9

Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7785667873387171
Rank:  4

Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.7644852872923048
Rank:  10

Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.7768124013738049
Rank:  5

Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.7662489557226398
Rank:  8

Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.7873665645595471
Rank:  3

Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.775030168012624
Rank:  7

Parameters:  {'C': 5, 'penalty': 'l1'}
Mean Test Score:  0.7891302329898822
Rank:  2

Parameters:  {'C': 5, 'penalty': 'l2'}
M

In [21]:
logistic_model = LogisticRegression(solver='liblinear',\
    penalty = grid_search.best_params_['penalty'],\
    C = grid_search.best_params_['C']).fit(x_train, y_train)

In [22]:
y_pred = logistic_model.predict(x_test)
summarize_classification(y_test, y_pred)

{'accuracy': 0.7902097902097902,
 'precision': 0.7777777777777778,
 'recall': 0.7,
 'accuarcy_count': 113}