In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import  LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the pre-processed Titanic table from the local datasets folder.
titanic_df = pd.read_csv('datasets/titanic_processed.csv')

# Every column after the first one is used as a model input.
# NOTE(review): this is positional — it assumes the first column is the
# target ('Survived'); confirm against the CSV layout.
FEATURES = titanic_df.columns[1:].tolist()
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [3]:
# Shared registry: run label -> result dictionary stored by the model cells below.
result_dict = dict()

In [4]:
def summarize_classification(y_test, y_pred):
    """Collect headline classification metrics for one set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    dict
        ``accuracy`` (fraction correct), ``precision``, ``recall`` (both with
        scikit-learn's default settings) and ``accuracy_count`` (number of
        correctly classified samples).
    """
    return {
        # normalize=True -> fraction of correct predictions
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        # normalize=False -> raw count of correct predictions
        'accuracy_count': accuracy_score(y_test, y_pred, normalize=False),
    }

In [6]:
def build_model(classifier_fn, name_of_y_col, name_of_x_cols, dataset, test_frac=0.2,
                random_state=None):
    """Split the data, fit a classifier, and summarise train/test performance.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        object exposing ``predict``.
    name_of_y_col : str
        Name of the target column in ``dataset``.
    name_of_x_cols : list of str
        Names of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Table holding both the features and the target.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. Leave as ``None`` for the
        original unseeded behaviour; pass a fixed integer so that different
        classifiers are evaluated on the same split and results can be
        reproduced after a kernel restart.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` metric dictionaries (see
        ``summarize_classification``) and ``'confusion_matrix'``, a crosstab
        of predicted (rows) versus actual (columns) test labels.
    """
    X = dataset[name_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state
    )

    model = classifier_fn(x_train, y_train)

    # Score on both splits so over-fitting shows up as a train/test gap.
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {'training': train_summary, 'test': test_summary, 'confusion_matrix': model_crosstab}

In [7]:
def compare_results():
    """Print the training and test metrics of every run stored in ``result_dict``.

    Runs are printed in the order they were added to ``result_dict``; each
    metric is printed as ``name value`` on its own line.
    """
    for run_label, run_summary in result_dict.items():
        print('Classification: ', run_label, '\n')

        print('Training data')
        for metric_name, metric_value in run_summary['training'].items():
            print(metric_name, metric_value)

        print()
        print('Test data')
        for metric_name, metric_value in run_summary['test'].items():
            print(metric_name, metric_value)

        print()

In [8]:
def logistic_fn(x_train, y_train):
    """Fit a logistic-regression classifier using the 'liblinear' solver.

    Parameters
    ----------
    x_train : training features.
    y_train : training labels.

    Returns
    -------
    The fitted ``LogisticRegression`` estimator.
    """
    # scikit-learn's fit() returns the estimator itself.
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [11]:
# Logistic regression on the full feature list; store the run and print all runs so far.
result_dict['survived - logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived - logistic 

Training data
accuracy 0.7996485061511424
precision 0.7828282828282829
recall 0.6858407079646017
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.7755102040816326
recall 0.6129032258064516
accuracy_count 108



In [13]:
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    """Fit a linear discriminant analysis classifier.

    Parameters
    ----------
    x_train : training features.
    y_train : training labels.
    solver : str, default 'svd'
        Forwarded to ``LinearDiscriminantAnalysis``.

    Returns
    -------
    The fitted ``LinearDiscriminantAnalysis`` estimator.
    """
    lda = LinearDiscriminantAnalysis(solver=solver)
    # fit() returns the estimator itself.
    return lda.fit(x_train, y_train)

In [15]:
# LDA run on every feature except the last one in FEATURES.
# NOTE(review): presumably the last one-hot 'Embarked' column is dropped to
# avoid perfectly collinear dummy columns — confirm.
result_dict['survived - linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
compare_results()

Classification:  survived - logistic 

Training data
accuracy 0.7996485061511424
precision 0.7828282828282829
recall 0.6858407079646017
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.7755102040816326
recall 0.6129032258064516
accuracy_count 108

Classification:  survived - linear_discriminant_analysis 

Training data
accuracy 0.804920913884007
precision 0.7746478873239436
recall 0.7236842105263158
accuracy_count 458

Test data
accuracy 0.7692307692307693
precision 0.7547169811320755
recall 0.6666666666666666
accuracy_count 110



In [17]:
def quadratic_discriminant_analysis_fn(x_train, y_train):
    """Fit a quadratic discriminant analysis classifier with default settings.

    Parameters
    ----------
    x_train : training features.
    y_train : training labels.

    Returns
    -------
    The fitted ``QuadraticDiscriminantAnalysis`` estimator.
    """
    qda = QuadraticDiscriminantAnalysis()
    # fit() returns the estimator itself.
    return qda.fit(x_train, y_train)

In [18]:
# QDA run on every feature except the last one in FEATURES.
# NOTE(review): presumably the last one-hot 'Embarked' column is dropped to
# avoid perfectly collinear dummy columns — confirm.
result_dict['survived - quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminant_analysis_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
compare_results()

Classification:  survived - logistic 

Training data
accuracy 0.7996485061511424
precision 0.7828282828282829
recall 0.6858407079646017
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.7755102040816326
recall 0.6129032258064516
accuracy_count 108

Classification:  survived - linear_discriminant_analysis 

Training data
accuracy 0.804920913884007
precision 0.7746478873239436
recall 0.7236842105263158
accuracy_count 458

Test data
accuracy 0.7692307692307693
precision 0.7547169811320755
recall 0.6666666666666666
accuracy_count 110

Classification:  survived - quadratic_discriminant_analysis 

Training data
accuracy 0.8031634446397188
precision 0.7934272300469484
recall 0.7130801687763713
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.6981132075471698
recall 0.7254901960784313
accuracy_count 113



In [24]:
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    """Fit a linear classifier trained with stochastic gradient descent.

    Parameters
    ----------
    x_train : training features.
    y_train : training labels.
    max_iter : int, default 10000
        Forwarded to ``SGDClassifier``.
    tol : float, default 1e-3
        Forwarded to ``SGDClassifier``.

    Returns
    -------
    The fitted ``SGDClassifier`` estimator.
    """
    sgd_classifier = SGDClassifier(max_iter=max_iter, tol=tol)
    # fit() returns the estimator itself.
    return sgd_classifier.fit(x_train, y_train)

In [25]:
# SGD classifier on the full feature list; store the run and print all runs so far.
result_dict['survived - sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived - logistic 

Training data
accuracy 0.7996485061511424
precision 0.7828282828282829
recall 0.6858407079646017
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.7755102040816326
recall 0.6129032258064516
accuracy_count 108

Classification:  survived - linear_discriminant_analysis 

Training data
accuracy 0.804920913884007
precision 0.7746478873239436
recall 0.7236842105263158
accuracy_count 458

Test data
accuracy 0.7692307692307693
precision 0.7547169811320755
recall 0.6666666666666666
accuracy_count 110

Classification:  survived - quadratic_discriminant_analysis 

Training data
accuracy 0.8031634446397188
precision 0.7934272300469484
recall 0.7130801687763713
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.6981132075471698
recall 0.7254901960784313
accuracy_count 113

Classification:  survived - sgd 

Training data
accuracy 0.6731107205623902
precision 0.5502793296089385
recall 0.8873873873873874
accuracy_count 383



In [26]:
def svm_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    """Fit a linear support-vector classifier.

    Parameters
    ----------
    x_train : training features.
    y_train : training labels.
    C : float, default 1.0
        Forwarded to ``LinearSVC``.
    max_iter : int, default 1000
        Forwarded to ``LinearSVC``.
    tol : float, default 1e-3
        Forwarded to ``LinearSVC``.

    Returns
    -------
    The fitted ``LinearSVC`` estimator.
    """
    # dual=False is fixed here; the remaining options come from the caller.
    svc_options = dict(C=C, max_iter=max_iter, tol=tol, dual=False)
    # fit() returns the estimator itself.
    return LinearSVC(**svc_options).fit(x_train, y_train)

In [27]:
# Linear SVM on the full feature list; store the run and print all runs so far.
result_dict['survived - svm'] = build_model(
    classifier_fn=svm_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived - logistic 

Training data
accuracy 0.7996485061511424
precision 0.7828282828282829
recall 0.6858407079646017
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.7755102040816326
recall 0.6129032258064516
accuracy_count 108

Classification:  survived - linear_discriminant_analysis 

Training data
accuracy 0.804920913884007
precision 0.7746478873239436
recall 0.7236842105263158
accuracy_count 458

Test data
accuracy 0.7692307692307693
precision 0.7547169811320755
recall 0.6666666666666666
accuracy_count 110

Classification:  survived - quadratic_discriminant_analysis 

Training data
accuracy 0.8031634446397188
precision 0.7934272300469484
recall 0.7130801687763713
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.6981132075471698
recall 0.7254901960784313
accuracy_count 113

Classification:  survived - sgd 

Training data
accuracy 0.6731107205623902
precision 0.5502793296089385
recall 0.8873873873873874
accuracy_count 383



In [29]:
def radius_neighbor_fn(x_train, y_train, radius=40.0, outlier_label=None):
    """Fit a radius-based nearest-neighbours classifier.

    Parameters
    ----------
    x_train : training features.
    y_train : training labels.
    radius : float, default 40.0
        Forwarded to ``RadiusNeighborsClassifier``.
    outlier_label : label, 'most_frequent' or None, default None
        Forwarded to ``RadiusNeighborsClassifier``. With ``None`` (the
        original behaviour) ``predict`` raises ``ValueError`` for any sample
        that has no training neighbour within ``radius``; pass a label (or
        ``'most_frequent'`` on scikit-learn versions that support it) to
        assign such samples a class instead of failing.

    Returns
    -------
    The fitted ``RadiusNeighborsClassifier`` estimator.
    """
    model = RadiusNeighborsClassifier(radius=radius, outlier_label=outlier_label)
    model.fit(x_train, y_train)
    return model

In [31]:
# Radius-neighbours classifier on the full feature list; store the run and print all runs so far.
result_dict['survived - radius nn'] = build_model(
    classifier_fn=radius_neighbor_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived - logistic 

Training data
accuracy 0.7996485061511424
precision 0.7828282828282829
recall 0.6858407079646017
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.7755102040816326
recall 0.6129032258064516
accuracy_count 108

Classification:  survived - linear_discriminant_analysis 

Training data
accuracy 0.804920913884007
precision 0.7746478873239436
recall 0.7236842105263158
accuracy_count 458

Test data
accuracy 0.7692307692307693
precision 0.7547169811320755
recall 0.6666666666666666
accuracy_count 110

Classification:  survived - quadratic_discriminant_analysis 

Training data
accuracy 0.8031634446397188
precision 0.7934272300469484
recall 0.7130801687763713
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.6981132075471698
recall 0.7254901960784313
accuracy_count 113

Classification:  survived - sgd 

Training data
accuracy 0.6731107205623902
precision 0.5502793296089385
recall 0.8873873873873874
accuracy_count 383



In [32]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None):
    """Fit a decision-tree classifier.

    Parameters
    ----------
    x_train : training features.
    y_train : training labels.
    max_depth : int or None, default None
        Forwarded to ``DecisionTreeClassifier``.
    max_features : default None
        Forwarded to ``DecisionTreeClassifier``.

    Returns
    -------
    The fitted ``DecisionTreeClassifier`` estimator.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    # fit() returns the estimator itself.
    return tree.fit(x_train, y_train)

In [33]:
# Decision tree on the full feature list; store the run and print all runs so far.
result_dict['survived - decision tree'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived - logistic 

Training data
accuracy 0.7996485061511424
precision 0.7828282828282829
recall 0.6858407079646017
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.7755102040816326
recall 0.6129032258064516
accuracy_count 108

Classification:  survived - linear_discriminant_analysis 

Training data
accuracy 0.804920913884007
precision 0.7746478873239436
recall 0.7236842105263158
accuracy_count 458

Test data
accuracy 0.7692307692307693
precision 0.7547169811320755
recall 0.6666666666666666
accuracy_count 110

Classification:  survived - quadratic_discriminant_analysis 

Training data
accuracy 0.8031634446397188
precision 0.7934272300469484
recall 0.7130801687763713
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.6981132075471698
recall 0.7254901960784313
accuracy_count 113

Classification:  survived - sgd 

Training data
accuracy 0.6731107205623902
precision 0.5502793296089385
recall 0.8873873873873874
accuracy_count 383



In [34]:
def naive_bayes_fn(x_train, y_train, priors=None):
    """Fit a Gaussian naive Bayes classifier.

    Parameters
    ----------
    x_train : training features.
    y_train : training labels.
    priors : array-like or None, default None
        Forwarded to ``GaussianNB``.

    Returns
    -------
    The fitted ``GaussianNB`` estimator.
    """
    gnb = GaussianNB(priors=priors)
    # fit() returns the estimator itself.
    return gnb.fit(x_train, y_train)

In [35]:
# Gaussian naive Bayes on the full feature list; store the run and print all runs so far.
result_dict['survived - naive bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived - logistic 

Training data
accuracy 0.7996485061511424
precision 0.7828282828282829
recall 0.6858407079646017
accuracy_count 455

Test data
accuracy 0.7552447552447552
precision 0.7755102040816326
recall 0.6129032258064516
accuracy_count 108

Classification:  survived - linear_discriminant_analysis 

Training data
accuracy 0.804920913884007
precision 0.7746478873239436
recall 0.7236842105263158
accuracy_count 458

Test data
accuracy 0.7692307692307693
precision 0.7547169811320755
recall 0.6666666666666666
accuracy_count 110

Classification:  survived - quadratic_discriminant_analysis 

Training data
accuracy 0.8031634446397188
precision 0.7934272300469484
recall 0.7130801687763713
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.6981132075471698
recall 0.7254901960784313
accuracy_count 113

Classification:  survived - sgd 

Training data
accuracy 0.6731107205623902
precision 0.5502793296089385
recall 0.8873873873873874
accuracy_count 383

