In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score 

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read the Titanic CSV into a DataFrame and preview the first five rows.
titanic_df = pd.read_csv('datasets/titanic_processed.csv')
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,1,1,32.0,0,0,30.5,1,0,0
1,0,2,1,54.0,1,0,26.0,0,0,1
2,0,3,1,30.5,0,0,8.05,0,0,1
3,0,3,1,1.0,4,1,39.6875,0,0,1
4,1,2,0,28.0,1,0,26.0,0,0,1


In [4]:
# Candidate predictor columns: every column after the first one.
FEATURES = titanic_df.columns[1:].tolist()
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [5]:
 # Registry of evaluation results, keyed by a label describing each model run.
 result_dict = dict()

In [6]:
def summarize_classification(y_test, y_pred):
    """Collect basic classification metrics for one set of predictions.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, in the same order as ``y_test``.

    Returns
    -------
    dict with keys, in this order:
        'accuracy'        - fraction of correct predictions (normalize=True)
        'precision'       - result of ``precision_score``
        'recall'          - result of ``recall_score``
        'accuarcy_count'  - number of correct predictions (normalize=False);
                            the key spelling is kept as-is because it is part
                            of the returned structure.
    """
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    count_correct = accuracy_score(y_test, y_pred, normalize=False)

    summary = dict(accuracy=fraction_correct)
    summary['precision'] = precision_score(y_test, y_pred)
    summary['recall'] = recall_score(y_test, y_pred)
    summary['accuarcy_count'] = count_correct
    return summary

In [7]:
def build_model(classifier_fn, 
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Split the data, train a classifier, and evaluate it on both splits.

    Parameters
    ----------
    classifier_fn : callable ``(x_train, y_train) -> fitted model``; the
        returned object must provide ``predict``.
    name_of_y_col : name of the target column in ``dataset``.
    names_of_x_cols : list of feature column names in ``dataset``.
    dataset : DataFrame holding both the features and the target.
    test_frac : value passed to ``train_test_split`` as ``test_size``.
    random_state : seed forwarded to ``train_test_split``. The default
        ``None`` leaves the split unseeded (the previous behaviour); pass a
        fixed integer to make the split, and therefore the reported metrics,
        repeatable across runs.

    Returns
    -------
    dict with keys:
        'training'         - ``summarize_classification`` on the training split
        'testing'          - ``summarize_classification`` on the test split
        'confusion matrix' - crosstab with predicted labels as rows and
                             actual test labels as columns
    """
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)

    model = classifier_fn(x_train, y_train)

    # Predict on both splits so training and test scores can be compared.
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})

    # Confusion matrix: rows are predictions, columns are actual labels.
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {'training': train_summary,
            'testing': test_summary, 
            'confusion matrix': model_crosstab}

In [8]:
def compare_results():
    """Print the training and testing metrics of every entry in ``result_dict``.

    For each stored model run, the label is printed first, followed by one
    "metric value" line per training metric and then per testing metric.
    Only the 'training' and 'testing' entries are read.
    """
    sections = (("Training data", 'training'), ("Testing data", 'testing'))
    for model_label, summaries in result_dict.items():
        print("Classification: ", model_label)
        print()
        for heading, split_key in sections:
            print(heading)
            for metric_name, metric_value in summaries[split_key].items():
                print(metric_name, metric_value)
            print()

In [9]:
def logistic_fn(x_train, y_train):
    """Fit and return a logistic regression classifier using the 'liblinear' solver."""
    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(x_train, y_train)
    return classifier

In [10]:
# Train and evaluate logistic regression on all features, then print every stored result.
result_dict['survived ~ logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7777777777777778
recall 0.7030567685589519
accuarcy_count 455

Testing data
accuracy 0.7552447552447552
precision 0.7142857142857143
recall 0.6779661016949152
accuarcy_count 108



In [11]:
def linear_discriminant_fn(x_train, y_train, solver = 'svd'):
    """Fit and return a Linear Discriminant Analysis classifier.

    LDA looks for the axes that best separate the classes, so that all
    instances of a class fall in the same quadrant; the best axes are the
    ones that best split the data into the different classes.

    The 'svd' solver (the default) finds those axes without computing the
    covariance matrix of the features, which is useful for large numbers of
    features or lots of rows.
    """
    lda = LinearDiscriminantAnalysis(solver=solver)
    lda.fit(x_train, y_train)
    return lda

In [12]:
# FEATURES[:-1] drops the last feature, 'Embarked_S', so the embarkation port
# is dummy-encoded instead of one-hot encoded.
# One-hot encoding can make the features collinear, which results in poor classifiers.
# Some scikit-learn estimators take care of dummy encoding for you; others do not.
result_dict['survived ~ linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7777777777777778
recall 0.7030567685589519
accuarcy_count 455

Testing data
accuracy 0.7552447552447552
precision 0.7142857142857143
recall 0.6779661016949152
accuarcy_count 108

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7751196172248804
recall 0.7105263157894737
accuarcy_count 456

Testing data
accuracy 0.7832167832167832
precision 0.7959183673469388
recall 0.65
accuarcy_count 112



In [13]:
def quadratic_discriminant_fn(x_train, y_train):
    """Fit and return a Quadratic Discriminant Analysis classifier.

    QDA also finds axes that best separate the classes, but its decision
    boundary is quadratic instead of linear. It is useful when the X
    variables have different covariances with respect to the Y variables.
    """
    qda = QuadraticDiscriminantAnalysis()
    qda.fit(x_train, y_train)
    return qda

In [14]:
# QDA is likewise prone to the dummy variable trap, so the last feature is
# removed again with FEATURES[:-1].
result_dict['survived ~ quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7777777777777778
recall 0.7030567685589519
accuarcy_count 455

Testing data
accuracy 0.7552447552447552
precision 0.7142857142857143
recall 0.6779661016949152
accuarcy_count 108

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7751196172248804
recall 0.7105263157894737
accuarcy_count 456

Testing data
accuracy 0.7832167832167832
precision 0.7959183673469388
recall 0.65
accuarcy_count 112

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.789103690685413
precision 0.7431192660550459
recall 0.7168141592920354
accuarcy_count 449

Testing data
accuracy 0.8041958041958042
precision 0.7575757575757576
recall 0.8064516129032258
accuarcy_count 115



In [15]:
def sgd_fn(x_train, y_train, max_iter = 100000, tol=1e-3):
    """Fit and return a Stochastic Gradient Descent (SGD) classifier.

    ``max_iter`` defaults to 100000 here: it had to be raised from 1000 to
    improve accuracy. ``tol`` is passed straight through to the estimator.
    """
    sgd = SGDClassifier(max_iter=max_iter, tol=tol)
    sgd.fit(x_train, y_train)
    return sgd

In [16]:
# The dummy variable trap is not a problem here, so all features are used.
result_dict['survived ~ sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7777777777777778
recall 0.7030567685589519
accuarcy_count 455

Testing data
accuracy 0.7552447552447552
precision 0.7142857142857143
recall 0.6779661016949152
accuarcy_count 108

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7751196172248804
recall 0.7105263157894737
accuarcy_count 456

Testing data
accuracy 0.7832167832167832
precision 0.7959183673469388
recall 0.65
accuarcy_count 112

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.789103690685413
precision 0.7431192660550459
recall 0.7168141592920354
accuarcy_count 449

Testing data
accuracy 0.8041958041958042
precision 0.7575757575757576
recall 0.8064516129032258
accuarcy_count 115

Classification:  survived ~ sgd

Training data
accuracy 0.6889279437609842
precision 0.6911764705882353
recall 0.4104803493449782
accuarcy_count 392

Testing d

In [17]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter = 1000, tol=1e-3):
    """Fit and return a linear Support Vector Classifier.

    A support vector classifier finds a hyperplane that separates the points
    so that all points on the same side belong to the same class.

    Parameters
    ----------
    C : inverse of the regularization strength, i.e. the penalty for
        outliers on the wrong side of the margin. Small values mean
        stronger regularization.
    max_iter : passed straight through to ``LinearSVC``.
    tol : tolerance that decides when training stops, i.e. once there is
        no significant improvement.

    ``dual=False`` is always used; it is the recommended setting when the
    number of samples is greater than the number of features.
    """
    svc = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    svc.fit(x_train, y_train)
    return svc

In [18]:
# Train and evaluate the linear SVC on all features, then print every stored result.
result_dict['survived ~ linear_svc'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7777777777777778
recall 0.7030567685589519
accuarcy_count 455

Testing data
accuracy 0.7552447552447552
precision 0.7142857142857143
recall 0.6779661016949152
accuarcy_count 108

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7751196172248804
recall 0.7105263157894737
accuarcy_count 456

Testing data
accuracy 0.7832167832167832
precision 0.7959183673469388
recall 0.65
accuarcy_count 112

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.789103690685413
precision 0.7431192660550459
recall 0.7168141592920354
accuarcy_count 449

Testing data
accuracy 0.8041958041958042
precision 0.7575757575757576
recall 0.8064516129032258
accuarcy_count 115

Classification:  survived ~ sgd

Training data
accuracy 0.6889279437609842
precision 0.6911764705882353
recall 0.4104803493449782
accuarcy_count 392

Testing d

In [21]:
def radius_neighbor_fn(x_train, y_train, radius=40.0, outlier_label=None):
    """Fit and return a radius-based nearest-neighbours classifier.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    radius : neighbourhood radius passed to ``RadiusNeighborsClassifier``.
    outlier_label : forwarded to ``RadiusNeighborsClassifier`` to control how
        samples with no training neighbours inside ``radius`` are labelled.
        The default ``None`` keeps the estimator's own default behaviour, so
        existing callers are unaffected. See the scikit-learn documentation
        for the accepted values.
    """
    model = RadiusNeighborsClassifier(radius=radius, outlier_label=outlier_label)
    model.fit(x_train, y_train)
    return model


In [22]:
# Train and evaluate the radius-neighbours classifier on all features, then print every stored result.
result_dict['survived ~ radius_neighbor'] = build_model(
    classifier_fn=radius_neighbor_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7777777777777778
recall 0.7030567685589519
accuarcy_count 455

Testing data
accuracy 0.7552447552447552
precision 0.7142857142857143
recall 0.6779661016949152
accuarcy_count 108

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7751196172248804
recall 0.7105263157894737
accuarcy_count 456

Testing data
accuracy 0.7832167832167832
precision 0.7959183673469388
recall 0.65
accuarcy_count 112

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.789103690685413
precision 0.7431192660550459
recall 0.7168141592920354
accuarcy_count 449

Testing data
accuracy 0.8041958041958042
precision 0.7575757575757576
recall 0.8064516129032258
accuarcy_count 115

Classification:  survived ~ sgd

Training data
accuracy 0.6889279437609842
precision 0.6911764705882353
recall 0.4104803493449782
accuarcy_count 392

Testing d

In [23]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None):
    """Fit and return a decision tree classifier.

    Decision trees build a tree structure on the training data that makes
    decisions based on rules. ``max_depth`` and ``max_features`` are passed
    straight through to ``DecisionTreeClassifier``.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    tree.fit(x_train, y_train)
    return tree

In [24]:
result_dict['survived ~ decision_tree'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
# Note: this model overfits in the training phase.
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7777777777777778
recall 0.7030567685589519
accuarcy_count 455

Testing data
accuracy 0.7552447552447552
precision 0.7142857142857143
recall 0.6779661016949152
accuarcy_count 108

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7751196172248804
recall 0.7105263157894737
accuarcy_count 456

Testing data
accuracy 0.7832167832167832
precision 0.7959183673469388
recall 0.65
accuarcy_count 112

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.789103690685413
precision 0.7431192660550459
recall 0.7168141592920354
accuarcy_count 449

Testing data
accuracy 0.8041958041958042
precision 0.7575757575757576
recall 0.8064516129032258
accuarcy_count 115

Classification:  survived ~ sgd

Training data
accuracy 0.6889279437609842
precision 0.6911764705882353
recall 0.4104803493449782
accuarcy_count 392

Testing d

In [29]:
def naive_bayes_fn(x_train, y_train, priors=None):
    """Fit and return a Gaussian Naive Bayes classifier.

    Naive Bayes is called "naive" because it makes the strong assumption
    that the features are independent of each other. ``priors`` is passed
    straight through to ``GaussianNB``.
    """
    nb = GaussianNB(priors=priors)
    nb.fit(x_train, y_train)
    return nb

In [30]:
# Train and evaluate Gaussian Naive Bayes on all features, then print every stored result.
result_dict['survived ~ naive_bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7777777777777778
recall 0.7030567685589519
accuarcy_count 455

Testing data
accuracy 0.7552447552447552
precision 0.7142857142857143
recall 0.6779661016949152
accuarcy_count 108

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7751196172248804
recall 0.7105263157894737
accuarcy_count 456

Testing data
accuracy 0.7832167832167832
precision 0.7959183673469388
recall 0.65
accuarcy_count 112

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.789103690685413
precision 0.7431192660550459
recall 0.7168141592920354
accuarcy_count 449

Testing data
accuracy 0.8041958041958042
precision 0.7575757575757576
recall 0.8064516129032258
accuarcy_count 115

Classification:  survived ~ sgd

Training data
accuracy 0.6889279437609842
precision 0.6911764705882353
recall 0.4104803493449782
accuarcy_count 392

Testing d