In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
  
    
#import all classifiers!

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the Titanic dataset from the CSV file 'titanic_processed.csv' (relative path).
titanic_df = pd.read_csv('titanic_processed.csv')

# Show the first rows of the frame as a quick look at the data.
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,19.0,0,0,7.65,0,0,1
1,0,1,1,38.0,0,1,153.4625,0,0,1
2,0,3,1,17.0,0,0,8.6625,0,0,1
3,0,1,1,62.0,0,0,26.55,0,0,1
4,1,1,0,23.0,3,2,263.0,0,0,1


In [4]:
# Feature names: every column of the frame except the one at position 0
# (positions are 0-based). Presumably position 0 holds the 'Survived' target,
# since it does not appear in the resulting list -- verify if the CSV layout changes.
FEATURES = titanic_df.columns[1:].tolist()

FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [5]:
# Registry of evaluation results, one entry per classifier run, so that
# accuracy / precision / recall can be compared across models later on.
result_dict = dict()

In [6]:
#define a function to return all our metrics

def summarize_classification(y_test, y_pred):
    """Summarize how well predicted labels match the true labels.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, aligned with ``y_test``.

    Returns
    -------
    dict with the keys ``'accuracy'`` (fraction of correct predictions),
    ``'precision'``, ``'recall'`` and ``'accuracy_count'`` (number of correct
    predictions), in that order.
    """
    # normalize=True -> fraction of correct labels; normalize=False -> raw count.
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    n_correct = accuracy_score(y_test, y_pred, normalize=False)

    # Precision: ability not to label a negative sample as positive.
    positive_precision = precision_score(y_test, y_pred)
    # Recall: ability to find all the positive samples.
    positive_recall = recall_score(y_test, y_pred)

    return dict(accuracy=fraction_correct,
                precision=positive_precision,
                recall=positive_recall,
                accuracy_count=n_correct)
# all results are returned in a dict

In [7]:
# Define a function that fits a classifier and evaluates it, taking the following arguments:
# classifier_fn : a function that takes (x_train, y_train) and returns a fitted model
# name_of_y_col : the target column
# names_of_x_cols : the feature columns
# dataset : the dataset
# test_frac : the fraction of the data held out for testing

# The function evaluates the model's performance on both the training data and the test data
def build_model(classifier_fn,
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Split the data, fit a classifier and evaluate it on both splits.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        model exposing ``predict``.
    name_of_y_col : str
        Name of the target column in ``dataset``.
    names_of_x_cols : list of str
        Names of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Frame holding both the features and the target.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Passed to ``train_test_split``. Leave as ``None`` for a different
        random split on every call (the previous behaviour); pass an integer
        to make the split, and therefore the reported metrics, reproducible.

    Returns
    -------
    dict with the keys ``'training'`` and ``'test'`` (metric dicts produced by
    ``summarize_classification``) and ``'confusion_matrix'`` (a crosstab with
    predicted labels as rows and true test labels as columns).
    """
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)

    model = classifier_fn(x_train, y_train)

    # Predict on both splits so train and test scores can be compared
    # (a large gap between them hints at overfitting).
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})

    # Rows are the predicted labels, columns are the true labels.
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {'training': train_summary,
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [8]:
# this function prints a report by going through every key in the dict and printing its training scores and test scores
def compare_results():
    """Print the training and test metrics of every entry in ``result_dict``.

    For each stored model the output is a 'Classification' header followed by
    a 'Training data' section and a 'Test data' section, each listing one
    ``<metric> <value>`` pair per line.
    """
    sections = (('Training data', 'training'),
                ('Test data', 'test'))

    for model_name, summaries in result_dict.items():
        print('Classification: ', model_name)

        for heading, split in sections:
            print()
            print(heading)
            for metric, value in summaries[split].items():
                print(metric, value)

        print()

In [9]:
#initiate a logistic regression estimator
def logistic_fn(x_train, y_train):
    """Fit a logistic-regression classifier (``solver='liblinear'``) and return it."""
    classifier = LogisticRegression(solver='liblinear')
    # fit() returns the estimator itself, so the fitted model is handed back.
    return classifier.fit(x_train, y_train)

In [10]:
# Fit and evaluate logistic regression on all features, then print the report.
result_dict['survived ~ logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7688679245283019
recall 0.6995708154506438
accuracy_count 450

Test data
accuracy 0.8041958041958042
precision 0.7755102040816326
recall 0.6909090909090909
accuracy_count 115



In [11]:
# LDA assumes that the features (Xs) have the same covariance matrix within every class (Y); let's instantiate it:
# SVD (the default solver): uses singular value decomposition to find the axes that separate our data without calculating the covariance matrix of the features
# be aware of the "dummy variable trap": if you run into a collinearity problem because of one-hot encoding, try dropping one of the one-hot columns to turn it into dummy encoding
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    """Fit a LinearDiscriminantAnalysis model with the given ``solver`` and return it."""
    lda = LinearDiscriminantAnalysis(solver=solver)
    # fit() returns the estimator itself, so the fitted model is handed back.
    return lda.fit(x_train, y_train)

In [12]:
# Fit and evaluate LDA on all features, then print the report.
result_dict['survived ~ linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7688679245283019
recall 0.6995708154506438
accuracy_count 450

Test data
accuracy 0.8041958041958042
precision 0.7755102040816326
recall 0.6909090909090909
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7819905213270142
recall 0.7112068965517241
accuracy_count 456

Test data
accuracy 0.7622377622377622
precision 0.7115384615384616
recall 0.6607142857142857
accuracy_count 109



In [13]:
# Refit LDA without the last feature column (FEATURES[:-1]), i.e. with one of the
# one-hot 'Embarked' columns dropped (dummy encoding), to see how the model copes
# with collinearity. The result is stored under the same key as the previous LDA
# run, so it replaces that entry in result_dict.
result_dict['survived ~ linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7688679245283019
recall 0.6995708154506438
accuracy_count 450

Test data
accuracy 0.8041958041958042
precision 0.7755102040816326
recall 0.6909090909090909
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7707317073170732
recall 0.7085201793721974
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.8431372549019608
recall 0.6615384615384615
accuracy_count 113



In [14]:
#QDA: let's try QDA in case the features (Xs) have a different covariance matrix for each class (Y)

def quadratic_discriminant_fn(x_train, y_train):
    """Fit a QuadraticDiscriminantAnalysis model with default settings and return it."""
    qda = QuadraticDiscriminantAnalysis()
    qda.fit(x_train, y_train)
    return qda

In [15]:
# As before, use dummy encoding (last one-hot column dropped) rather than full
# one-hot encoding when fitting QDA, then print the report.
result_dict['survived ~ quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7688679245283019
recall 0.6995708154506438
accuracy_count 450

Test data
accuracy 0.8041958041958042
precision 0.7755102040816326
recall 0.6909090909090909
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7707317073170732
recall 0.7085201793721974
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.8431372549019608
recall 0.6615384615384615
accuracy_count 113

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7926186291739895
precision 0.7804878048780488
recall 0.6866952789699571
accuracy_count 451

Test data
accuracy 0.7972027972027972
precision 0.7241379310344828
recall 0.7636363636363637
accuracy_count 114



In [22]:
#Stochastic gradient descent
#Warning: before moving on to another classifier, try changing the number of iterations, because performance can depend on it!
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3, random_state=None):
    """Fit a linear classifier trained with stochastic gradient descent.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    max_iter : int, default 10000
        Maximum number of passes over the training data.
    tol : float, default 1e-3
        Stopping tolerance forwarded to ``SGDClassifier``.
    random_state : int or None, default None
        Seed forwarded to ``SGDClassifier``. With ``None`` (the previous
        behaviour) results can differ from run to run; pass an integer for
        reproducible fits.

    Returns
    -------
    The fitted ``SGDClassifier``.

    Notes
    -----
    No feature scaling is applied here. scikit-learn's documentation notes
    that SGD is sensitive to feature scaling, so scaling the inputs before
    calling this function may be worth trying if scores are poor.
    """
    model = SGDClassifier(max_iter=max_iter, tol=tol, random_state=random_state)
    model.fit(x_train, y_train)

    return model

In [23]:
# Fit and evaluate the SGD classifier on all features, then print the report.
result_dict['survived ~ sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7688679245283019
recall 0.6995708154506438
accuracy_count 450

Test data
accuracy 0.8041958041958042
precision 0.7755102040816326
recall 0.6909090909090909
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7707317073170732
recall 0.7085201793721974
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.8431372549019608
recall 0.6615384615384615
accuracy_count 113

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7926186291739895
precision 0.7804878048780488
recall 0.6866952789699571
accuracy_count 451

Test data
accuracy 0.7972027972027972
precision 0.7241379310344828
recall 0.7636363636363637
accuracy_count 114

Classification:  survived ~ sgd

Training data
accuracy 0.4727592267135325
precision 0.426
recall 0.9424778761061947
accuracy_count 269

Test data
accura

In [25]:
#SVM finds the hyperplane that best separates the target classes, either directly or after transforming the data into a higher-dimensional space
#We will use an SVM classifier; C=1.0 penalizes the points on the wrong side of the separator, and smaller values of C mean stronger regularization
# LinearSVC is similar to SVC(kernel='linear'), but it is implemented with liblinear rather than libsvm, so the two are not exactly equivalent
# dual=False (preferred when n_samples > n_features) solves the primal problem instead of transforming it into the dual problem
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    """Fit a LinearSVC on the training data and return it.

    ``C``, ``max_iter`` and ``tol`` are forwarded to ``LinearSVC``; the model
    is always built with ``dual=False``.
    """
    svc_options = dict(C=C, max_iter=max_iter, tol=tol, dual=False)
    # fit() returns the estimator itself, so the fitted model is handed back.
    return LinearSVC(**svc_options).fit(x_train, y_train)

In [26]:
# Fit and evaluate the linear SVC on all features, then print the report.
result_dict['survived ~ linear_svc'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7688679245283019
recall 0.6995708154506438
accuracy_count 450

Test data
accuracy 0.8041958041958042
precision 0.7755102040816326
recall 0.6909090909090909
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7707317073170732
recall 0.7085201793721974
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.8431372549019608
recall 0.6615384615384615
accuracy_count 113

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7926186291739895
precision 0.7804878048780488
recall 0.6866952789699571
accuracy_count 451

Test data
accuracy 0.7972027972027972
precision 0.7241379310344828
recall 0.7636363636363637
accuracy_count 114

Classification:  survived ~ sgd

Training data
accuracy 0.4727592267135325
precision 0.426
recall 0.9424778761061947
accuracy_count 269

Test data
accura

In [34]:
#Nearest-neighbor methods classify based on the distance between sample points, whether Euclidean, Hamming, or Manhattan
#we will use radius neighbors instead of K-nearest neighbors for more robust results
#change the radius and compare the results
def radius_neighbor_fn(x_train, y_train, radius=40.0):
    """Fit a RadiusNeighborsClassifier with the given ``radius`` and return it."""
    neighbors_model = RadiusNeighborsClassifier(radius=radius)
    neighbors_model.fit(x_train, y_train)
    return neighbors_model

In [35]:
# Fit and evaluate the radius-neighbors classifier on all features, then print the report.
result_dict['survived ~ radius_neighbors'] = build_model(
    classifier_fn=radius_neighbor_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7688679245283019
recall 0.6995708154506438
accuracy_count 450

Test data
accuracy 0.8041958041958042
precision 0.7755102040816326
recall 0.6909090909090909
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7707317073170732
recall 0.7085201793721974
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.8431372549019608
recall 0.6615384615384615
accuracy_count 113

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7926186291739895
precision 0.7804878048780488
recall 0.6866952789699571
accuracy_count 451

Test data
accuracy 0.7972027972027972
precision 0.7241379310344828
recall 0.7636363636363637
accuracy_count 114

Classification:  survived ~ sgd

Training data
accuracy 0.4727592267135325
precision 0.426
recall 0.9424778761061947
accuracy_count 269

Test data
accura

In [36]:
#We will use a decision tree, which splits the data into subgroups through a sequence of decisions on the predictors
#we will not constrain the shape of the tree: max_depth=None, max_features=None. If the dataset is huge, set constraints; this model can easily overfit!
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None):
    """Fit a DecisionTreeClassifier and return it.

    ``max_depth`` and ``max_features`` are forwarded to the estimator; both
    default to ``None``, i.e. no constraint is set by this function.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth,
                                  max_features=max_features)
    # fit() returns the estimator itself, so the fitted model is handed back.
    return tree.fit(x_train, y_train)

In [37]:
# Fit and evaluate the decision tree on all features, then print the report.
result_dict['survived ~ decision_tree'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7688679245283019
recall 0.6995708154506438
accuracy_count 450

Test data
accuracy 0.8041958041958042
precision 0.7755102040816326
recall 0.6909090909090909
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7707317073170732
recall 0.7085201793721974
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.8431372549019608
recall 0.6615384615384615
accuracy_count 113

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7926186291739895
precision 0.7804878048780488
recall 0.6866952789699571
accuracy_count 451

Test data
accuracy 0.7972027972027972
precision 0.7241379310344828
recall 0.7636363636363637
accuracy_count 114

Classification:  survived ~ sgd

Training data
accuracy 0.4727592267135325
precision 0.426
recall 0.9424778761061947
accuracy_count 269

Test data
accura

In [38]:
#Naive Bayes uses Bayes' theorem to find which label is most probable given the attributes in the feature vector
#you can pass priors if you know the prior probabilities of the classes.
def naive_bayes_fn(x_train, y_train, priors=None):
    """Fit a GaussianNB classifier (optionally with the given ``priors``) and return it."""
    nb_model = GaussianNB(priors=priors)
    nb_model.fit(x_train, y_train)
    return nb_model

In [39]:
# Fit and evaluate Gaussian naive Bayes on all features, then print the report.
result_dict['survived ~ naive_bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7908611599297012
precision 0.7688679245283019
recall 0.6995708154506438
accuracy_count 450

Test data
accuracy 0.8041958041958042
precision 0.7755102040816326
recall 0.6909090909090909
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7707317073170732
recall 0.7085201793721974
accuracy_count 457

Test data
accuracy 0.7902097902097902
precision 0.8431372549019608
recall 0.6615384615384615
accuracy_count 113

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7926186291739895
precision 0.7804878048780488
recall 0.6866952789699571
accuracy_count 451

Test data
accuracy 0.7972027972027972
precision 0.7241379310344828
recall 0.7636363636363637
accuracy_count 114

Classification:  survived ~ sgd

Training data
accuracy 0.4727592267135325
precision 0.426
recall 0.9424778761061947
accuracy_count 269

Test data
accura