## Import Essential Libraries

In [1]:
# Essentails 
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Machine learning
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score, RepeatedStratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Saving the model
import pickle

## Read in the Data

In [2]:
# Load the training dataset
train = pd.read_csv('train_cleaned.csv')

# Load the testing dataset
test = pd.read_csv('testing_cleaned.csv')

## Feature Matrix and Response Variable

In [3]:
# Response variable
y = train['Survived'].values

# Feature matrix
X = train.drop('Survived', axis=1).values

# Splitting the data into training and cross validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Model Building

In [4]:
def evaluate_model(y_true, y_pred):
    confusion = confusion_matrix(y_true, y_pred)
    
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    print('Accuarcy:', accuracy_score(y_true, y_pred))
    print('Sensitivity:', recall_score(y_true, y_pred))
    print('Specificity:', TN / (TN + FP))
    print('False Positive Rate:', FP / (TN + FP))
    print('Precision:', precision_score(y_true, y_pred))
    print('F1 Score:', f1_score(y_true, y_pred))

In [5]:
models = pd.DataFrame(['Accuarcy', 'Sensitivity', 'Specificity', 'FPR', 'Precision', 'F1 Score'])

In [6]:
models.columns = ['Metrics']

In [7]:
def add_model_to_df(y_true, y_pred, dataframe, modelName):
    confusion = confusion_matrix(y_true, y_pred)
    
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    accuarcy = accuracy_score(y_true, y_pred)
    sensitivity = recall_score(y_true, y_pred)
    specificity = TN / (TN + FP)
    fpr = FP / (TN + FP)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    dataframe[modelName] = np.array([accuarcy, sensitivity, specificity, fpr, precision, f1])

### Logistic Regression

In [8]:
# Define the parameter values that need to be searched
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
penalty = ['l1', 'l2', 'elasticnet', 'none']
max_iter = list(range(100,500,100))
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

In [9]:
# Map parameter names to the values that need to be searched
param_dict = dict(C=C, penalty=penalty, max_iter=max_iter, solver=solver)

In [10]:
# Instantiate the LogisticRegression model using the default parameters
logreg = LogisticRegression(random_state=42)

# 10-fold stratified cross-validation with 3 repeats
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# Evaluate the accuarcy of the naive model using cross-validation 
scores = cross_val_score(logreg, X_train, y_train, cv=cv, scoring='accuracy')
print('Accuracy (Default):', scores.mean())

Accuracy (Default): 0.8073552425665103


In [11]:
# Instantiate the grid
grid_logreg = GridSearchCV(logreg, param_dict, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit the grid to the training data
grid_logreg.fit(X_train, y_train)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=42),
             estimator=LogisticRegression(random_state=42), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'max_iter': [100, 200, 300, 400],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             scoring='accuracy')

In [12]:
# Best model score and parameters
print(grid_logreg.best_score_)
print(grid_logreg.best_params_)

0.8073552425665103
{'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}


In [13]:
# Instantiate the logistic regression model using the best known parameters
logreg = LogisticRegression(C=1, max_iter=100, penalty='l2', solver='saga', random_state=42)

# Evaluate the accuracy of the tuned model using cross-validation
scores = cross_val_score(logreg, X_train, y_train, cv=cv, scoring='accuracy')
print('Accuracy (Tuned):', scores.mean())

Accuracy (Tuned): 0.8073552425665103


In [14]:
# Fit the logistic regression model on the training data, using the best known parameters
logreg.fit(X_train, y_train)

# Testing the model on the hold out set, since they are truly out of sample  
y_pred = logreg.predict(X_val)

In [15]:
# Evaluating the hold out set predictions  
evaluate_model(y_val, y_pred)

Accuarcy: 0.8033707865168539
Sensitivity: 0.7352941176470589
Specificity: 0.8454545454545455
False Positive Rate: 0.15454545454545454
Precision: 0.746268656716418
F1 Score: 0.7407407407407408


In [16]:
add_model_to_df(y_val, y_pred, models, 'Logistic Regression')

### SDG Classifier

In [17]:
# Instantiate the SGDClassifier model using the default parameters
sgd = SGDClassifier(random_state=42)

# 10-fold stratified cross-validation with 3 repeats
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# Evaluate the accuarcy of the naive model using cross-validation 
scores = cross_val_score(sgd, X_train, y_train, cv=cv, scoring='accuracy')
print('Accuracy (Default):', scores.mean())

Accuracy (Default): 0.7534298382889932


In [18]:
# Define the parameter values that need to be searched
loss = ['hinge', 'log', 'squared_hinge', 'modified_huber']
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
penalty = ['l1', 'l2', 'none', 'elasticnet']
l1_ratio = [0.1, 0.3, 0.5, 0.7, 0.9]
max_iter = list(range(200,2000,200))

In [19]:
# Map parameter names to the values that need to be searched
param_dict = dict(loss=loss, alpha=alpha, penalty=penalty, l1_ratio=l1_ratio, max_iter=max_iter)

In [20]:
# Instantiate the grid
grid_sgd = GridSearchCV(sgd, param_dict, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit the grid to the training data
grid_sgd.fit(X_train, y_train)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=42),
             estimator=SGDClassifier(random_state=42), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
                         'loss': ['hinge', 'log', 'squared_hinge',
                                  'modified_huber'],
                         'max_iter': [200, 400, 600, 800, 1000, 1200, 1400,
                                      1600, 1800],
                         'penalty': ['l1', 'l2', 'none', 'elasticnet']},
             scoring='accuracy')

In [21]:
# Best model score and parameters
print(grid_sgd.best_score_)
print(grid_sgd.best_params_)

0.804518779342723
{'alpha': 0.01, 'l1_ratio': 0.1, 'loss': 'log', 'max_iter': 200, 'penalty': 'l2'}


In [22]:
# Instantiate the SGDClassifier model using the best known parameters
sgd = SGDClassifier(alpha=0.01, l1_ratio=0.1, loss='log', max_iter=200, penalty='l2', random_state=42)

# Evaluate the accuracy of the tuned model using cross-validation
scores = cross_val_score(sgd, X_train, y_train, cv=cv, scoring='accuracy')
print('Accuracy (Tuned):', scores.mean())

Accuracy (Tuned): 0.804518779342723


In [23]:
# Fit the stochastic gradient classifier on the training data, using the best known parameters
sgd.fit(X_train, y_train)

# Testing the model on the hold out set, since they are truly out of sample  
y_pred = sgd.predict(X_val)

In [24]:
# Evaluating the hold out set predictions  
evaluate_model(y_val, y_pred)

Accuarcy: 0.7808988764044944
Sensitivity: 0.75
Specificity: 0.8
False Positive Rate: 0.2
Precision: 0.6986301369863014
F1 Score: 0.7234042553191489


In [25]:
add_model_to_df(y_val, y_pred, models, 'SGD Classfier')

### Gradient Boosting

In [26]:
# Instantiate the GradientBoostingClassifier  model using the default parameters
grad = GradientBoostingClassifier(random_state=42)

# 10-fold stratified cross-validation with 3 repeats
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# Evaluate the accuarcy of the naive model using cross-validation 
scores = cross_val_score(grad, X_train, y_train, cv=cv, scoring='accuracy')
print('Average Score:', scores.mean())

Average Score: 0.8256194574856547


In [27]:
# Define the parameter values that need to be searched
max_features = ['auto', 'sqrt', 'log2']
loss = ['deviance', 'exponential']
learning_rate = [0.001, 0.01, 0.1, 1]
n_estimators = [50, 100, 200, 400, 800]

In [28]:
# Map parameter names to the values that need to be searched
param_dict = dict(max_features=max_features, loss=loss, learning_rate=learning_rate, n_estimators=n_estimators)

In [29]:
# Instantiate the grid
grid_grad = GridSearchCV(grad, param_dict, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit the grid to the training data
grid_grad.fit(X_train, y_train)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=42),
             estimator=GradientBoostingClassifier(random_state=42), n_jobs=-1,
             param_grid={'learning_rate': [0.001, 0.01, 0.1, 1],
                         'loss': ['deviance', 'exponential'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [50, 100, 200, 400, 800]},
             scoring='accuracy')

In [30]:
# Best model score and parameters
print(grid_grad.best_score_)
print(grid_grad.best_params_)

0.8327073552425664
{'learning_rate': 0.1, 'loss': 'deviance', 'max_features': 'sqrt', 'n_estimators': 400}


In [31]:
# Instantiate the GradientBoostingClassifier model using the best known parameters
grad = GradientBoostingClassifier(learning_rate=0.1, loss='deviance', max_features='sqrt', n_estimators=400, random_state=42)

# Evaluate the accuracy of the tuned model using cross-validation
scores = cross_val_score(grad, X_train, y_train, cv=cv, scoring='accuracy')
print('Accuracy (Tuned):', scores.mean())

Accuracy (Tuned): 0.8327073552425664


In [32]:
# Fit the gradient boosting classifier on the training data, using the best known parameters
grad.fit(X_train, y_train)

# Testing the model on the hold out set, since they are truly out of sample  
y_pred = grad.predict(X_val)

In [33]:
# Evaluating the hold out set predictions  
evaluate_model(y_val, y_pred)

Accuarcy: 0.8258426966292135
Sensitivity: 0.7352941176470589
Specificity: 0.8818181818181818
False Positive Rate: 0.11818181818181818
Precision: 0.7936507936507936
F1 Score: 0.7633587786259542


In [34]:
add_model_to_df(y_val, y_pred, models, 'Gradient Boosting')

### Decision Tree Classifier

In [35]:
# Instantiate the DecisionTreeClassifier  model using the default parameters
tree = DecisionTreeClassifier(random_state=42)

# 10-fold stratified cross-validation with 3 repeats
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# Evaluate the accuarcy of the naive model using cross-validation 
scores = cross_val_score(tree, X_train, y_train, cv=cv, scoring='accuracy')
print('Accuracy (Default):', scores.mean())

Accuracy (Default): 0.7839136671883152


In [36]:
# Define the parameter values that need to be searched
criterion = ['gini', 'entropy']
max_depth = list(range(1, 10))
min_samples_split = list(range(1, 10))
min_samples_leaf = list(range(1, 5))

In [37]:
# Map parameter names to the values that need to be searched
param_dict = dict(criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

In [38]:
# Instantiate the grid
grid_tree = GridSearchCV(tree, param_dict, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit the grid to the training data
grid_tree.fit(X_train, y_train)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=42),
             estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'min_samples_leaf': [1, 2, 3, 4],
                         'min_samples_split': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
             scoring='accuracy')

In [39]:
# Best model score and parameters
print(grid_tree.best_score_)
print(grid_tree.best_params_)

0.8293883672404797
{'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [40]:
# Instantiate the DecisionTreeClassifier  model using the best known parameters
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=1, min_samples_split=2, random_state=42)

# Evaluate the accuracy of the tuned model using cross-validation
scores = cross_val_score(tree, X_train, y_train, cv=cv, scoring='accuracy')
print('Accuracy (Tuned):', scores.mean())

Accuracy (Tuned): 0.8293883672404797


In [41]:
# Fit the decision tree classifier on the training data, using the best known parameters
tree.fit(X_train, y_train)

# Testing the model on the hold out set, since they are truly out of sample  
y_pred = tree.predict(X_val)

In [42]:
# Evaluating the hold out set predictions  
evaluate_model(y_val, y_pred)

Accuarcy: 0.8146067415730337
Sensitivity: 0.7058823529411765
Specificity: 0.8818181818181818
False Positive Rate: 0.11818181818181818
Precision: 0.7868852459016393
F1 Score: 0.7441860465116278


In [43]:
add_model_to_df(y_val, y_pred, models, 'Decision Trees')

### K Nearest Neighbors

In [44]:
# Instantiate the KNeighborsClassifier model using the default parameters
knn = KNeighborsClassifier()

# 10-fold stratified cross-validation with 3 repeats
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# Evaluate the accuarcy of the naive model using cross-validation 
scores = cross_val_score(knn, X_train, y_train, cv=cv, scoring='accuracy')
print('Average Score:', scores.mean())

Average Score: 0.804094940010433


In [45]:
# Define the parameter values that need to be searched
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p = [1,2]

In [46]:
# Map parameter names to the values that need to be searched
param_dict = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

In [47]:
# Instantiate the grid
grid_knn = GridSearchCV(knn, param_dict, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit the grid to the training data
grid_knn.fit(X_train, y_train)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=42),
             estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29],
                         'p': [1, 2]},
             scoring='accuracy')

In [48]:
# Best model score and parameters
print(grid_knn.best_score_)
print(grid_knn.best_params_)

0.8251369327073552
{'leaf_size': 1, 'n_neighbors': 15, 'p': 1}


In [49]:
# Instantiate the KNeighborsClassifier model using the best known parameters
knn = KNeighborsClassifier(leaf_size=1, n_neighbors=15, p=1)

# Evaluate the accuracy of the tuned model using cross-validation
scores = cross_val_score(knn, X_train, y_train, cv=cv, scoring='accuracy')
print('Average Score:', scores.mean())

Average Score: 0.8251369327073552


In [50]:
# Fit the knn classifier on the training data, using the best known parameters
knn.fit(X_train, y_train)

# Testing the model on the hold out set, since they are truly out of sample  
y_pred = knn.predict(X_val)

In [51]:
# Evaluating the hold out set predictions  
evaluate_model(y_val, y_pred)

Accuarcy: 0.8033707865168539
Sensitivity: 0.6029411764705882
Specificity: 0.9272727272727272
False Positive Rate: 0.07272727272727272
Precision: 0.8367346938775511
F1 Score: 0.7008547008547009


In [52]:
add_model_to_df(y_val, y_pred, models, 'KNN')

### Support Vector Machines

In [53]:
# Instantiate the SVC(Support Vector Classifier) model using the default parameters
svc = SVC(random_state=42)

# 10-fold stratified cross-validation with 3 repeats
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# Evaluate the accuarcy of the naive model using cross-validation 
scores = cross_val_score(svc, X_train, y_train, cv=cv, scoring='accuracy')
print('Average Score:', scores.mean())

Average Score: 0.8176317162232656


In [54]:
# Define the parameter values that need to be searched
C = [0.001, 0.01, 0.1, 1, 3, 5, 10]
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
gamma = ['scale', 'auto']

In [55]:
# Map parameter names to the values that need to be searched
param_dict = dict(C=C, gamma=gamma, kernel=kernel)

In [56]:
# Instantiate the grid
grid_svc = GridSearchCV(svc, param_dict, cv=cv, scoring='accuracy', n_jobs=-1)

# Fit the grid to the training data
grid_svc.fit(X_train, y_train)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=42),
             estimator=SVC(random_state=42), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 3, 5, 10],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             scoring='accuracy')

In [57]:
# Best model score and parameters
print(grid_svc.best_score_)
print(grid_svc.best_params_)

0.8279407929055816
{'C': 3, 'gamma': 'scale', 'kernel': 'rbf'}


In [58]:
# Instantiate the SVC(Support Vector Classifier) model using the best known parameters
svc = SVC(kernel='rbf', C=3, gamma='scale', random_state=42)

# Evaluate the accuracy of the tuned model using cross-validation
scores = cross_val_score(svc, X, y, cv=cv, scoring='accuracy')
print('Average Score:', scores.mean())

Average Score: 0.8245318352059924


In [59]:
# Fit the SVC on the training data, using the best known parameters
svc.fit(X_train, y_train)

# Testing the model on the hold out set, since they are truly out of sample  
y_pred = svc.predict(X_val)

In [60]:
# Evaluating the hold out set predictions  
evaluate_model(y_val, y_pred)

Accuarcy: 0.8089887640449438
Sensitivity: 0.6176470588235294
Specificity: 0.9272727272727272
False Positive Rate: 0.07272727272727272
Precision: 0.84
F1 Score: 0.711864406779661


In [61]:
add_model_to_df(y_val, y_pred, models, 'SVM')

## Model Comparision

In [62]:
models.index = ['Accuarcy', 'Sensitivity', 'Specificity', 'FPR', 'Precision', 'F1 Score']

In [63]:
models.drop('Metrics', axis=1, inplace=True)

In [64]:
models.T

Unnamed: 0,Accuarcy,Sensitivity,Specificity,FPR,Precision,F1 Score
Logistic Regression,0.803371,0.735294,0.845455,0.154545,0.746269,0.740741
SGD Classfier,0.780899,0.75,0.8,0.2,0.69863,0.723404
Gradient Boosting,0.825843,0.735294,0.881818,0.118182,0.793651,0.763359
Decision Trees,0.814607,0.705882,0.881818,0.118182,0.786885,0.744186
KNN,0.803371,0.602941,0.927273,0.072727,0.836735,0.700855
SVM,0.808989,0.617647,0.927273,0.072727,0.84,0.711864


## Saving the Models

In [65]:
with open(r'App/Models/logreg.pickle', 'wb') as f:
    pickle.dump(logreg, f)

In [66]:
with open(r'App/Models/sgd.pickle', 'wb') as f:
    pickle.dump(sgd, f)

In [67]:
with open(r'App/Models/grad.pickle', 'wb') as f:
    pickle.dump(grad, f)

In [68]:
with open(r'App/Models/tree.pickle', 'wb') as f:
    pickle.dump(tree, f)

In [69]:
with open(r'App/Models/knn.pickle', 'wb') as f:
    pickle.dump(knn, f)

In [70]:
with open(r'App/Models/svc.pickle', 'wb') as f:
    pickle.dump(svc, f)