In [39]:
## Purpose: The purpose of this notebook is to build 4 classifier models, random forest, gradient boosted decision tree, log regression, and svc
## Goal: the goal is to find a model with >80% accuracy and high precision 

In [40]:
## import usual packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
## import machine learning packages
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

## Random Forest

In [42]:
## import dataset
tree = pd.read_csv("train_tree.csv")
tree.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Family_size,Parent/Child?,Sex_male,Embarked_Q,Embarked_S,Prefix_Mr,Prefix_Mrs,Prefix_Ms,Sex_pclass_B,Sex_pclass_C,Sex_pclass_D,Sex_pclass_E,Sex_pclass_F
0,0,3,22.0,7.25,1,0,1,0,1,1,0,0,0,0,0,0,1
1,1,1,38.0,71.2833,1,0,0,0,0,0,1,0,0,0,0,0,0
2,1,3,26.0,7.925,0,0,0,0,1,0,0,1,0,1,0,0,0
3,1,1,35.0,53.1,1,0,0,0,1,0,1,0,0,0,0,0,0
4,0,3,35.0,8.05,0,0,1,0,1,1,0,0,0,0,0,0,1


In [43]:
## split into X and y 
X = tree.iloc[:, 1:]
y = tree.iloc[:, 0]

In [44]:
X.head()

Unnamed: 0,Pclass,Age,Fare,Family_size,Parent/Child?,Sex_male,Embarked_Q,Embarked_S,Prefix_Mr,Prefix_Mrs,Prefix_Ms,Sex_pclass_B,Sex_pclass_C,Sex_pclass_D,Sex_pclass_E,Sex_pclass_F
0,3,22.0,7.25,1,0,1,0,1,1,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,0,0,0,1,0,0,0,0,0,0
2,3,26.0,7.925,0,0,0,0,1,0,0,1,0,1,0,0,0
3,1,35.0,53.1,1,0,0,0,1,0,1,0,0,0,0,0,0
4,3,35.0,8.05,0,0,1,0,1,1,0,0,0,0,0,0,1


In [45]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [46]:
## split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [47]:
X_train.shape

(535, 16)

In [48]:
## instantiate and fit Random Forest model
forest = RandomForestClassifier(n_estimators = 10000, max_features = 4, max_depth = 4)
forest.fit(X_train, y_train)
predictions = forest.predict(X_test)

In [49]:
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))

Accuracy on training set: 0.839
Accuracy on test set: 0.771


In [50]:
## Print confusion matrix - FPs are top right (trying to limit these)
confusion = confusion_matrix(y_test, predictions)
confusion

array([[99,  5],
       [36, 39]], dtype=int64)

In [51]:
## Max accuracy using decision tree seems to be between 78 - 80%

## Gradient Boosted decision tree

In [52]:
## We can use the same train and test sets from above

In [53]:
## instantiate model
gbrt = GradientBoostingClassifier(max_depth = 2, learning_rate = .1)

In [54]:
## Fit model
gbrt.fit(X_train, y_train)
predictions = gbrt.predict(X_test)

In [55]:
## Get accuracy
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))

Accuracy on training set: 0.880
Accuracy on test set: 0.821


In [56]:
## Significantly higher than RF

In [57]:
## Print confusion matrix - FPs are top right (trying to limit these)
confusion = confusion_matrix(y_test, predictions)
confusion

array([[95,  9],
       [23, 52]], dtype=int64)

## Logistic Regression

In [92]:
## Read in linear test set
train_linear = pd.read_csv("train_linear.csv")
train_linear.head()

Unnamed: 0,Survived,Age,Fare,Family_size,Sex_male,Pclass_2,Pclass_3,Prefix_Mr,Prefix_Mrs,Prefix_Ms,Sex_pclass_B,Sex_pclass_C,Sex_pclass_D,Sex_pclass_E,Sex_pclass_F
0,0,22.0,7.25,1,1,0,1,1,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,0,0,1,0,0,0,0,0,0
2,1,26.0,7.925,0,0,0,1,0,0,1,0,1,0,0,0
3,1,35.0,53.1,1,0,0,0,0,1,0,0,0,0,0,0
4,0,35.0,8.05,0,1,0,1,1,0,0,0,0,0,0,1


In [93]:
## split data into X and y
X = train_linear.iloc[:, 1:]
y = train_linear.iloc[:, 0]

In [94]:
## Take log of age and fare
X['Age'] = np.log(X['Age'])
X['Fare'] = np.log(X['Fare'] +1)

In [95]:
## Scale data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [96]:
## Create bins for fare and age categories
kb = KBinsDiscretizer(n_bins = 5, strategy = 'uniform')
X_binned = kb.fit_transform(X_scaled)

In [97]:
## split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_binned, y)

In [98]:
## instantiate model
logreg = LogisticRegression(C = 10, penalty = 'l2', solver = 'liblinear').fit(X_train, y_train)
predictions = logreg.predict(X_test)

In [99]:
print("Accuracy on training set: {:.3f}".format(logreg.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(logreg.score(X_test, y_test)))

Accuracy on training set: 0.824
Accuracy on test set: 0.821


In [100]:
## Print confusion matrix - FPs are top right (trying to limit these)
confusion = confusion_matrix(y_test, predictions)
confusion

array([[103,   2],
       [ 30,  44]], dtype=int64)

## SVC

In [113]:
## Read in linear test set
train_linear = pd.read_csv("train_linear.csv")
train_linear.head()

Unnamed: 0,Survived,Age,Fare,Family_size,Sex_male,Pclass_2,Pclass_3,Prefix_Mr,Prefix_Mrs,Prefix_Ms,Sex_pclass_B,Sex_pclass_C,Sex_pclass_D,Sex_pclass_E,Sex_pclass_F
0,0,22.0,7.25,1,1,0,1,1,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,0,0,1,0,0,0,0,0,0
2,1,26.0,7.925,0,0,0,1,0,0,1,0,1,0,0,0
3,1,35.0,53.1,1,0,0,0,0,1,0,0,0,0,0,0
4,0,35.0,8.05,0,1,0,1,1,0,0,0,0,0,0,1


In [114]:
## split data into X and y
X = train_linear.iloc[:, 1:]
y = train_linear.iloc[:, 0]

In [115]:
## Try log for SVC
X['Age'] = np.log(X['Age'])
X['Fare'] = np.log(X['Fare'] +1)

In [116]:
## Scale data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [117]:
## Create bins for fare and age categories
kb = KBinsDiscretizer(n_bins = 5, strategy = 'uniform')
X_binned = kb.fit_transform(X_scaled)

In [118]:
## split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

In [119]:
## instantiate and fit svc model
svm = SVC(kernel = 'rbf', C = 1, gamma = 10).fit(X_train, y_train)
predictions = svm.predict(X_test)

In [120]:
## Get accuracy
print("Accuracy on training set: {:.3f}".format(svm.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(svm.score(X_test, y_test)))

Accuracy on training set: 0.839
Accuracy on test set: 0.832


In [121]:
## Looks like logreg and svc are able to perform around the same level (anywhere from .77 to .84)

In [122]:
## Print confusion matrix - FPs are top right (trying to limit these)
confusion = confusion_matrix(y_test, predictions)
confusion

array([[95,  8],
       [22, 54]], dtype=int64)

## Model Evaluation and Improvement

In [78]:
## Every time the notebook is run, the accuracy changes for the models. Some runs they all are around 77%, some runs over 80%
## Ideally, running cross validation will correct for this and give us a better idea of what model actually works best

In [79]:
## import necessary packages
## StratifiedKFold will be use becuase we are evaluating classifiers and the data is disproprtional 
from sklearn.model_selection import StratifiedKFold

## RF Grid Search and Cross-Validation

In [152]:
## We will now split into train, validation, and test sets
X = tree.iloc[:, 1:]
y = tree.iloc[:, 0]

In [153]:
## Split into training+validation and test sets
X_trainval, X_test, y_trainval, y_test = train_test_split(X,y)

## Split trainval into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval)

In [154]:
## Create grid search for loop
best_score = 0

for max_features in [1,2,3,4,5]:
    for max_depth in [1,2,3,4,5]: 
        ##instantiate new model
        forest = RandomForestClassifier(n_estimators = 1000, max_features = max_features, max_depth = max_depth)
        ## perform cross-vald
        skf = StratifiedKFold(n_splits = 5) 
        scores_skf = cross_val_score(forest, X_trainval, y_trainval, cv = skf)
        ## compute mean cross-val accuracy
        score_skf = np.mean(scores_skf)
        ## if score improved, store the score and parameters
        if score_skf > best_score: 
            best_score = score_skf
            best_parameters = {'max_features': max_features, 'max_depth': max_depth}

## Print best score         
print("Best Score: {:.3f}".format(best_score))

## Rebuild model with best parameters 
forest = RandomForestClassifier(n_estimators = 10000, **best_parameters)
forest.fit(X_trainval, y_trainval)
predictions = forest.predict(X_test)

## Get accuracy on test set (Training set accuracy is best score above)
print("Accuracy on test set: {:.3f}".format(forest.score(X_test,y_test)))

## Print confusion matrix - FPs are top right (trying to limit these)
confusion = confusion_matrix(y_test, predictions)
confusion

Best Score: 0.813
Accuracy on test set: 0.777


array([[98,  9],
       [31, 41]], dtype=int64)

## GBR Grid Search and Cross-Validation

In [150]:
## Will use the same splits as above

In [155]:
best_score = 0

for max_depth in [1,2,3,4,5,6]:
    for learning_rate in [.001,.01,.1,1,10,100]:
        ## instantiate new model
        gbr = GradientBoostingClassifier(max_depth = max_depth, learning_rate = learning_rate)
        ## perform cross-validation
        skf = StratifiedKFold(n_splits = 5)
        scores_skf = cross_val_score(gbr, X_trainval, y_trainval, cv = skf)
        ## compute mean cross-val accuracy
        score_skf = np.mean(scores_skf)
        ## if score improved, store the score and parameters
        if score_skf > best_score:
            best_score = score_skf
            best_parameters = {'max_depth': max_depth, 'learning_rate': learning_rate}
            
## Print best score
print("Best Score: {:.3f}".format(best_score))

## Rebuild model with best parameters
gbr = GradientBoostingClassifier(**best_parameters)
gbr.fit(X_trainval, y_trainval)
predictions = gbr.predict(X_test)

## Get accuracy on test set
print("Accuracy on test set: {:.3f}".format(gbr.score(X_test, y_test)))
print("Best parameters: ", best_parameters)

## Print confusion matrix - FPs are top right (trying to limit these)
confusion = confusion_matrix(y_test, predictions)
confusion

Best Score: 0.821
Accuracy on test set: 0.771
Best parameters:  {'max_depth': 4, 'learning_rate': 0.01}


array([[103,   4],
       [ 37,  35]], dtype=int64)

## Log Reg Grid Search and Cross-Validation

In [156]:
## split data into X and y
X = train_linear.iloc[:, 1:]
y = train_linear.iloc[:, 0]

In [157]:
## Take log of age and fare
X['Age'] = np.log(X['Age'])
X['Fare'] = np.log(X['Fare'] +1)

In [158]:
## Data transformations

## Scale data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

## Create bins for fare and age categories
kb = KBinsDiscretizer(n_bins = 5, strategy = 'uniform')
X_binned = kb.fit_transform(X_scaled)


In [159]:
## split into train and test sets
X_trainval, X_test, y_trainval, y_test = train_test_split(X_binned, y)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval)

In [160]:
best_score = 0

for C in [.01, .1, 1, 10, 100]:
    for penalty in ['l1', 'l2']:
                
        ## instantiate model
        logreg = LogisticRegression(C = C, penalty = penalty, solver = 'liblinear')
            
        ## perform cross-validation
        skf = StratifiedKFold(n_splits = 5)
        scores_skf = cross_val_score(logreg, X_trainval, y_trainval, cv = skf)
            
        ## get mean scores
        score_skf = np.mean(scores_skf)
            
         ## if score improved, store the score and parameters
        if score_skf > best_score:
            best_score = score_skf
            best_parameters = {'C': C, 'penalty': penalty}
            best_transform = transform
            
## Print best score and best parameters
print("Best Score: {:.3f}".format(best_score))
print("Best parameters: ", best_parameters)

## Rebuild model with best parameters
logreg = LogisticRegression(**best_parameters, solver = 'liblinear')
logreg.fit(X_trainval, y_trainval)
predictions = logreg.predict(X_test)

## Get accuracy on test set
print("Accuracy on test set: {:.3f}".format(logreg.score(X_test, y_test)))

## Print confusion matrix - FPs are top right (trying to limit these)
confusion = confusion_matrix(y_test, predictions)
confusion

Best Score: 0.811
Best parameters:  {'C': 10, 'penalty': 'l1'}
Accuracy on test set: 0.788


array([[100,   9],
       [ 29,  41]], dtype=int64)

## SVC Grid Search and Cross-Validation

In [228]:
## Perform the same data transformations as LogReg

## split data into X and y
X = train_linear.iloc[:, 1:]
y = train_linear.iloc[:, 0]

## Take log of age and fare
X['Age'] = np.log(X['Age'])
X['Fare'] = np.log(X['Fare'] +1)

## Data transformations

## Scale data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

## Create bins for fare and age categories
kb = KBinsDiscretizer(n_bins = 5, strategy = 'uniform')
X_binned = kb.fit_transform(X_scaled)


In [229]:
## split into train and test sets
X_trainval, X_test, y_trainval, y_test = train_test_split(X_binned, y)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval)

In [230]:
best_score = 0

## Create grid search 
for C in [.01, .1, 1, 10, 100]:
    for gamma in [.01, .1, 1, 10, 100]:
        
        ##instantiate model
        svc = SVC(kernel = 'rbf', C = C, gamma = gamma)
        
        ## perform cross-validation
        skf = StratifiedKFold(n_splits = 5)
        scores_skf = cross_val_score(svc, X_trainval, y_trainval, cv = skf)
            
        ## get mean scores
        score_skf = np.mean(scores_skf)
            
         ## if score improved, store the score and parameters
        if score_skf > best_score:
            best_score = score_skf
            best_parameters = {'C': C, 'gamma': gamma}
            best_transform = transform
            
## Print best score and best parameters
print("Best Score: {:.3f}".format(best_score))
print("Best parameters: ", best_parameters)

## Rebuild model with best parameters
svc = SVC(**best_parameters, kernel = 'rbf')
svc.fit(X_trainval, y_trainval)
predictions = svc.predict(X_test)

## Get accuracy on test set
print("Accuracy on test set: {:.3f}".format(svc.score(X_test, y_test)))

## Print confusion matrix - FPs are top right (trying to limit these)
confusion = confusion_matrix(y_test, predictions)
confusion
        

Best Score: 0.807
Best parameters:  {'C': 100, 'gamma': 0.01}
Accuracy on test set: 0.832


array([[105,   5],
       [ 25,  44]], dtype=int64)

In [214]:
## Conclusion: SVC performed by far the best (.832), and only had 5 FPs. Will use for final model

## Final Test Set Predictions

In [248]:
## import final test dataframe from cleaning notebook
test_final = pd.read_csv('test_cleaned.csv')
test_final.head()

Unnamed: 0,Age,Fare,Family_size,Sex_male,Pclass_2,Pclass_3,Prefix_Mr,Prefix_Mrs,Prefix_Ms,Sex_pclass_B,Sex_pclass_C,Sex_pclass_D,Sex_pclass_E,Sex_pclass_F
0,34.5,7.8292,0,1,0,1,1,0,0,0,0,0,0,1
1,47.0,7.0,1,0,0,1,0,1,0,0,1,0,0,0
2,62.0,9.6875,0,1,1,0,1,0,0,0,0,0,1,0
3,27.0,8.6625,0,1,0,1,1,0,0,0,0,0,0,1
4,22.0,12.2875,2,0,0,1,0,1,0,0,1,0,0,0


In [249]:
## Take log of age and fare
test_final['Age'] = np.log(test_final['Age'])
test_final['Fare'] = np.log(test_final['Fare'] +1)

In [250]:
## Transform test set
test_final = scaler.transform(test_final)
test_final = kb.transform(test_scaled)

In [251]:
## Make predictions
predictions_svc = svc.predict(test_final)
predictions_svc

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [252]:
predictions_svc = pd.Series(predictions_svc)
predictions_svc = pd.DataFrame(predictions_svc)
predictions_svc.columns = ['Survived']
predictions_svc.head()

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,1


In [236]:
## import passenger ids
passengers = pd.read_csv("passengers.csv")
passengers.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [253]:
predictions_svc = pd.concat([passengers, predictions_svc], axis = 1)

In [254]:
predictions_svc.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [258]:
predictions_svc.to_csv("predictions_svc.csv", index = False)