In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


from sklearn.model_selection import train_test_split #splitting data
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer # scoring
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
%matplotlib inline
# Widen pandas' display limits so wide/long frames are shown without truncation.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, 1000)


In [None]:

# Clean data: load the training set and keep only the columns used for modelling.
raw = pd.read_csv('train.csv')

# Drop the columns that are not used as model features.
raw = raw.drop(columns=['Cabin', 'Ticket', 'Name', 'PassengerId'])

# Drop every row that still has a missing value in any remaining column.
raw = raw.dropna(how='any')

# One-hot encode the integer-coded categorical features.
#
# Each indicator is built with an equality test so it is strictly 0/1.
# Mapping with `.replace([levels], [flags])` only rewrites the listed levels and
# leaves every other value untouched, so e.g. a Parch of 3 would end up as the
# number 3 inside Parch_0, Parch_1 and Parch_2. With the equality test, a value
# outside the listed levels yields 0 in every indicator of that feature.
#
# The (feature, levels) order below determines the order of the new columns.
encoded_levels = (
    ('Parch', (0, 1, 2)),
    ('Pclass', (1, 2, 3)),
    ('SibSp', (0, 1, 2, 3, 4, 5)),
)
for feature, levels in encoded_levels:
    for level in levels:
        raw['{}_{}'.format(feature, level)] = (raw[feature] == level).astype('int64')

# The original integer columns are redundant once their indicators exist.
raw = raw.drop(columns=[feature for feature, _ in encoded_levels])


# Set up inputs and outputs.
# X holds every column except the target; the two string-valued columns
# (Embarked, Sex) are one-hot encoded by pandas.
X = raw.drop(columns='Survived').copy()
X_encoded = pd.get_dummies(X, columns=['Embarked', 'Sex'])
y = raw['Survived'].copy()

raw.head()


In [None]:
# Share of positive labels in the target (a survival rate if Survived is coded 0/1).
y.sum() / len(y)

In [None]:
# Split the encoded features and the target into train and test sets.
# `stratify=y` asks scikit-learn to preserve the class proportions of y in both
# parts; `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    stratify=y,
    random_state=42,
)
X_train.shape

In [None]:
# Check that the train set and the test set have (approximately) the same
# share of positive labels.
#
# The two shares are ratios over different sample sizes, so they are almost
# never bit-for-bit equal even when the split is perfectly stratified; an exact
# `==` comparison would report False for a correct split. Compare with a
# tolerance instead.
train_rate = y_train.sum() / len(y_train)
test_rate = y_test.sum() / len(y_test)

# One percentage point of slack absorbs the rounding that stratification
# cannot avoid when the class counts do not divide evenly between the splits.
np.isclose(train_rate, test_rate, atol=0.01)

In [None]:
# Baseline model: XGBoost classifier with default hyper-parameters.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', seed=42)

# Fit with early stopping: training halts once 'aucpr' on the evaluation set
# has not improved for 10 consecutive rounds.
# NOTE(review): the evaluation set here is the test split, so the stopping
# point is chosen using test data - confirm this is intended.
clf_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='aucpr',
    early_stopping_rounds=10,
    verbose=True,
)

In [None]:
# Confusion matrix of the baseline model on the test split; 'd' formats the
# cell values as integers.
plot_confusion_matrix(estimator=clf_xgb, X=X_test, y_true=y_test, values_format='d')

In [None]:
# Optimize hyper-parameters with a cross-validated grid search.
#
# The learning-rate key must be spelled 'learning_rate'. The misspelling
# 'learn_rate' is not an XGBoost parameter: the scikit-learn wrapper accepts
# unrecognised keyword arguments without raising, so every candidate would be
# trained with the default learning rate and the search would not optimise it.

## Step 1: first grid (defined for reference; not the one passed to the search below)
param_grid1 = {
    'max_depth': [4, 5, 6, 7],
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'gamma': [0, 0.01, 0.25, 1],
    'reg_lambda': [0, 0.01, 0.11, 5],
    'scale_pos_weight': [1, 3, 5]
}

## Step 2: second grid (the one actually searched)
param_grid2 = {
    'max_depth': [7],
    'learning_rate': [0.001, 0.01, 0.1],
    'gamma': [0.02, 0.25, 1],
    'reg_lambda': [5],
    'scale_pos_weight': [1, 3]
}


# Candidates are scored by ROC AUC over 10 cross-validation folds.
# subsample / colsample_bytree are fixed for every candidate.
optimal_params = GridSearchCV(
    estimator=xgb.XGBClassifier(objective='binary:logistic',
                                seed=42,
                                subsample=0.9,
                                colsample_bytree=0.5),
    param_grid=param_grid2,
    scoring='roc_auc',
    verbose=0,
    n_jobs=10,
    cv=10
)


# The extra keyword arguments are forwarded to each candidate's fit().
# NOTE(review): eval_set is the test split, so early stopping inside every CV
# fit is driven by test data - consider a separate validation split.
optimal_params.fit(
    X_train,
    y_train,
    verbose=True,
    early_stopping_rounds=10,
    eval_metric='aucpr',
    eval_set=[(X_test, y_test)]
)

print(optimal_params.best_params_)

In [None]:
# Final model, built from hand-picked hyper-parameters.
#
# The learning rate must be passed as `learning_rate`; `learn_rate` is not an
# XGBoost parameter and is not applied, which leaves the default rate in place.
# NOTE(review): 0.001 is the value written in the notebook; if it was selected
# while the misspelled keyword was being ignored, it was never actually
# evaluated - re-run the search and confirm the value before relying on it.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic',
                            seed=42,
                            gamma=0.25,
                            learning_rate=0.001,
                            max_depth=6,
                            reg_lambda=5,
                            scale_pos_weight=3,
                            subsample=0.9,
                            colsample_bytree=0.5)

# Fit with early stopping on 'aucpr' (stop after 10 rounds without improvement).
# NOTE(review): the evaluation set is the test split - confirm this is intended.
clf_xgb.fit(X_train,
            y_train,
            verbose=True,
            early_stopping_rounds=10,
            eval_metric='aucpr',
            eval_set=[(X_test, y_test)])

In [None]:
# Confusion matrix of the current `clf_xgb` on the test split; 'd' formats the
# cell values as integers.
plot_confusion_matrix(estimator=clf_xgb, X=X_test, y_true=y_test, values_format='d')