In [229]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [230]:
# Read the Titanic CSV from its local location into the shared frame `df`
TITANIC_CSV = r'C:\Users\marku\Desktop\ML\MLGit\datasets\titanic.csv'
df = pd.read_csv(TITANIC_CSV)

In [231]:
def handle_null_median(df):
    """Fill missing values in ``Fare``, ``Age`` and ``Embarked`` and return the frame.

    ``Fare`` and ``Age`` nulls are replaced with the median of their own column;
    ``Embarked`` nulls are replaced with the literal ``'S'``.

    The columns are reassigned on ``df`` (``df[col] = df[col].fillna(...)``)
    rather than calling ``fillna(..., inplace=True)`` on ``df[col]``. The latter
    is a chained in-place update: it raises a FutureWarning on pandas 2.x and,
    with Copy-on-Write enabled, only changes a temporary object, leaving the
    nulls in ``df`` untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Fare``, ``Age`` and ``Embarked`` columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns filled.
    """
    for column in ('Fare', 'Age'):
        df[column] = df[column].fillna(df[column].median())

    df['Embarked'] = df['Embarked'].fillna('S')

    return df

# Impute missing values first so the derived columns below contain no nulls.
df = handle_null_median(df)
# Encode Sex with an explicit mapping. Series.replace(['female', 'male'], [0, 1])
# depends on implicit object -> int downcasting, which newer pandas deprecates
# (FutureWarning). Without the downcast the column stays object dtype and the
# later pd.get_dummies calls would one-hot encode it instead of keeping one 0/1 column.
# NOTE: unlike replace, map turns any value other than 'female'/'male' into NaN.
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})
# Parch + SibSp + 1; the +1 presumably counts the passenger themself.
df['FamilyMembersCount'] = df['Parch'] + df['SibSp'] + 1
# Drop the columns not used as features, including the two just folded into
# FamilyMembersCount.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'])

In [232]:
def bins(df):
    """Add an integer ``AgeGroup`` column (0-4) derived from ``Age``.

    A light form of feature engineering: ages are bucketed by four fixed
    lower edges. A row gets the label of the highest edge its age reaches;
    ages below the first edge, and missing ages, keep label 0.

    ``Fare`` is not binned here (it could be, but it is unclear whether
    Fare is needed at all).

    The frame is modified in place and also returned.
    """
    lower_edges = (16.336, 32.252, 48.168, 64.084)
    age = df['Age']

    df['AgeGroup'] = 0
    # Walk the edges in ascending order: every later (higher) edge overwrites
    # the label written by the earlier ones for the rows that reach it.
    for label, edge in enumerate(lower_edges, start=1):
        df.loc[age >= edge, 'AgeGroup'] = label

    return df

# Add the AgeGroup column to the shared frame that every model section copies
df = bins(df)

# XGBoost

In [233]:
# Independent copy so the XGBoost-specific preprocessing leaves `df` untouched
df_XGB = df.copy(deep=True)

In [234]:
# Age is already represented by AgeGroup; Fare is left out of this model
df_XGB = df_XGB.drop(columns=['Fare', 'Age'])

In [235]:
# Cast the discrete columns to category so get_dummies one-hot encodes all three
df_XGB = df_XGB.astype({
    'Pclass': 'category',
    'Embarked': 'category',
    'AgeGroup': 'category',
})

df_XGB = pd.get_dummies(data=df_XGB)

df_XGB.head(n=5)

Unnamed: 0,Survived,Sex,FamilyMembersCount,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,AgeGroup_0,AgeGroup_1,AgeGroup_2,AgeGroup_3,AgeGroup_4
0,0,1,2,0,0,1,0,0,1,0,1,0,0,0
1,1,0,2,1,0,0,1,0,0,0,0,1,0,0
2,1,0,1,0,0,1,0,0,1,0,1,0,0,0
3,1,0,2,1,0,0,0,0,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0,1,0,0,1,0,0


In [236]:
# Separate the label from the predictors and hold out a test split
targets = df_XGB['Survived']
features = df_XGB.drop(columns='Survived')
train_X, test_X, train_y, test_y = train_test_split(
    features, targets, random_state=42
)

# Baseline: XGBoost with default hyperparameters, cross-validated on the training split
XGBModel = XGBClassifier(verbosity=0)

XGB_scores = cross_val_score(estimator=XGBModel, X=train_X, y=train_y)
XGB_scores.mean(), XGB_scores.std()

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


(0.8069240264841208, 0.030186824765872047)

In [237]:
# Fit on the whole training split, then score on the held-out split
pred = XGBModel.fit(train_X, train_y).predict(test_X)
accuracy_score(y_true=test_y, y_pred=pred)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


0.8071748878923767

# XGB TUNING

In [None]:
# Search 1: max_depth and min_child_weight, with the other settings held fixed
XGBParam1 = {
    'max_depth': list(range(2, 11, 2)),
    'min_child_weight': list(range(2, 9, 2)),
}

XGB_Grid1 = GridSearchCV(
    estimator=XGBClassifier(
        verbosity=0,
        gamma=0,
        subsample=0.6,
        learning_rate=0.1,
        n_estimators=200,
    ),
    param_grid=XGBParam1,
)
XGB_Grid1.fit(train_X, train_y)
print(XGB_Grid1.best_params_, XGB_Grid1.best_score_)

In [None]:
# Search 2: gamma, with tree shape, subsample, learning rate and tree count fixed
XGBParam2 = {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]}

XGB_Grid2 = GridSearchCV(
    estimator=XGBClassifier(
        verbosity=0,
        max_depth=6,
        min_child_weight=6,
        subsample=0.6,
        learning_rate=0.1,
        n_estimators=200,
    ),
    param_grid=XGBParam2,
)
XGB_Grid2.fit(train_X, train_y)
print(XGB_Grid2.best_params_, XGB_Grid2.best_score_)

In [None]:
# Search 3: row subsample ratio, everything else fixed
XGBParam3 = {'subsample': [0.5, 0.6, 0.7]}

XGB_Grid3 = GridSearchCV(
    estimator=XGBClassifier(
        verbosity=0,
        max_depth=6,
        min_child_weight=6,
        gamma=0,
        learning_rate=0.04,
        n_estimators=200,
    ),
    param_grid=XGBParam3,
)
XGB_Grid3.fit(train_X, train_y)
print(XGB_Grid3.best_params_, XGB_Grid3.best_score_)

In [None]:
# Search 4: learning rate, everything else fixed
XGBParam4 = {'learning_rate': [0.03, 0.04, 0.06]}

XGB_Grid4 = GridSearchCV(
    estimator=XGBClassifier(
        verbosity=0,
        max_depth=6,
        min_child_weight=6,
        gamma=0,
        subsample=0.7,
        n_estimators=100,
    ),
    param_grid=XGBParam4,
)
XGB_Grid4.fit(train_X, train_y)
print(XGB_Grid4.best_params_, XGB_Grid4.best_score_)

In [None]:
# Search 5: number of boosting rounds, everything else fixed
XGBParam5 = {'n_estimators': [60, 80, 100]}

XGB_Grid5 = GridSearchCV(
    estimator=XGBClassifier(
        verbosity=0,
        max_depth=6,
        min_child_weight=6,
        gamma=0,
        subsample=0.6,
        learning_rate=0.04,
    ),
    param_grid=XGBParam5,
)
XGB_Grid5.fit(train_X, train_y)
print(XGB_Grid5.best_params_, XGB_Grid5.best_score_)

In [None]:
# Rebuild the model with the hand-picked hyperparameters and cross-validate it
XGBModel = XGBClassifier(
    verbosity=0,
    max_depth=6,
    min_child_weight=6,
    gamma=0,
    subsample=0.6,
    learning_rate=0.04,
    n_estimators=80,
)
XGB_scores = cross_val_score(estimator=XGBModel, X=train_X, y=train_y)
XGB_scores.mean(), XGB_scores.std()

In [None]:
# Fit the tuned model on the training split and score it on the held-out split
pred = XGBModel.fit(train_X, train_y).predict(test_X)
accuracy_score(y_true=test_y, y_pred=pred)

# Random Forest Classifier

In [243]:
# Independent copy so the random-forest preprocessing leaves `df` untouched
df_RF = df.copy(deep=True)

In [244]:
# Age is already represented by AgeGroup; Fare is left out of this model
df_RF = df_RF.drop(columns=['Fare', 'Age'])

In [245]:
# Cast the discrete columns to category so get_dummies one-hot encodes all three
df_RF = df_RF.astype({
    'Pclass': 'category',
    'Embarked': 'category',
    'AgeGroup': 'category',
})

df_RF = pd.get_dummies(data=df_RF)

df_RF.head(n=5)

Unnamed: 0,Survived,Sex,FamilyMembersCount,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,AgeGroup_0,AgeGroup_1,AgeGroup_2,AgeGroup_3,AgeGroup_4
0,0,1,2,0,0,1,0,0,1,0,1,0,0,0
1,1,0,2,1,0,0,1,0,0,0,0,1,0,0
2,1,0,1,0,0,1,0,0,1,0,1,0,0,0
3,1,0,2,1,0,0,0,0,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0,1,0,0,1,0,0


In [246]:
# Separate the label from the predictors and hold out a test split
features = df_RF.drop('Survived', axis=1)
targets = df_RF['Survived']
train_X, test_X, train_y, test_y = train_test_split(features, targets, random_state=42)

# Seed the forest as well as the split: an unseeded RandomForestClassifier
# gives different scores on every run, so the numbers could not be reproduced.
RFModel = RandomForestClassifier(random_state=42)

RF_scores = cross_val_score(RFModel, train_X, train_y)
(RF_scores.mean(), RF_scores.std())

(0.794927617551341, 0.03818389929357802)

In [247]:
# Fit on the whole training split, then score on the held-out split
pred = RFModel.fit(train_X, train_y).predict(test_X)
accuracy_score(y_true=test_y, y_pred=pred)

0.8071748878923767

# RF TUNING

In [None]:
# Exhaustive search over the random-forest hyperparameters.
# 0 is not a valid value for min_samples_leaf (an int must be >= 1) nor for
# max_samples (must be positive), so every candidate containing a 0 failed to
# fit and only wasted search time. The invalid entries are removed:
# min_samples_leaf now starts at 1, the smallest valid value, and max_samples
# keeps only the positive fractions.
RFParam = {
    "max_depth": [4, 5, 6],
    "n_estimators": [50, 100, 200, 300],
    "min_samples_leaf": [1, 2, 4, 6, 8],
    "max_samples": [.1, .2, .3]
}

# Seeded estimator so candidates are compared on the same randomness and the
# selected parameters are reproducible.
RF_Grid = GridSearchCV(RandomForestClassifier(random_state=42), RFParam)
RF_Grid.fit(train_X, train_y)
print(RF_Grid.best_params_, RF_Grid.best_score_)

In [None]:
# Rebuild the forest with the hand-picked hyperparameters and cross-validate it.
# random_state is fixed so the reported scores can be reproduced on a re-run.
RFModel = RandomForestClassifier(
    max_depth=5,
    max_samples=0.3,
    min_samples_leaf=2,
    n_estimators=100,
    random_state=42,
)
RF_scores = cross_val_score(RFModel, train_X, train_y)
(RF_scores.mean(), RF_scores.std())

In [None]:
# Fit the tuned forest on the training split and score it on the held-out split
pred = RFModel.fit(train_X, train_y).predict(test_X)
accuracy_score(y_true=test_y, y_pred=pred)

# GaussianNB

In [290]:
# Independent copy so the naive-Bayes preprocessing leaves `df` untouched
df_GNB = df.copy(deep=True)

In [291]:
# Age is already represented by AgeGroup; Fare is left out of this model
df_GNB = df_GNB.drop(columns=['Fare', 'Age'])

In [292]:
# The category casts of Pclass / Embarked / AgeGroup used in the tree-model
# sections are disabled here. As a result get_dummies only expands the string
# column Embarked; Pclass and AgeGroup stay as single integer-coded columns.

df_GNB = pd.get_dummies(df_GNB)

df_GNB.head()

Unnamed: 0,Survived,Pclass,Sex,FamilyMembersCount,AgeGroup,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,2,1,0,0,1
1,1,1,0,2,2,1,0,0
2,1,3,0,1,1,0,0,1
3,1,1,0,2,2,0,0,1
4,0,3,1,1,2,0,0,1


In [293]:
# Separate the label from the predictors and hold out a test split
targets = df_GNB['Survived']
features = df_GNB.drop(columns='Survived')
train_X, test_X, train_y, test_y = train_test_split(
    features, targets, random_state=42
)

# Baseline: Gaussian naive Bayes with default settings, cross-validated on the training split
GNBModel = GaussianNB()

GNB_scores = cross_val_score(estimator=GNBModel, X=train_X, y=train_y)
GNB_scores.mean(), GNB_scores.std()

(0.7574570755246325, 0.019572152841336187)

In [294]:
# Fit on the whole training split, then score on the held-out split
pred = GNBModel.fit(train_X, train_y).predict(test_X)
accuracy_score(y_true=test_y, y_pred=pred)

0.757847533632287

# GNB TUNING

In [None]:
# Search over the variance-smoothing term, the only hyperparameter tuned for GaussianNB here
GNBParam = {'var_smoothing': [1e-12, 1e-10, 1e-8, 1e-6, 1e-5, 1e-4, 1e-2]}

GNB_Grid = GridSearchCV(estimator=GaussianNB(), param_grid=GNBParam)
GNB_Grid.fit(train_X, train_y)
print(GNB_Grid.best_params_, GNB_Grid.best_score_)

In [None]:
# Rebuild the model with the hand-picked smoothing value and cross-validate it
GNBModel = GaussianNB(var_smoothing=1e-6)
GNB_scores = cross_val_score(estimator=GNBModel, X=train_X, y=train_y)
GNB_scores.mean(), GNB_scores.std()

In [None]:
# Fit the tuned model on the training split and score it on the held-out split
pred = GNBModel.fit(train_X, train_y).predict(test_X)
accuracy_score(y_true=test_y, y_pred=pred)

# Support Vector Machine

In [295]:
# Independent copy so the SVM-specific preprocessing leaves `df` untouched
df_SVM = df.copy(deep=True)

In [296]:
# Unlike the other models, the SVM keeps the raw Age and drops the binned AgeGroup
df_SVM = df_SVM.drop(columns=['Fare', 'AgeGroup'])

In [297]:
# One-hot encode the remaining string column(s); numeric columns pass through unchanged
df_SVM = pd.get_dummies(data=df_SVM)

df_SVM.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,FamilyMembersCount,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,2,0,0,1
1,1,1,0,38.0,2,1,0,0
2,1,3,0,26.0,1,0,0,1
3,1,1,0,35.0,2,0,0,1
4,0,3,1,35.0,1,0,0,1


In [298]:
features = df_SVM.drop('Survived', axis=1)
targets = df_SVM['Survived']

# Split BEFORE scaling. Fitting RobustScaler on all rows lets the median/IQR
# of the held-out rows influence the training features (data leakage), which
# makes the test accuracy optimistic.
train_X, test_X, train_y, test_y = train_test_split(features, targets, random_state=42)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the test split.
# NOTE: cross_val_score / GridSearchCV on train_X still share these statistics
# across folds; wrapping scaler + SVC in a Pipeline would remove that as well.
scaler = RobustScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [299]:
# Baseline: SVC with default hyperparameters, cross-validated on the training split
SVMModel = SVC()

SVM_scores = cross_val_score(estimator=SVMModel, X=train_X, y=train_y)
SVM_scores.mean(), SVM_scores.std()

(0.8248344742453148, 0.0330292559029942)

In [300]:
# Fit on the whole training split, then score on the held-out split
pred = SVMModel.fit(train_X, train_y).predict(test_X)
accuracy_score(y_true=test_y, y_pred=pred)

0.8251121076233184

# SVM TUNING

In [253]:
# Joint search over the regularisation strength C and the kernel coefficient gamma
SVMParam = {
    "C": np.arange(1, 10),
    'gamma': np.arange(0.0125, 1, 0.0125),
}
SVM_Grid = GridSearchCV(estimator=SVC(), param_grid=SVMParam)
SVM_Grid.fit(train_X, train_y)
print(SVM_Grid.best_params_, SVM_Grid.best_score_)

{'C': 4, 'gamma': 0.0625} 0.8263382336438111


In [301]:
# Rebuild the SVC with the best parameters reported by the grid search and cross-validate it
SVMModel = SVC(gamma=0.0625, C=4)

SVM_scores = cross_val_score(estimator=SVMModel, X=train_X, y=train_y)
SVM_scores.mean(), SVM_scores.std()

(0.8263382336438111, 0.03185744198042598)

In [302]:
# Fit the tuned SVC on the training split and score it on the held-out split
pred = SVMModel.fit(train_X, train_y).predict(test_X)
accuracy_score(y_true=test_y, y_pred=pred)

0.820627802690583