# Summary #

<div class= 'alert alert-block alert-success'>
<b>The Titanic:</b>

The Titanic sank. Many died.\
Can we <b>predict those who survived</b>?

</div>

# Import packages, load data #

In [None]:
### IMPORT
### GENERAL PACKAGES
import numpy as np, pandas as pd, statsmodels.api as sm
import os

### SETUP
pd.set_option('display.max_columns', None)

### PLOTTING
import seaborn as sns, matplotlib.pyplot as plt

### DATA MANIPULATION
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, TruncatedSVD

### MODELS
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import CategoricalNB, MultinomialNB, GaussianNB, ComplementNB, BernoulliNB

### EXPORTING MODELS
import pickle

### VALIDATION
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay,classification_report

In [None]:
# LOOK FOR FILES
!ls /kaggle/input/titanic/

In [None]:
# LOAD DATA
df_train = pd.read_csv("/kaggle/input/titanic/train.csv")
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Exploratory Data Analysis (EDA)

# Inspect data for completeness #

In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
df_train.describe(include='all')

<div class= 'alert alert-block alert-danger'>
<b>Missing train data:</b>

* 891 entries, with data types int64, float64, and string
* many entries missing from AGE and, especially, CABIN
</div>

In [None]:
# CHECKING FOR DUPLICATED VALUES
df_train.duplicated().sum()

In [None]:
# CHECK TEST DATA
df_test.info()

In [None]:
df_test.duplicated().sum()

# Trying to figure out _why_ there is missing data

In [None]:
def count_records(featcounts: list, feat: str, norecord: bool):
    """
    Appends to FEATCOUNTS the number of df_train passengers in each category of
    SEX (male, female), PCLASS (1, 2, 3) and EMBARKED (S, C, Q), in that order,
    counting only rows where FEAT (e.g. 'Age' or 'Cabin') is missing (NORECORD=True)
    or present (NORECORD=False).

    FEATCOUNTS is a list that is extended in place with eight integers; nothing is returned.
    """
    # BUILD THE MISSING/PRESENT MASK ONCE INSTEAD OF ONCE PER CATEGORY
    record_mask = df_train[feat].isna() == norecord
    categories = [('Sex', 'male'), ('Sex', 'female'),
                  ('Pclass', 1), ('Pclass', 2), ('Pclass', 3),
                  ('Embarked', 'S'), ('Embarked', 'C'), ('Embarked', 'Q')]
    for column, value in categories:
        # int() KEEPS THE APPENDED VALUES PLAIN PYTHON INTEGERS, AS len() DID BEFORE
        featcounts.append(int((record_mask & (df_train[column] == value)).sum()))

In [None]:
# TABULATE, FOR EACH GENDER, PASSENGER CLASS AND PORT, HOW MANY PASSENGERS HAVE / LACK AN AGE OR CABIN RECORD
df_records = pd.DataFrame({'Feature': ['Gender(M)', 'Gender(F)', 'Class(1st)', 'Class(2nd)', 'Class(3rd)',
                                       'Port(Southampton)', 'Port(Cherbourg)', 'Port(Queenstown)']})

counts_age, counts_noage, counts_cabin, counts_nocabin = [], [], [], []

# (OUTPUT COLUMN, LIST TO FILL, FEATURE TO CHECK, COUNT MISSING RECORDS?)
tallies = [('Age_record', counts_age, 'Age', False),
           ('No_age_record', counts_noage, 'Age', True),
           ('Cabin_record', counts_cabin, 'Cabin', False),
           ('No_cabin_record', counts_nocabin, 'Cabin', True)]
for column, bucket, feature, missing in tallies:
    count_records(bucket, feature, missing)
    df_records[column] = bucket

print("""Number of passengers with/without AGE or CABIN records depending on GENDER, ticket CLASS,
port of EMBARKment:\n\n""", df_records)

<div class="alert alert-block alert-info">
<b>Notes on data inspection:</b>

* TICKET data seems useless; some are strings with only numeric values, some are alphanumeric
* missing data could stem from port services, due to a much larger volume of inbound passengers:
    * about 70% of missing AGE/CABIN data related to 3rd class ticket passengers, with a similar percentage for male passengers
    * Southampton was the embarkation port for 72% of all passengers
</div>

<div class= 'alert alert-block alert-warning'>
<b>About train data completeness:</b>

* 891 entries, with data types int64, float64, and string
* 177 values missing from AGE field
* 687 values missing from CABIN field
* 2 values missing from EMBARKED field
* no duplicate entries

<b>About test data completeness:</b>

* 418 entries, with data types int64, float64, and string
* test data also has missing values for AGE (86) and CABIN (327), and one missing entry for FARE
* no duplicate entries
</div>

## Univariate Distributions

In [None]:
# CREATE LIST OF FEATURES TO KEEP
select_features = []

In [None]:
# ONE-DIMENSIONAL DISTRIBUTIONS OF EACH RAW FEATURE, SPLIT BY SURVIVED
# SET FIGURE SIZE AND GRID SIZE
fig = plt.figure(figsize=(15, 10))
gs = plt.GridSpec(3, 3, height_ratios=(1, 1, 1))

# ONE ENTRY PER PANEL: (GRID ROW, GRID COLUMN, FEATURE, X-AXIS LABEL, EXTRA HISTPLOT OPTIONS)
panels = [
    (0, 0, 'Pclass', 'Passenger class', {'shrink': 3}),
    (0, 1, 'Sex', 'Passenger gender', {'shrink': 0.5}),
    (0, 2, 'Age', 'Passenger Age', {'shrink': 0.5}),
    (1, 0, 'Parch', 'Number of parents/children', {}),
    (1, 1, 'SibSp', 'Number of siblings/spouses', {}),
    (1, 2, 'Embarked', 'Port (Southampton/Cherbourg/Queenstown)', {'shrink': 0.5}),
    (2, 0, 'Fare', 'Ticket Fare', {'shrink': 1}),
]

for grid_row, grid_col, feature, xlabel, extra in panels:
    panel = plt.subplot(gs[grid_row, grid_col])
    axis = sns.histplot(ax=panel, data=df_train, x=feature, hue='Survived',
                        stat='count', multiple='dodge', **extra)
    axis.set_xlabel(xlabel, fontsize=12)
    axis.set_ylabel('Number of passengers', fontsize=12)

fig.show()

In [None]:
select_features.append('Sex') ; select_features.append('Pclass')

<div class="alert alert-block alert-success">
<b>SEX, PCLASS, EMBARKED, SIBSP, PARCH:</b>

* These features seem to be promising predictors;
* Sex, in particular, seems to have the most predictive power.
</div>

## Creating the FAMILY features ##

In [None]:
# FAMILY_BIN = PARCH + SIBSP >0 OR NOT
# DEFINE FUNCTION TO ASSIGN FAMILY VALUES BASED ON PARCH AND SIBSP
def fam(parch:int, sibsip:int):
    """Return 'no' when both PARCH and SIBSIP are zero (no relatives aboard), otherwise 'yes'."""
    travels_alone = (parch==0) & (sibsip==0)
    return 'no' if travels_alone else 'yes'
# DERIVE BOTH FAMILY COLUMNS FOR THE TRAIN AND TEST DATA ALIKE
for frame in (df_train, df_test):
    # FAMILY_YN: 'yes'/'no' FLAG RETURNED BY fam()
    frame['Family_yn'] = frame.apply(lambda row: fam(row['Parch'], row['SibSp']), axis=1)
    # FAMILY = PARCH + SIBSP
    frame['Family'] = frame['SibSp'] + frame['Parch']
# FAMILY SIZE, ACCORDING TO VARIATION IN SURVIVED FEATURE
def famsize(family):
    """Label a relative count: 'alone' for 0, 'small' for anything else below 4, 'large' otherwise."""
    if family == 0:
        return 'alone'
    return 'small' if family < 4 else 'large'
# LABEL EVERY PASSENGER WITH famsize(); THE EXPLICIT CATEGORY LIST FIXES THE ORDER USED WHEN, E.G., PLOTTING
famsize_levels = ['alone', 'small', 'large']
for frame in (df_train, df_test):
    frame['Famsize'] = pd.Categorical(frame['Family'].apply(famsize), categories=famsize_levels)

In [None]:
# PLOT FAMILY, FAMILY_YN AND FAMSIZE SIDE BY SIDE, SPLIT BY SURVIVED
fig_fam = plt.figure(figsize=(14, 3))
gs = plt.GridSpec(1, 3)
# ONE ENTRY PER PANEL, LEFT TO RIGHT: (FEATURE, X-AXIS LABEL, Y-AXIS LABEL)
panels = [
    ('Family', 'Number of Family Members Aboard', 'P(Family & Survived)'),
    ('Family_yn', 'Family Members Aboard: yes/no', ''),
    ('Famsize', 'Family Size (0<small<4)', ''),
]
for position, (feat, xlabel, ylabel) in enumerate(panels):
    panel = plt.subplot(gs[0, position])
    axis = sns.histplot(ax=panel, data=df_train, x=feat, hue='Survived', multiple='dodge', stat='probability')
    axis.set_xlabel(xlabel, fontsize=12)
    axis.set_ylabel(ylabel, fontsize=12)
fig_fam.show()

In [None]:
# APPEND FEATURE
select_features.append('Famsize') ; select_features.append('Family_yn')

<div class="alert alert-block alert-success">
<b>FAMSIZE feature:</b>

* Passengers who travel without family members or with large families (more than four family members in total) are much less likely to survive;
* Passengers travelling with up to three other family members (small families) are more likely to survive;
</div>

## Extracting Titles from the NAME feature ##

In [None]:
# WRITE A FUNCTION TO SPLIT THE NAME STRING AND EXTRACT THE TITLE
def detach(inp):
    """Return the first whitespace-separated token of the text between the first and second comma of INP."""
    after_surname = inp.split(',')[1]
    title_token = after_surname.split()[0]
    return title_token

# APPLY FUNCTION TO NAME FEATURE
df_train['Title'] = df_train['Name'].apply(detach)
df_test['Title'] = df_test['Name'].apply(detach)
# A NAME WRITTEN AS "the Countess." YIELDS THE TOKEN "the"; CORRECT IT BY VALUE RATHER THAN BY A
# HARD-CODED ROW INDEX, SO THE FIX SURVIVES RE-INDEXING AND ALSO APPLIES TO THE TEST DATA
for frame in (df_train, df_test):
    frame.loc[frame['Title'] == 'the', 'Title'] = 'Countess'
# GROUP TITLES IN DICTIONARY ACCORDING TO DISTRIBUTION VS SURVIVED
common_titles = ['Mr.', 'Miss.', 'Mrs.', 'Ms.']
professional_titles = ['Master.', 'Dr.', 'Rev.', 'Major.', 'Col.', 'Capt.']
wealth_titles = ['Don.', 'Sir.', 'Lady.', 'Mlle.', 'Mme.', 'Jonkheer.', 'Countess']
titlegroup = {}
titlegroup.update(dict.fromkeys(common_titles, 'Common'))
titlegroup.update(dict.fromkeys(professional_titles, 'Professional'))
titlegroup.update(dict.fromkeys(wealth_titles, 'Wealth'))
titledictcommon = {}
titledictcommon.update(dict.fromkeys(common_titles, 'yes'))
titledictcommon.update(dict.fromkeys(professional_titles + wealth_titles, 'no'))
# GROUP TITLES (A TITLE THAT IS NOT LISTED ABOVE STAYS NaN IN TITLEGROUP)
df_train['Titlegroup'] = df_train['Title'].map(titlegroup)
df_test['Titlegroup'] = df_test['Title'].map(titlegroup)
# SIMPLIFY TITLES; ANY TITLE OUTSIDE THE COMMON LIST IS BY DEFINITION NOT COMMON, SO A TITLE MISSING
# FROM THE LOOKUP (E.G. ONE THAT ONLY OCCURS IN THE TEST DATA) DEFAULTS TO 'no' INSTEAD OF NaN
df_train['Titlecommon'] = df_train['Title'].map(titledictcommon).fillna('no')
df_test['Titlecommon'] = df_test['Title'].map(titledictcommon).fillna('no')

In [None]:
# PLOT TITLE FEATURES
fig_ttl = plt.figure(figsize = (14,3))
gs = plt.GridSpec(1,3)
# SET GRID POSITIONS FOR PLOTS
g1 = plt.subplot(gs[0, 0])
g2 = plt.subplot(gs[0, 1])
g3 = plt.subplot(gs[0, 2])
# PLOT TITLE, TITLEGROUP, TITLECOMMON
gpts_list = [g1, g2, g3]
feat_list = ['Title', 'Titlegroup', 'Titlecommon']
xlbl_list = ['Title', 'Title, grouped', 'Title = Common']
ylbl_list = ['P(Title & Survived)', 'P(Title & Survived)', 'P(Title & Survived)']
for gp, feat, xlabel, ylabel in zip(gpts_list, feat_list, xlbl_list, ylbl_list):
    axis = sns.histplot(ax=gp, data = df_train, x=feat,hue = 'Survived', multiple='dodge', stat='probability')
    axis.set_xlabel(xlabel, fontsize=12)
    axis.set_ylabel(ylabel, fontsize=12)
    # TITLE LABELS ARE LONG, SO TILT THEM TO KEEP THEM READABLE
    axis.tick_params(axis='x', rotation = 45)
# SHOW THE FIGURE ONCE, AFTER ALL PANELS ARE DRAWN (IT USED TO BE CALLED INSIDE THE LOOP)
fig_ttl.show()

In [None]:
# APPEND FEATURE
select_features.append('Titlecommon')

<div class="alert alert-block alert-success">
<b>TITLECOMMON feature:</b>

* Passengers with TITLECOMMON=yes are less likely to survive;
* Passengers with TITLECOMMON=no are as likely to survive as not;
</div>

## Filling the FARE field in test data ##

In [None]:
# PASSENGERS WITHOUT FARE DATA IN TEST DATASET
df_test[df_test['Fare'].isna() == True]

In [None]:
# THE PASSENGER WITHOUT A FARE IS MATCHED TO TRAIN PASSENGERS WITH THE SAME PROFILE:
# 3rd PCLASS, MALE, NO FAMILY, SOUTHAMPTON - THEIR MEDIAN FARE FILLS THE GAP
same_profile = (
    (df_train['Pclass'] == 3)
    & (df_train['Sex'] == 'male')
    & (df_train['Family_yn'] == 'no')
    & (df_train['Embarked'] == 'S')
)
med_fare = df_train.loc[same_profile, 'Fare'].median()
df_test['Fare'] = df_test['Fare'].fillna(med_fare)

## Binning FARE into Quartiles ##

In [None]:
# BINNING FARE VALUES INTO QUARTILES
# THE QUARTILE BOUNDARIES COME FROM THE TRAIN DATA AND ARE COMPUTED ONCE, NOT ONCE PER ROW
fare_q1, fare_q2, fare_q3 = df_train['Fare'].quantile([0.25, 0.50, 0.75])

def quarts(fare):
    """
    Return the quartile (1-4) of FARE relative to the train-data FARE quartile boundaries.

    A FARE equal to a boundary falls into the upper quartile. A FARE that compares false
    against every boundary (i.e. NaN) matches no branch, so None is returned.
    """
    if fare < fare_q1:
        return 1
    elif fare < fare_q2:
        return 2
    elif fare < fare_q3:
        return 3
    elif fare >= fare_q3:
        return 4

# ONLY THE FARE COLUMN IS NEEDED, SO APPLY OVER THE SERIES RATHER THAN OVER WHOLE ROWS
df_train['Fare_Q'] = df_train['Fare'].apply(quarts)
df_test['Fare_Q'] = df_test['Fare'].apply(quarts)

In [None]:
# PLOT FARE AND FARE_Q SIDE BY SIDE, SPLIT BY SURVIVED
fig_fare = plt.figure(figsize=(14, 3))
gs = plt.GridSpec(1, 3)
# ONE ENTRY PER PANEL, LEFT TO RIGHT: (FEATURE, X-AXIS LABEL, Y-AXIS LABEL); THE THIRD GRID SLOT STAYS EMPTY
panels = [
    ('Fare', 'Fare', 'P(Fare & Survived)'),
    ('Fare_Q', 'Fare Quartiles', ''),
]
for position, (feat, xlabel, ylabel) in enumerate(panels):
    panel = plt.subplot(gs[0, position])
    axis = sns.histplot(ax=panel, data=df_train, x=feat, hue='Survived', multiple='dodge', stat='probability')
    axis.set_xlabel(xlabel, fontsize=12)
    axis.set_ylabel(ylabel, fontsize=12)
fig_fare.show()

In [None]:
# APPEND FEATURE
select_features.append('Fare_Q')

<div class="alert alert-block alert-success">
<b>FARE_Q feature:</b>

* Passengers outside the top fare quartile (FARE_Q < 4) are less likely to survive;
* Passengers in the top fare quartile (FARE_Q = 4) are more likely to survive;
</div>

## Filling the EMBARKED field ##

In [None]:
# THE TWO MISSING EMBARKED VALUES IN THE TRAIN DATA ARE FOR FIRST CLASS FEMALES WITHOUT FAMILY ABOARD
# SHARING A CABIN, SO THE PORT COULD BE THE SAME
df_train[df_train['Embarked'].isna() == True]

In [None]:
# LOOKING AT RECORDS FROM FEMALES, WITHOUT FAMILY, FIRST CLASS
print(df_train[(df_train['Pclass'] == 1) & (df_train['Family_yn'] == 'no') & (df_train['Sex'] == 'female')].groupby('Embarked')['PassengerId'].count())
# OVER 50% PROBABILITY THAT BOTH CAME FROM CHERBOURG, SO WE'LL ASSIGN THAT AS EMBARKED PORT
# ASSIGN THE RESULT BACK INSTEAD OF CALLING fillna(inplace=True) ON THE COLUMN SELECTION: THAT IS A
# CHAINED ASSIGNMENT, WHICH PANDAS WARNS ABOUT AND WHICH NO LONGER UPDATES df_train UNDER COPY-ON-WRITE
df_train['Embarked'] = df_train['Embarked'].fillna('C')

In [None]:
# GROUPING EMBARKED INTO SOUTHAMPTON=Yes/No, ACCORDING TO DISTRIBUTION OF EMBARKED AND SURVIVED
def portsouth(port):
    """Return 'yes' when PORT is 'S' (Southampton) and 'no' for any other value."""
    return 'yes' if port == 'S' else 'no'
# FLAG SOUTHAMPTON PASSENGERS IN BOTH DATASETS, WITH A FIXED no/yes CATEGORY ORDER
for frame in (df_train, df_test):
    frame['Port_SH'] = pd.Categorical(frame['Embarked'].apply(portsouth), categories=['no', 'yes'])

In [None]:
# PLOT EMBARKED AND PORT_SH SIDE BY SIDE, SPLIT BY SURVIVED
fig_port = plt.figure(figsize=(10, 3))
gs = plt.GridSpec(1, 2)
# ONE ENTRY PER PANEL, LEFT TO RIGHT: (FEATURE, X-AXIS LABEL, Y-AXIS LABEL)
panels = [
    ('Embarked', 'Port of Embarkment', 'P(Port & Survived)'),
    ('Port_SH', 'Port = Southampton', ''),
]
for position, (feat, xlabel, ylabel) in enumerate(panels):
    panel = plt.subplot(gs[0, position])
    axis = sns.histplot(ax=panel, data=df_train, x=feat, hue='Survived', multiple='dodge', stat='probability')
    axis.set_xlabel(xlabel, fontsize=12)
    axis.set_ylabel(ylabel, fontsize=12)
fig_port.show()

In [None]:
# APPEND FEATURE
# NOTE(review): the raw 'Embarked' column is appended here, although this section builds and
# discusses the binary 'Port_SH' feature - confirm which of the two is meant to be modelled
select_features.append('Embarked')

<div class="alert alert-block alert-success">
<b>PORT_SH feature:</b>

* Passengers embarking in Southampton are less likely to survive;
* Passengers embarking in other ports are as likely to survive as not;
</div>

## Creating the CabinId feature ##

In [None]:
# COUNTS OF CABIN IDENTIFIERS INCLUDING ONLY DIGITS (NOT LETTERS)
df_train['Cabin'].str.isnumeric().value_counts()

In [None]:
# GET CABIN IDENTIFIER LETTER(S)
# THERE ARE TOO MANY CABINID CATEGORIES (EX: B, BB, BBB, BBBB), SO WE'LL KEEP ONLY THE FIRST LETTER
# THIS MEANS, FROM 1ST TO 3RD CLASS: T, A, B, C, D, E, F, G
def firstlet(cabin:str):
    """Return the first character of CABIN, or None when CABIN is missing."""
    if pd.isna(cabin):
        return None
    return str(cabin)[0]
# APPLY FUNCTION TO BOTH DATASETS, WITH ONE SHARED LIST OF CABIN LETTERS AS CATEGORIES
cabin_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
for frame in (df_train, df_test):
    frame['CabinId'] = pd.Categorical(frame['Cabin'].apply(firstlet), categories=cabin_letters)

In [None]:
df_train.groupby(['Pclass', 'CabinId']).PassengerId.count()

<div class="alert alert-block alert-danger">
ENTRY 339 WILL BE PERMANENTLY DELETED IN THE NEXT CELL!
</div>

In [None]:
# CBN_COMP=T IS A SINGLE ENTRY FOR THE CABIN, AND MAY RESULT IN PROBLEMS LATER ON
# WE WILL ELIMINATE THIS ENTRY PERMANENTLY (INDEX 339)
# THE ROW IS SELECTED BY ITS CABIN VALUE RATHER THAN BY POSITION: AFTER reset_index A DIFFERENT
# PASSENGER SITS AT INDEX 339, SO A POSITIONAL DROP WOULD REMOVE ONE MORE ROW ON EVERY RE-RUN
is_cabin_t = df_train['Cabin'].str.startswith('T', na=False)
df_train = df_train.drop(index=df_train.index[is_cabin_t]).reset_index(drop=True)

<div class="alert alert-block alert-danger">
ENTRY 339 WAS PERMANENTLY DELETED!
</div>

## Filling the CABINID field ##

In [None]:
# PEARSON CORRELATION BETWEEN CABINID AND OTHER FEATURES
corr = pd.get_dummies(df_train.drop(['PassengerId', 'Name', 'Ticket', 'Survived', 'Cabin'], axis=1)).corr()
# SPLIT THE MATRIX BY COLUMN NAME (NOT BY HARD-CODED POSITIONS): ROWS = CABINID DUMMY COLUMNS,
# COLUMNS = EVERY OTHER FEATURE
cabin_cols = [name for name in corr.columns if name.startswith('CabinId_')]
other_cols = [name for name in corr.columns if name not in cabin_cols]
cabin_vs_rest = corr.loc[cabin_cols, other_cols].abs()
# KEEP THE FEATURES WHOSE |PEARSON CORRELATION| WITH ANY CABINID DUMMY EXCEEDS 0.3
# (NaN CORRELATIONS, E.G. FROM AN ALL-ZERO DUMMY COLUMN, COMPARE AS FALSE AND ARE IGNORED)
colname = cabin_vs_rest.columns[(cabin_vs_rest > 0.3).any(axis=0)].tolist()
print(colname)

In [None]:
# CREATE FUNCTION TO READ FEATURES WITH HIGHEST CORRELATION WITH CABINID (FARE AND FARE_Q, THE OTHERS ARE REDUNDANT),
# FOR EACH CABINID=NaN ENTRY, EXTRACT ALL CABINID VALUES FOR SIMILAR FEATURES,
# AND ASSIGN A RANDOMLY SAMPLED CABINRS VALUE
# NOTE: THE NAME sampler IS REUSED FOR THE AGE SAMPLER FURTHER DOWN; THIS VERSION IS ONLY VALID WITHIN THIS CELL

# SEEDED GENERATOR, RE-CREATED ON EVERY RUN OF THE CELL, SO THE SAMPLED VALUES ARE REPRODUCIBLE
CABIN_SAMPLING_SEED = 42
cabin_rng = np.random.default_rng(CABIN_SAMPLING_SEED)
# TRAIN PASSENGERS WITH A KNOWN CABINID; BUILT ONCE INSTEAD OF ONCE PER ROW
known_cabins = df_train.loc[df_train['CabinId'].notna(), ['Fare', 'Fare_Q', 'CabinId']]

def sampler(fare:float, fare_q:int, cabinid:str):
    """
    Return CABINID unchanged when it is present. Otherwise return a CABINID drawn at random from the
    train passengers with a known CABINID and the same FARE and FARE_Q, or 'C' when no such passenger exists.
    """
    if not pd.isnull(cabinid):
        return cabinid
    same_fare = (known_cabins['Fare'] == fare) & (known_cabins['Fare_Q'] == fare_q)
    cbn_list = list(known_cabins.loc[same_fare, 'CabinId'])
    if len(cbn_list) != 0:
        return cabin_rng.choice(cbn_list)
    # MOST COMMON CABINID VALUE IS C
    return 'C'

# RUN FUNCTION OVER CABINID
df_train['CabinRS'] = pd.Categorical(df_train.apply(lambda x: sampler(x['Fare'], x['Fare_Q'], x['CabinId']), axis=1), categories = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'O'])
# FOR TEST DATA, USING THE SAME FUNCTION (AND THEREFORE THE TRAIN-DATA CABIN VALUES) AS FOR THE TRAIN DATA
df_test['CabinRS'] = pd.Categorical(df_test.apply(lambda x: sampler(x['Fare'], x['Fare_Q'], x['CabinId']), axis=1), categories = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'O'])

<div class="alert alert-block alert-info">
<b>Note on CABINID:</b>

* There are no features having a Pearson correlation coefficient larger than 0.4 with CABINID;
    * This was likely the cause of the failure in modeling CABINID with regressions, trees, and Naive Bayes (another notebook)
* The random sampling strategy will be tested, but likely abandoned;
* CABINID values will be imputed as the mode value of all entries.
</div>

In [None]:
# SHOW HOW OFTEN EACH CABINID OCCURS, TO FIND THE MOST COMMON ONE
print(df_train['CabinId'].value_counts())
# CABINC = CABINID WITH EVERY NaN REPLACED BY 'C', FOR BOTH DATASETS
for frame in (df_train, df_test):
    frame['CabinC'] = frame['CabinId'].fillna('C')

In [None]:
# PLOT THE CABIN FEATURES: TOP ROW SPLIT BY SURVIVED, BOTTOM ROW ORIGINAL VS FILLED DISTRIBUTIONS
fig_cbn = plt.figure(figsize=(14, 7))
gs = plt.GridSpec(2, 3, height_ratios=(1, 1))

# TOP ROW, LEFT TO RIGHT: (FEATURE, X-AXIS LABEL, Y-AXIS LABEL)
survival_panels = [
    ('CabinId', 'Cabin (original)', 'P(Cabin & Survived)'),
    ('CabinRS', 'Cabin (fill=random)', ''),
    ('CabinC', 'Cabin (fill=mode)', ''),
]
for position, (feat, xlabel, ylabel) in enumerate(survival_panels):
    panel = plt.subplot(gs[0, position])
    axis = sns.histplot(ax=panel, data=df_train, x=feat, hue='Survived', multiple='dodge', stat='probability')
    axis.set_xlabel(xlabel, fontsize=12)
    axis.set_ylabel(ylabel, fontsize=12)
    axis.set_ylim(0, 0.6)

# BOTTOM ROW, LEFT TO RIGHT: ORIGINAL CABINID OVERLAID WITH (FILLED FEATURE, COLOUR, LEGEND LABEL)
comparison_panels = [
    ('CabinRS', 'tab:brown', 'Sampled'),
    ('CabinC', 'tab:red', 'Mode=C'),
]
for position, (filled_feat, colour, legend_label) in enumerate(comparison_panels):
    panel = plt.subplot(gs[1, position])
    sns.histplot(ax=panel, data=df_train, x='CabinId', stat='probability', color='tab:green', label='Original')
    sns.histplot(ax=panel, data=df_train, x=filled_feat, stat='probability', color=colour, label=legend_label, alpha=0.4)
    panel.legend()
    panel.set_xlabel('Cabin', fontsize=12)
    panel.set_ylabel('P(Cabin)', fontsize=12)
    panel.set_ylim(0, 0.9)
fig_cbn.show()

In [None]:
# CABINIDO = CABINID WITH EVERY MISSING VALUE MOVED INTO THE EXTRA CATEGORY O(THER)
cabin_levels_other = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'O']
for frame in (df_train, df_test):
    frame['CabinIdO'] = pd.Categorical(frame['CabinId'], categories=cabin_levels_other).fillna('O')

# PLOT CABINID FEATURES
fig_cbno = plt.figure(figsize=(12, 3))

# LEFT: ORIGINAL CABINID OVERLAID WITH CABINIDO
overlay_panel = plt.subplot(1, 2, 1)
sns.histplot(ax=overlay_panel, data=df_train, x='CabinId', stat='probability', color='tab:green', label='Original')
sns.histplot(ax=overlay_panel, data=df_train, x='CabinIdO', stat='probability', color='tab:brown', label='NaN=O(ther)', alpha=0.4)
overlay_panel.legend()

# RIGHT: CABINIDO SPLIT BY SURVIVED
survival_panel = plt.subplot(1, 2, 2)
sns.histplot(ax=survival_panel, data=df_train, x='CabinIdO', stat='probability', hue='Survived', multiple='dodge')

# BOTH PANELS SHARE THE SAME AXIS LABELS AND Y-RANGE
for panel in (overlay_panel, survival_panel):
    panel.set_xlabel('Cabin', fontsize=12)
    panel.set_ylabel('P(Cabin)', fontsize=12)
    panel.set_ylim(0, 0.9)
fig_cbno.show()

In [None]:
# CREATING BINARY FEATURE FOR CABINS WITH LARGER ODDS OF SURVIVING
def cabsurv(cbn):
    """Return 1 when CBN is one of the cabin letters B, C, D, E or F, and 0 for anything else."""
    favourable_cabins = ('B', 'C', 'D', 'E', 'F')
    return 1 if cbn in favourable_cabins else 0
# DERIVE THE BINARY CABIN FLAG FROM CABINIDO FOR BOTH DATASETS
for frame in (df_train, df_test):
    frame['cabinsurvival'] = frame['CabinIdO'].apply(cabsurv)
# APPEND FEATURE
select_features.append('cabinsurvival')

<div class="alert alert-block alert-success">
<b>Note on CABINID AND CABINIDO:</b>

* Both random sampling and mode replacement assignment strategies decrease the original distributions' spread, heavily emphasising the occurrence of label C (as expected) while also reversing the probability ratio of surviving/not_surviving for cabin C;
    * this means the most populated CABINID feature now reports the opposite trend in survival probability;
* Missing CABINID entries will be instead assigned the value O(ther); this at least preserves the original distributions and has a large not_survive/survive ratio.
</div>

## Filling the AGE field ##

In [None]:
# PEARSON CORRELATION BETWEEN AGE AND OTHER FEATURES
corr = pd.get_dummies(df_train.drop(['PassengerId', 'Name', 'Ticket', 'Survived', 'Cabin', 'CabinId', 'CabinRS', 'CabinC'], axis=1)).corr()
# SORTED CORRELATION VALUES; AGE IS LOOKED UP BY NAME RATHER THAN BY ITS POSITION IN THE MATRIX,
# SO THE RESULT DOES NOT SILENTLY CHANGE IF THE COLUMN ORDER DOES (THE LIST INCLUDES AGE ITSELF, AT 1.0)
age_corr = corr['Age'].sort_values()
print('Ten highest correlated features with age:\n', pd.concat([age_corr.head(5), age_corr.tail(5)]))
# KEEP TITLE, PCLASS, SIBSP, FAMILY
# FOR EACH AGE=NaN ENTRY, EXTRACT ALL AGE VALUES FOR SIMILAR FEATURES, AND ASSIGN A RANDOMLY SAMPLED AGE VALUE
# NOTE: THIS REDEFINES THE NAME sampler THAT WAS USED FOR THE CABIN SAMPLER EARLIER IN THE NOTEBOOK

# SEEDED GENERATOR, RE-CREATED ON EVERY RUN OF THE CELL, SO THE SAMPLED AGES ARE REPRODUCIBLE
AGE_SAMPLING_SEED = 42
age_rng = np.random.default_rng(AGE_SAMPLING_SEED)
# TRAIN PASSENGERS WITH A KNOWN AGE; BUILT ONCE INSTEAD OF ONCE PER ROW
known_ages = df_train.loc[df_train['Age'].notna(), ['Pclass', 'Family', 'SibSp', 'Title', 'Age']]

def sampler(pclass:int, family:int, sibsp:int, title:str, age:float):
    """
    Return AGE unchanged when it is present. Otherwise return an age drawn at random from the train
    passengers with a known age and the same PCLASS, FAMILY, SIBSP and TITLE, or 22.0 when no such
    passenger exists.
    """
    if not pd.isnull(age):
        return age
    similar = ((known_ages['Pclass'] == pclass) & (known_ages['Family'] == family)
               & (known_ages['SibSp'] == sibsp) & (known_ages['Title'] == title))
    age_list = list(known_ages.loc[similar, 'Age'])
    if len(age_list) != 0:
        return age_rng.choice(age_list)
    # FALLBACK AGE; THE ORIGINAL NOTE STATES THAT 22.0 IS THE MOST COMMON AGE VALUE
    return 22.0

# RUN FUNCTION OVER AGE
df_train['Age_RS'] = df_train.apply(lambda x: sampler(x['Pclass'], x['Family'], x['SibSp'], x['Title'], x['Age']), axis=1)
# FOR TEST DATA, USING THE SAME FUNCTION (AND THEREFORE THE TRAIN-DATA AGES) AS FOR THE TRAIN DATA
df_test['Age_RS'] = df_test.apply(lambda x: sampler(x['Pclass'], x['Family'], x['SibSp'], x['Title'], x['Age']), axis=1)

# AGEMED = AGE WITH EVERY NaN REPLACED BY THE MEDIAN (50% QUANTILE) OF THE TRAIN-DATA AGE
agemed = df_train['Age'].quantile(0.5)
for frame in (df_train, df_test):
    frame['Agemed'] = frame['Age'].fillna(agemed)
# BINNING AGE_RS INTO QUARTILES, WITH BOUNDARIES TAKEN FROM THE TRAIN DATA
q1, q2, q3 = (df_train['Age_RS'].quantile(level) for level in (0.25, 0.50, 0.75))

def ageq(age):
    """Return the quartile (1-4) of AGE: the first boundary of q1, q2, q3 that AGE lies below, else 4."""
    for quartile, upper_bound in enumerate((q1, q2, q3), start=1):
        if age < upper_bound:
            return quartile
    return 4

# IMPUTE AGE QUANTILES FOR BOTH DATASETS
for frame in (df_train, df_test):
    frame['Age_Q'] = frame['Age_RS'].apply(ageq)

In [None]:
# PLOT AGE FEATURES
fig_age = plt.figure(figsize = (14,7))
gs = plt.GridSpec(2,3)
# SET GRID POSITIONS FOR PLOTS
g1 = plt.subplot(gs[0, 0])
g2 = plt.subplot(gs[0, 1])
g3 = plt.subplot(gs[0, 2])
g4 = plt.subplot(gs[1, 0])
g5 = plt.subplot(gs[1, 1])
g6 = plt.subplot(gs[1, 2])
# THE MEDIAN SHOWN IN THE LABELS IS THE VALUE ACTUALLY USED FOR AGEMED, NOT A HARD-CODED NUMBER
median_label = f'Median={agemed:g}'
# PLOT AGE, AGE_RS, AGEMED, AGE_Q
gpts_list = [g1, g2, g3, g6]
feat_list = ['Age', 'Age_RS', 'Agemed', 'Age_Q']
xlbl_list = ['Age (Original)', 'Age (Sampled)', f'Age ({median_label})', 'Age Percentiles']
# Y-LABEL REFERS TO AGE (IT USED TO SAY CABIN, CARRIED OVER FROM THE CABIN PLOTS)
ylbl_list = ['P(Age & Survived)', '', '', '']
for gp, feat, xlabel, ylabel in zip(gpts_list, feat_list, xlbl_list, ylbl_list):
    axis = sns.histplot(ax=gp, data = df_train, x=feat,hue = 'Survived', multiple='dodge', stat='probability')
    axis.set_xlabel(xlabel, fontsize=12)
    axis.set_ylabel(ylabel, fontsize=12)
# PLOT COMPARISON OF ORIGINAL DISTRIBUTION AND DISTRIBUTIONS WITH ASSIGNED VALUES
ax4 = sns.histplot(ax=g4, data=df_train, x='Age', stat='probability', color='tab:green', label='Original', binwidth=0.6)
ax4 = sns.histplot(ax=g4, data=df_train, x='Age_RS', stat='probability', color='tab:brown', label='Sampled', binwidth=0.6, alpha=0.4)
ax4.legend()
ax4.set_xlabel('Age', fontsize=12)
ax4.set_ylabel('P(Age)', fontsize=12)
ax5 = sns.histplot(ax=g5, data=df_train, x='Age', stat='probability', color='tab:green', label='Original', binwidth=0.6)
ax5 = sns.histplot(ax=g5, data=df_train, x='Agemed', stat='probability', color='tab:red', label=median_label, binwidth=0.6, alpha=0.4)
ax5.legend()
ax5.set_xlabel('Age', fontsize=12)
ax5.set_ylabel('', fontsize=12)
fig_age.show()

In [None]:
# CREATE THE CHILD FEATURE ACCORDING TO AGE DISTRIBUTION FOR SURVIVED=YES
def child(age):
    """Return 'yes' when AGE is 5 or less and 'no' otherwise."""
    return 'yes' if age<=5 else 'no'
# FLAG CHILDREN IN BOTH DATASETS, BASED ON THE SAMPLED AGE
for frame in (df_train, df_test):
    frame['Child'] = frame['Age_RS'].apply(child)

In [None]:
# PLOT AGE_RS (WIDE, LEFT) AND CHILD (NARROW, RIGHT), BOTH SPLIT BY SURVIVED
fig_age = plt.figure(figsize=(14, 3))
gs = plt.GridSpec(1, 3)
# ONE ENTRY PER PANEL: (GRID CELLS TO OCCUPY, FEATURE, X-AXIS LABEL)
panels = [
    (gs[0, 0:2], 'Age_RS', 'Age'),
    (gs[0, 2], 'Child', 'Child'),
]
for cells, feat, xlabel in panels:
    panel = plt.subplot(cells)
    axis = sns.histplot(ax=panel, data=df_train, x=feat, stat='probability', hue='Survived', multiple='dodge')
    axis.set_xlabel(xlabel, fontsize=12)
    axis.set_ylabel('', fontsize=12)
fig_age.show()

In [None]:
# APPEND FEATURES (A SINGLE extend CALL; THE FORMER TUPLE OF TWO append CALLS DISPLAYED (None, None) AS CELL OUTPUT)
select_features.extend(['Age_Q', 'Child'])

<div class="alert alert-block alert-success">
<b>AGE_RS feature:</b>

* AGE_RS seems to mimic the original AGE distribution quite well;
* Children aged 5 or under have a higher probability to survive.
</div>

<div class="alert alert-block alert-info">
<b>About lack of AGE and CABIN records:</b>

* AGE/CABIN records, both missing and existing, follow SEX, PCLASS, EMBARK, PARCH+SIBSP records
    * this is related to the large proportion of SEX=male, PCLASS=3, EMBARK=S, and PARCH+SIBSP=0
* there are sufficient AGE data that missing AGE records can be estimated
    * regression model
    * random sampling from distribution
    * NaiveBayes
    * ultimately, modeling didn't work well - the predictions were poor, so we went with random sampling instead
* for CABIN data, there are many more missing data points
    * all CABIN data start with one or more alphabet characters, from which CABINID was built
    * CABINID varies the most with PCLASS
    * CABINID could be estimated
        * also, modeling didn't work well here
        * no type of assignment worked - we're dropping CABIN information
</div>

## Bivariate distributions

In [None]:
# SURVIVABILITY, FARE_Q AND FAMSIZE
# catplot IS A FIGURE-LEVEL FUNCTION THAT CREATES ITS OWN FIGURE, SO NO SEPARATE plt.figure() IS OPENED
# (THE FORMER ONE WAS LEFT EMPTY)
g = sns.catplot(data =df_train[['Famsize', 'Fare_Q', 'Survived', 'PassengerId']].groupby(['Famsize', 'Fare_Q', 'Survived']).count().reset_index(),
            x='Fare_Q', y='PassengerId', hue= 'Survived', col='Famsize', kind='bar')
for ax in g.axes.flatten():
    ax.set_xlabel('Fare_Q', fontsize=12)
    ax.set_ylabel('Number of Passengers', fontsize=12)
# SHOW THE CATPLOT ITSELF; fig.show() REFERRED TO A FIGURE LEFT OVER FROM AN EARLIER CELL
plt.show()

In [None]:
# NUMBER OF SURVIVED=1 OR SURVIVED=0 PASSENGERS BY FAMSIZE AND FARE_Q
# (AFTER count(), THE 'Survived' COLUMN HOLDS THE NUMBER OF PASSENGERS IN EACH GROUP)
df_live=pd.DataFrame(df_train.loc[df_train['Survived']==1, ['Famsize', 'Fare_Q', 'Survived']].groupby(['Famsize', 'Fare_Q']).count()).reset_index()
df_die=pd.DataFrame(df_train.loc[df_train['Survived']==0, ['Famsize', 'Fare_Q', 'Survived']].groupby(['Famsize', 'Fare_Q']).count()).reset_index()
# CALCULATE RATIO OF SURVIVED TO DEAD
# THE MERGE SUFFIXES THE TWO COUNT COLUMNS: Survived_x = SURVIVORS (df_live), Survived_y = DEAD (df_die)
df_LD = df_live.merge(df_die, on=['Famsize', 'Fare_Q'], how='left')
def ratio_ld(live,die):
    """Return LIVE divided by DIE, or 0 when DIE equals 0 (avoids a division by zero)."""
    if die != 0:
        return (live/die)
    else:
        return 0    
df_LD['Ratio_LD'] = df_LD.apply(lambda x: ratio_ld(x['Survived_x'], x['Survived_y']), axis=1)
# ASSIGN THE CALCULATED RATIO TO COMBINATIONS OF FAMSIZE & FARE_Q
def famfare(fam, fare):
        """Return Ratio_LD of the first df_LD row matching FAM and FARE; .iloc[0] raises IndexError if none matches."""
        return df_LD.loc[(df_LD['Famsize']==fam) & (df_LD['Fare_Q']==fare), 'Ratio_LD'].iloc[0]
# THE RATIOS ARE BUILT FROM THE TRAIN DATA ONLY AND ARE LOOKED UP FOR BOTH DATASETS
df_train['famfare'] = df_train.apply(lambda x: famfare(x['Famsize'], x['Fare_Q']), axis=1)
df_test['famfare'] = df_test.apply(lambda x: famfare(x['Famsize'], x['Fare_Q']), axis=1)
# famfare() READS THE GLOBAL df_LD, SO IT CANNOT BE CALLED ANY MORE ONCE df_LD IS DELETED HERE
del df_LD, df_live, df_die
sns.histplot(data=df_train, x='famfare', hue='Survived', stat='probability', multiple='dodge')
# APPEND FEATURE
select_features.append('famfare')

In [None]:
# SURVIVABILITY, SEX AND PCLASS
# catplot IS A FIGURE-LEVEL FUNCTION THAT CREATES ITS OWN FIGURE, SO NO SEPARATE plt.figure() IS OPENED
# (THE FORMER ONE WAS LEFT EMPTY)
g = sns.catplot(data =df_train[['Sex', 'Pclass', 'Survived', 'PassengerId']].groupby(['Sex', 'Pclass', 'Survived']).count().reset_index(),
            x='Pclass', y='PassengerId', hue= 'Survived', col='Sex', kind='bar')
for ax in g.axes.flatten():
    ax.set_xlabel('Pclass', fontsize=12)
    ax.set_ylabel('Number of Passengers', fontsize=12)
# SHOW THE CATPLOT ITSELF; fig.show() REFERRED TO A FIGURE LEFT OVER FROM AN EARLIER CELL
plt.show()

In [None]:
# NUMBER OF SURVIVED=1 OR SURVIVED=0 PASSENGERS BY SEX AND PCLASS
df_live=pd.DataFrame((df_train.loc[df_train['Survived']==1, ['Sex', 'Pclass', 'Survived']].groupby(['Sex', 'Pclass']).count())).reset_index()
df_die=pd.DataFrame((df_train.loc[df_train['Survived']==0, ['Sex', 'Pclass', 'Survived']].groupby(['Sex', 'Pclass']).count())).reset_index()
# CALCULATE RATIO OF SURVIVED TO DEAD
df_LD = df_live.merge(df_die, on=['Sex', 'Pclass'], how='left')
def ratio_ld(live,die):
    if die != 0:
        return (live/die)//10
    else:
        return 0    
df_LD['Ratio_LD'] = df_LD.apply(lambda x: ratio_ld(x['Survived_x'], x['Survived_y']), axis=1)
# ASSIGN THE CALCULATED RATIO TO COMBINATIONS OF FAMSIZE & FARE_Q
def sexclass(sex, pclass):
    """
    Look up Ratio_LD for one (Sex, Pclass) combination.

    Reads the module-level df_LD table at call time, so it must be called
    before df_LD is deleted.

    Returns the stored ratio, or 0 when the combination has no row in df_LD.
    """
    match = df_LD.loc[(df_LD['Sex']==sex) & (df_LD['Pclass']==pclass), 'Ratio_LD']
    # An empty selection would make .iloc[0] raise IndexError
    if len(match) == 0:
        return 0
    return match.iloc[0]
# Attach the Sex / Pclass ratio to every passenger of both data sets
df_train['sexclass'] = df_train.apply(lambda x: sexclass(x['Sex'], x['Pclass']), axis=1)
df_test['sexclass'] = df_test.apply(lambda x: sexclass(x['Sex'], x['Pclass']), axis=1)
# The lookup tables are not needed once the feature column exists
del df_LD, df_live, df_die
sns.histplot(data=df_train, x='sexclass', hue='Survived', stat='probability', multiple='dodge')
# APPEND FEATURE
# Guarded so that re-running this cell does not register the feature twice
# (a duplicated name would select the column twice later on)
if 'sexclass' not in select_features:
    select_features.append('sexclass')

In [None]:
# Show the feature names collected in select_features so far
print(select_features)

# Modeling #

<div class="alert alert-block alert-info">
<b>Modeling strategy:</b>
    
* We will fit a series of models and use them collectively (as an ensemble) for the final prediction.
</div>

## Set-up variables, convert and scale data ##

In [None]:
# Show the feature names in select_features that the modeling data is built from
print(select_features)

In [None]:
# CREATE DATAFRAMES WITH SELECTED FEATURES
df_train_select = df_train[select_features].copy()
df_test_select = df_test[select_features].copy()
# Drop the raw Sex and Pclass columns from the selected features
# NOTE(review): presumably because the combined 'sexclass' feature replaces them - confirm
df_train_select = df_train_select.drop(['Sex', 'Pclass'], axis=1)
df_test_select = df_test_select.drop(['Sex', 'Pclass'], axis=1)
# CONVERT CATEGORICAL DATA FOR LOGISTIC REGRESSION CLASSIFIER WITH ONE-HOT ENCODING
# DEFINE PREDICTOR AND OUTCOME VARIABLES
Xdum = pd.get_dummies(df_train_select, drop_first=False)
Xdum_test = pd.get_dummies(df_test_select, drop_first=False)
# Train and test are encoded separately, so a category present in only one of
# them produces a different set/order of columns. Force the test matrix onto the
# training layout (missing dummies become 0, extra ones are discarded); otherwise
# the positional selection below would pick different features in each set.
Xdum_test = Xdum_test.reindex(columns=Xdum.columns, fill_value=0)
y = df_train['Survived']

# SELECT TOP 20 FEATURES BASED ON CHI^2 TEST
# k is capped at the number of available columns so the selector stays valid
# when fewer than 20 features exist.
# NOTE(review): the selector is fitted on all training rows, before the
# train/trial split, so the trial rows influence which features are kept.
n_keep = min(20, Xdum.shape[1])
feat_index = SelectKBest(chi2, k=n_keep).fit(Xdum,y).get_support(indices=True)
Xncd = Xdum.iloc[:,feat_index]
Xncd_test = Xdum_test.iloc[:,feat_index]

# PLOT PEARSON CORRELATION BETWEEN FEATURES
corr = Xncd.corr()
# Boolean mask that hides the upper triangle and the diagonal. Building it from
# ones (not from the correlation values) keeps a correlation of exactly 0 hidden.
uppertri = np.triu(np.ones_like(corr, dtype=bool))
fig, ax1 = plt.subplots(figsize=(12,8))
sns.heatmap(corr, annot=True, cmap='Reds', mask=uppertri, ax=ax1)
plt.show()

In [None]:
# GET ROW AND COLUMN NUMBERS FOR LARGE PEARSON CORRELATION (BEYOND +/-0.75)
# Only the lower triangle (diagonal included) is scanned, so each pair appears once
row, col = np.nonzero(np.abs(np.tril(corr)) > 0.75)
# REMOVE REPEATED ENTRIES, E.G. 1,1 AND 2,2
# Diagonal hits are blanked out; what remains is the row position of the
# later-listed feature of every strongly correlated pair
lrow = ['' if r == c else int(r) for r, c in zip(row, col)]
# REMOVE COLUMNS FROM Xncd
# Translate the flagged positions into column names, in column order
colname = [name for pos, name in enumerate(corr.columns) if pos in lrow]
Xncd = Xncd.drop(columns=colname)
Xncd_test = Xncd_test.drop(columns=colname)

In [None]:
# SPLIT DATA INTO TRAINING AND TRIAL
# 25% of the rows are held out as the trial set; stratifying on y keeps the
# survived/died proportions equal in both parts, and the fixed seed makes the
# split repeatable.
X_train, X_trial, y_train, y_trial = train_test_split(
    Xncd,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

In [None]:
# FUNCTION TO RUN AND PLOT RESULTS OF MODEL_PREDICT
def _plot_feature_bars(feature_names, values, value_col, ylabel):
    """Draw a 12x3 bar chart with one bar per feature, largest value first."""
    table = pd.DataFrame({'Feature': list(feature_names), value_col: values})
    table = table.sort_values(by=value_col, ascending=False)
    fig_bar, ax = plt.subplots(figsize=(12,3))
    sns.barplot(data=table, x='Feature', y=value_col, ax=ax)
    ax.set_xlabel('Feature', fontsize=16)
    ax.set_ylabel(ylabel, fontsize=16)
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()


def model_preds(model_name, model_type, data_trial=X_trial, data_test=Xncd_test):
    """
    Predict with a fitted model and report how it performs on the trial data.

    Parameters:
        model_name: fitted estimator. For model_type 'tree', 'logreg' and 'vector'
                    it must expose named_steps containing the step 'model_tree',
                    'model_lg' or 'model_svc' respectively.
        model_type: one of 'logreg', 'tree', 'vector', 'nb'. Selects the
                    diagnostic plot; 'nb' draws none.
        data_trial: features used for evaluation (default: X_trial as it was
                    when this function was defined).
        data_test:  features of the test set (default: Xncd_test as it was
                    when this function was defined).

    Prints a classification report, draws a confusion matrix and, depending on
    model_type, a feature importance / odds / coefficient bar chart.
    The report and confusion matrix always compare against the global y_trial,
    so data_trial must hold the rows that y_trial belongs to.

    Returns:
        (preds, preds_test): predictions for data_trial and data_test.

    Raises:
        ValueError: if model_type is not one of the supported values.
    """
    # CHECK FOR CORRECT MODEL_TYPE
    # Raising (rather than printing and returning None) gives a clear error to
    # callers, which unpack the result into two names.
    if model_type not in ['logreg', 'tree', 'vector', 'nb']:
        raise ValueError('model_type must be set as either tree, logreg, vector or nb.')
    # MAKE PREDICTIONS ON TRIAL AND TEST DATA
    preds = model_name.predict(data_trial)
    preds_test = model_name.predict(data_test)
    # CLASSIFICATION REPORT
    labels = ['Predicted not to survive', 'Predicted to survive']
    clfrep = classification_report(y_trial, preds, target_names=labels)
    print(clfrep)
    # PLOT CONFUSION MATRIX
    # The axes are passed explicitly: without ax=, the display creates a new
    # figure of its own and the 2x2 figure made here would stay empty.
    fig_cm, ax_cm = plt.subplots(figsize=(2,2))
    cm = confusion_matrix(y_trial, preds, labels = model_name.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels=model_name.classes_)
    disp.plot(ax=ax_cm, values_format='')
    plt.show()
    # PLOT FEATURE IMPORTANCE, ODDS, COEFFICIENTS
    # named_steps is only read inside the branches, so an 'nb' model does not
    # need to be a pipeline.
    if model_type == 'tree':
        # FEATURE IMPORTANCE TREE-BASED MODELS
        _plot_feature_bars(data_trial.columns,
                           model_name.named_steps['model_tree'].feature_importances_,
                           'Importance', 'Feature Importance')
    elif model_type == 'logreg':
        # ODDS FROM LOGISTIC REGRESSION: exp(coefficient) per feature
        _plot_feature_bars(data_trial.columns,
                           np.exp(model_name.named_steps['model_lg'].coef_.flatten()),
                           'Odds', 'Odds (Survival)')
    elif model_type == 'vector':
        # VECTOR MAGNITUDES
        # NOTE(review): plots exp(coefficient), not the raw coefficients - confirm intended
        _plot_feature_bars(data_trial.columns,
                           np.exp(model_name.named_steps['model_svc'].coef_.flatten()),
                           'Components', 'Vector Components')
    # RETURN PREDICTIONS
    return preds, preds_test

## Fit model: Logistic Regression ##

In [None]:
%%time
# SETUP MODEL, CROSS-VALIDATION
# PIPELINE FOR CV - the scaler is part of the pipeline, so it is re-fitted on the
# training part of every CV fold. This costs some time, but the held-out part of
# each fold never influences the scaling.
pl_lg = Pipeline([('scale', StandardScaler()),
                ('model_lg', LogisticRegressionCV(max_iter=1000, tol=0.0001, random_state=0))])
# CV PARAMETERS
# NOTE(review): an integer Cs is the number of C values LogisticRegressionCV
# tries on its own internal grid, not a value of C - confirm this is intended.
par_grid_lg = {'model_lg__Cs' : [50, 100, 200, 250]}
# CROSS-VALIDATION
# FIT MODEL
# scoring='f1' makes the search rank candidates by F1. With scoring left unset,
# refit='f1' was only treated as "refit = yes" and candidates were ranked by the
# estimator's default score.
model_grid_lg = GridSearchCV(pl_lg, param_grid=par_grid_lg, cv=5, scoring='f1', refit=True, n_jobs = -1, verbose = 1)
model_grid_lg.fit(X_train, y_train)
# BEST PREDICTOR
best_lg = model_grid_lg.best_estimator_
print('Best CV parameters: ', model_grid_lg.best_params_)
# GET PREDICTIONS
preds_lg, preds_test_lg = model_preds(best_lg, 'logreg')

## Fit model: Random Forest ##

## Fit model: XGB ##

In [None]:
%%time
# SETUP MODEL, CROSS-VALIDATION
# PIPELINE FOR CV - the scaler is part of the pipeline, so it is re-fitted on the
# training part of every CV fold. This costs some time, but the held-out part of
# each fold never influences the scaling.
# SCALING ISN'T NECESSARY FOR TREES, BUT WE'RE BUILDING AN ENSEMBLE MODEL AT THE END,
# SO ALL THE DATA SHOULD BE EITHER SCALED OR NOT
pl_xgb = Pipeline([('scale', StandardScaler()),
                ('model_tree', XGBClassifier(objective = 'binary:logistic', random_state=0))])
# CV PARAMETERS
# 7 parameters with 3 values each = 2187 candidates, each fitted on 5 folds
par_grid_xgb = {'model_tree__max_depth' : [3, 4, 5],
                'model_tree__n_estimators' : [150, 200, 250],
                'model_tree__learning_rate' : [0.1, 0.2, 0.3],
                'model_tree__colsample_bytree' : [0.7, 0.8, 0.9],
                'model_tree__min_child_weight': [7, 10, 13],
                'model_tree__gamma' : [0.7, 0.8, 0.9],
                'model_tree__subsample' : [0.5, 0.7, 0.9]}
# CROSS-VALIDATION
# FIT MODEL
# scoring='f1' makes the search rank candidates by F1. With scoring left unset,
# refit='f1' was only treated as "refit = yes" and candidates were ranked by the
# estimator's default score.
model_grid_xgb = GridSearchCV(pl_xgb, param_grid=par_grid_xgb, cv=5, scoring='f1', refit=True, n_jobs = -1, verbose = 1)
model_grid_xgb.fit(X_train, y_train)
# BEST PREDICTOR
best_xgb = model_grid_xgb.best_estimator_
print('Best CV parameters: ', model_grid_xgb.best_params_)
# GET PREDICTIONS
preds_xgb, preds_test_xgb = model_preds(best_xgb, 'tree')

## Fit model: (Linear) Support Vector Classifier ##

## Fit model: Bernoulli Naive Bayes Classifier ##

## Fit model: Gaussian Naive Bayes Classifier ##

## Stacking Classifier model ##