In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import  MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import SVC
import lightgbm as lgb
import xgboost as xgb


In [3]:
#Import csv

df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
#Looking for null values
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Name, ticket and Passenger id are basically useless 
# ( we _could_ infere something from the titles included in names, 
# but it's a story for another day)
# Cabin column (majority of null values) will be dropped, 
# as will be the 2 rows with missing embarked,
# but Age should be imputed (filling with mean?)

df['Age'] = df['Age'].fillna(df['Age'].mean())
df = df.drop(['Cabin', 'Name', 'PassengerId','Ticket'], axis=1)
df = df.dropna(axis=0)

In [7]:
df.isna().sum()
#Well done guys / Bravo à tous

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [8]:
# beside fare and age, all others variables are categorical and should be encoded
df= pd.get_dummies(data=df, columns=['Pclass', 'SibSp', 'Parch', 'Sex', 'Embarked'] )
df.describe()
df.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,SibSp_0,SibSp_1,SibSp_2,SibSp_3,...,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,False,False,True,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True
1,1,38.0,71.2833,True,False,False,False,True,False,False,...,False,False,False,False,False,True,False,True,False,False
2,1,26.0,7.925,False,False,True,True,False,False,False,...,False,False,False,False,False,True,False,False,False,True
3,1,35.0,53.1,True,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,True
4,0,35.0,8.05,False,False,True,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True


In [9]:
# scaling data
colonnes = df.columns
scaler = MinMaxScaler()
df[colonnes] = scaler.fit_transform(df)
df

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,SibSp_0,SibSp_1,SibSp_2,SibSp_3,...,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.0,0.271174,0.014151,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.472229,0.139136,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.321438,0.015469,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.434531,0.103644,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.434531,0.015713,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.334004,0.025374,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
887,1.0,0.233476,0.058556,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
888,0.0,0.367921,0.045771,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
889,1.0,0.321438,0.058556,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [10]:
# train _ test splitting
x_train, x_test, y_train, y_test = train_test_split(df.drop(['Survived'], axis=1), df['Survived'])

In [11]:
# Code Factorisation
x_train, x_test, y_train, y_test = train_test_split(df.drop(['Survived'], axis=1), df['Survived'])

dico_model = { 
    GradientBoostingClassifier : {'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.005],
    'max_depth': [3, 5, 8]},
    lgb.LGBMClassifier : {'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.005],
    'max_depth': [3, 5, 8]},
    xgb.XGBClassifier : { 'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.005],
    'max_depth': [3, 5, 8]},
    LogisticRegression : {'max_iter' : [50, 100 , 200 , 500],
    'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']},
    SVC : {'kernel' : ['linear', 'poly','rbf', 'sigmoid']},
    Perceptron : {'max_iter' : [100, 200, 500, 1000],
    'eta0' : [0.5, 1, 1.5]}
}

def model_comp(x_train, x_test, y_train, y_test, modeldictionnary):
    
    #preformatting outputs
    output = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
    estimators= dict()
    #Extracting model
    for i, (model, param) in enumerate(modeldictionnary.items()):
        #declaring model 
        mod = model()

        #grid
        grid = GridSearchCV(mod, param, cv = 5, verbose= 2 , n_jobs=-1, refit= False)
        grid.fit(x_train, y_train)

        #Extracting Best Parameters
        best_params = grid.best_params_
        #fitting model with best parameters
        best_mod = model(**best_params)
        best_mod.fit(x_train, y_train)
        y_hat= best_mod.predict(x_test)
        #computing metrics
        accuracy = accuracy_score(y_test, y_hat)
        precision = precision_score(y_test, y_hat)
        recall = recall_score(y_test, y_hat)
        f1= f1_score(y_test, y_hat)
        metrics = [accuracy, precision, recall, f1]
        #output format
        output.loc[mod.__class__.__name__] = metrics
        #saving best estimators in a dict
        estimators[mod.__class__.__name__] = best_mod

    return output.style.highlight_max(color='darkgrey'), estimators
        
sortie_df, best_estimators = model_comp(x_train,x_test, y_train, y_test, dico_model)
sortie_df

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
GradientBoostingClassifier,0.811659,0.857143,0.62069,0.72
LGBMClassifier,0.789238,0.916667,0.505747,0.651852
XGBClassifier,0.820628,0.84058,0.666667,0.74359
LogisticRegression,0.775785,0.746667,0.643678,0.691358
SVC,0.775785,0.824561,0.54023,0.652778
Perceptron,0.704036,0.610526,0.666667,0.637363
