In [4]:
import os
import pandas as pd
cwd= os.getcwd() 
path = os.path.join(cwd,'data')

def get_train_set():
    print ('loading provided train set.')
    fp = os.path.join(path,'train_titanic.csv')
    df_train = pd.read_csv(fp, encoding='ISO-8859-1',low_memory=False)
    df_train['Sex'] = df_train['Sex'].map({'male': 1, 'female': 0})
    df_train['Sex'] = df_train['Sex'].astype(int)
    return df_train


df_train= get_train_set ()
df_train.head()

loading provided train set.


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [2]:
def get_test_set():    
    print ('loading provided test set .')
    fp = os.path.join(path,'test_titanic.csv')
    df_test = pd.read_csv(fp, encoding='ISO-8859-1',low_memory=False)
    
    return df_test

df_test= get_test_set()
df_test['Sex'] = df_test['Sex'].map({'male': 1, 'female': 0})

df_test['Sex'] = df_test['Sex'].astype(int)

df_test.head()

loading provided test set .


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S


In [3]:
def remove_missing_survival_info(df):
    print('Removing samples with missing survival information.')
    print('Length before cleaning= {:,}'.format(len(df)))
    df = df.dropna(subset=['Sex'])
    print('Length after cleaning= {:,}'.format(len(df)))
    return df

df_train = remove_missing_survival_info(df_train)


Removing samples with missing survival information.
Length before cleaning= 891
Length after cleaning= 891


In [4]:
def get_X_train_test(df_train, df_test): 
    print('Extracting features.')
    df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
    df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1
    cols_to_keep = [
        'Pclass',
        'Sex',
        'Age',
        'FamilySize'
    ]

    X_train = df_train[cols_to_keep]
    X_test = df_test[cols_to_keep]
    return X_train, X_test, cols_to_keep


X_train, X_test, features  = get_X_train_test(df_train, df_test)

y_train = df_train['Survived']

X_train.shape

Extracting features.


(891, 4)

In [5]:
print("X_train:")
print(X_train.head())

print("\nX_test:")
print(X_test.head())

print("\nFeatures:")
print(features)


X_train:
   Pclass  Sex   Age  FamilySize
0       3    1  22.0           2
1       1    0  38.0           2
2       3    0  26.0           1
3       1    0  35.0           2
4       3    1  35.0           1

X_test:
   Pclass  Sex   Age  FamilySize
0       3    1  34.5           1
1       3    0  47.0           2
2       2    1  62.0           1
3       3    1  27.0           1
4       3    0  22.0           3

Features:
['Pclass', 'Sex', 'Age', 'FamilySize']


In [6]:
print(X_train.dtypes)


Pclass          int64
Sex             int32
Age           float64
FamilySize      int64
dtype: object


In [31]:
from sklearn.preprocessing import LabelEncoder
def label_encoder(X_train,X_test):    
    print ('label encoding.')
    X_train = X_train.copy()
    X_test = X_test.copy() 

    for col in columns_to_encode:
        le = LabelEncoder().fit(X_train[col].astype(str)) 
        X_train[col] = le.transform(X_train[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))
    return X_train,X_test

X_train,X_test= label_encoder(X_train,X_test)
X_train.dtypes


label encoding.


Pclass          int64
Sex             int32
Age           float64
FamilySize      int64
dtype: object

In [8]:
X_train.isna().any()

Pclass        False
Sex           False
Age            True
FamilySize    False
dtype: bool

In [9]:
def fill_na(df):
    print ('filling NaN...')
    for col in list(df):
        if df[col].isna().any():
            df[col]= df[col].fillna(0)

fill_na(X_train)
fill_na(X_test) 
X_train.isna().any().any(), X_test.isna().any().any()

filling NaN...
filling NaN...


(False, False)

In [10]:
X_train.isna().any()

Pclass        False
Sex           False
Age           False
FamilySize    False
dtype: bool

In [11]:
from sklearn.preprocessing import MinMaxScaler
def normalize(X_train, X_test):
    print ('normalizing.')
    scaler= MinMaxScaler()
    X_train_scaled= scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled 

X_train_scaled, X_test_scaled  = normalize(X_train, X_test)
type(X_train_scaled)

normalizing.


numpy.ndarray

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier 
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
X_train_scaled, X_test_scaled = normalize(X_train, X_test)


y_test = y_train[:len(y_train)//5] 

X_test_scaled = X_train_scaled[:len(y_train)//5, :]

def run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train, y_test= None):
    print ('Running GridSearchCV.')
    grid_clf = GridSearchCV(clf, param_grid=grid_values,scoring='f1')
    grid_clf.fit(X_train_scaled, y_train)
    print('Grid best parameter (max f1 ): ', grid_clf.best_params_) 
    print('Grid best score (f1): ', grid_clf.best_score_) 

    if not y_test is None:
        test_score= grid_clf.score(X_test_scaled, y_test)
        print("test f1= {}".format(test_score))
        
def run_all_classifiers(X_train_scaled, X_test_scaled, y_train, y_test=None, list_classifiers=None):
    if list_classifiers is None or 'LogisticRegression' in list_classifiers:
        print('\nLogisticRegression.')
        clf = LogisticRegression(max_iter=10000)
        grid_values = {'C': [0.005, 0.01, 0.1, 1, 100, 10000, 100000]}
        run_GridSearchCV(clf, grid_values, X_train_scaled, X_test_scaled, y_train, y_test)

    if list_classifiers is None or 'DecisionTreeClassifier' in list_classifiers:
        print('\nDecisionTreeClassifier')
        clf = DecisionTreeClassifier()       
        grid_values = {'max_depth': [2, 5, 7, 20, 50]}
        run_GridSearchCV(clf, grid_values, X_train_scaled, X_test_scaled, y_train, y_test)   

    if list_classifiers is None or 'RandomForestClassifier' in list_classifiers:
        print('\nRandomForestClassifier.')
        clf = RandomForestClassifier()       
        grid_values = {'n_estimators': [20, 50]}  
        run_GridSearchCV(clf, grid_values, X_train_scaled, X_test_scaled, y_train, y_test)   

    if list_classifiers is None or 'SVC' in list_classifiers:
        print('\nSVC')
        clf = SVC()  
        grid_values = {'C': [0.005, 0.01]}  
        run_GridSearchCV(clf, grid_values, X_train_scaled, X_test_scaled, y_train, y_test)   

    if list_classifiers is None or 'NB' in list_classifiers:
        print('\nNB')
        clf = GaussianNB().fit(X_train_scaled, y_train)
        train_f1 = f1_score(y_train, clf.predict(X_train_scaled))
        print("train set f1= {}".format(train_f1))
        if not y_test is None:
            test_f1 = f1_score(y_test, clf.predict(X_test_scaled))
            print("train set f1= {}".format(test_f1))

    if list_classifiers is None or 'GradientBoostingClassifier' in list_classifiers:
        print('\nGradientBoostingClassifier.')
        clf = GradientBoostingClassifier()         
        grid_values = {'max_depth': [3, 5, 7]}
        run_GridSearchCV(clf, grid_values, X_train_scaled, X_test_scaled, y_train, y_test)   

    if list_classifiers is None or 'MLP' in list_classifiers:
        print('\nMLP.')
        clf = MLPClassifier(hidden_layer_sizes=[50])  
        grid_values = {'alpha': [0.001, 0.01, 0.1, 1, 10]}
        run_GridSearchCV(clf, grid_values, X_train_scaled, X_test_scaled, y_train, y_test)   

    if list_classifiers is None or 'XGB' in list_classifiers:
        print('\nXGB.')
        clf = XGBClassifier().fit(X_train_scaled, y_train)
        y_predicted = clf.predict(X_test_scaled)
        print('f1_score  = {:.2}'.format(f1_score(y_test, y_predicted)))

# Usage
list_classifiers = ['RandomForestClassifier', 'LogisticRegression', 'DecisionTreeClassifier', 'SVC', 'NB', 'GradientBoostingClassifier', 'XGB']
run_all_classifiers(X_train_scaled, X_test_scaled, y_train, y_test)

normalizing.

LogisticRegression.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'C': 0.1}
Grid best score (f1):  0.7092684925689026
test f1= 0.7272727272727273

DecisionTreeClassifier
Running GridSearchCV.
Grid best parameter (max f1 ):  {'max_depth': 50}
Grid best score (f1):  0.7497965890496477
test f1= 0.8828828828828829

RandomForestClassifier.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'n_estimators': 20}
Grid best score (f1):  0.7486465126884179
test f1= 0.8695652173913044

SVC
Running GridSearchCV.
Grid best parameter (max f1 ):  {'C': 0.01}
Grid best score (f1):  0.6261557023715565
test f1= 0.7241379310344828

NB
train set f1= 0.7325227963525835
train set f1= 0.6935483870967741

GradientBoostingClassifier.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'max_depth': 5}
Grid best score (f1):  0.7664158686540038
test f1= 0.8672566371681415

MLP.
Running GridSearchCV.




Grid best parameter (max f1 ):  {'alpha': 0.1}
Grid best score (f1):  0.7303046283309959
test f1= 0.7207207207207208

XGB.
f1_score  = 0.83


In [17]:
def apply_preprocessing(df_train, df_test):

    df_train = remove_missing_survival_info(df_train)
    y_train = df_train['Survived']

    if 'Survived' in list(df_test):
        df_test = remove_missing_survival_info(df_test)
        y_test = df_test['Survived']
    else:
        y_test = None 



    X_train, X_test, features = get_X_train_test(df_train, df_test)

    X_train, X_test = label_encoder(X_train, X_test)

    fill_na(X_train)
    fill_na(X_test)

    X_train_scaled, X_test_scaled = normalize(X_train, X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, features


In [32]:
from sklearn.model_selection import train_test_split

df = get_train_set()
df_train, df_test = train_test_split(df, random_state=0)
X_train_scaled, X_test_scaled, y_train, y_test, features = apply_preprocessing(df_train, df_test)

print('X_train_scaled shape= {}\nX_test_scaled shape= {}'.format(X_train_scaled.shape, X_test_scaled.shape))
print('y_train set shape= {}\ny_test set shape= {}'.format(y_train.shape, y_test.shape))


loading provided train set.
Removing samples with missing survival information.
Length before cleaning= 668
Length after cleaning= 668
Removing samples with missing survival information.
Length before cleaning= 223
Length after cleaning= 223
Extracting features.
label encoding.
filling NaN...
filling NaN...
normalizing.
X_train_scaled shape= (668, 4)
X_test_scaled shape= (223, 4)
y_train set shape= (668,)
y_test set shape= (223,)


In [39]:
def run_all_classifiers(X_train, X_test, y_train, y_test, list_classifiers):
    for classifier_name in list_classifiers:
        if classifier_name == 'LogisticRegression':
            print ('\nLogisticRegression.')
            clf = LogisticRegression()
            grid_values = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
            run_GridSearchCV(clf, grid_values, X_train, X_test, y_train, y_test)

        elif classifier_name == 'DecisionTreeClassifier':
            print ('\nDecisionTreeClassifier')
            clf = DecisionTreeClassifier()
            grid_values = {'max_depth': [5, 10, 15, 20, 25, 30]}
            run_GridSearchCV(clf, grid_values, X_train, X_test, y_train, y_test)

        elif classifier_name == 'RandomForestClassifier':
            print ('\nRandomForestClassifier.')
            clf = RandomForestClassifier()
            grid_values = {'n_estimators': [10, 20, 30, 40, 50]}
            run_GridSearchCV(clf, grid_values, X_train, X_test, y_train, y_test)

        elif classifier_name == 'NB':
            print ('\nNB')
            clf = GaussianNB()
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_train)
            train_score = f1_score(y_train, y_pred)
            print("Naive Bayes (NB):")
            print("Train F1 Score:", train_score)
            print("\n")

        elif classifier_name == 'GradientBoostingClassifier':
            print ('\nGradientBoostingClassifier.')
            clf = GradientBoostingClassifier()
            grid_values = {'max_depth': [3, 5, 7, 9, 11]}
            run_GridSearchCV(clf, grid_values, X_train, X_test, y_train, y_test)

        elif classifier_name == 'MLP':
            print ('\nMLP.')
            clf = MLPClassifier()
            grid_values = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]}
            run_GridSearchCV(clf, grid_values, X_train, X_test, y_train, y_test)

        elif classifier_name == 'xgboost':
            print ('\nxgboost.')
            clf = XGBClassifier()
            grid_values = {'n_estimators': [50, 100, 150, 200, 250]}
            run_GridSearchCV(clf, grid_values, X_train, X_test, y_train, y_test)

list_classifiers = [
    'LogisticRegression',
    'DecisionTreeClassifier',
    'RandomForestClassifier',
    'NB',
    'GradientBoostingClassifier',
    'MLP',
    'xgboost',
]

run_all_classifiers(X_train_scaled, X_test_scaled, y_train, y_test, list_classifiers)


LogisticRegression.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'C': 0.1}
Grid best score (f1):  0.7109181282099429
test f1= 0.7065868263473053

DecisionTreeClassifier
Running GridSearchCV.
Grid best parameter (max f1 ):  {'max_depth': 5}
Grid best score (f1):  0.7122180584184633
test f1= 0.6878980891719746

RandomForestClassifier.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'n_estimators': 10}
Grid best score (f1):  0.7486218516955113
test f1= 0.710843373493976

NB
Naive Bayes (NB):
Train F1 Score: 0.7306122448979592



GradientBoostingClassifier.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'max_depth': 3}
Grid best score (f1):  0.7508988747762366
test f1= 0.7547169811320756

MLP.
Running GridSearchCV.




Grid best parameter (max f1 ):  {'alpha': 0.01}
Grid best score (f1):  0.7266009922156101
test f1= 0.7058823529411765

xgboost.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'n_estimators': 150}
Grid best score (f1):  0.7308338790341553
test f1= 0.7349397590361447


In [41]:
import numpy as np
best_params = {'max_depth': 7}

clf = GradientBoostingClassifier(**best_params)

f1_train = cross_val_score(clf, X_train_scaled, y_train, cv=5, scoring='f1')
print("Cross-validated F1 scores on the whole set:", f1_train)

clf.fit(X_train_scaled, y_train)

y_predict = clf.predict(X_test_scaled)

print("Mean of predictions:", np.mean(y_predict))

Cross-validated F1 scores on the whole set: [0.70707071 0.75925926 0.75       0.71578947 0.77083333]
Mean of predictions: 0.3632286995515695
