In [136]:
# load packages

import re

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cm

# import models
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC

# preprocessing
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# model / feature selecting
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

''' ROAD MAP
0. save test data as validation data
1. data exploration
2. write function to clean, prepare all features
3. split train data into train/test
4. test different models using cv
5. tune best model using a hyperparameter grid search
6. train model on all train data, predict Y with validation data
'''

## 0. Read Data

In [137]:
# read csv files
# train.csv becomes the labelled training frame; test.csv is kept aside as the
# validation frame that the final prediction is made on
train_df = pd.read_csv('../input/titanic/train.csv')
validation_df = pd.read_csv('../input/titanic/test.csv')

## 1. Data Exploration

In [138]:
# get shape and head of train df
# the shape is printed, the head is rendered as the cell output
print(f"train_df shape: {train_df.shape}")
train_df.head()

In [139]:
# get shape and head of validation df (loaded from test.csv)
# the label names the frame that is actually measured
print(f"validation_df shape: {validation_df.shape}")
validation_df.head()

In [140]:
# share of passengers in the train frame with Survived == 1, as a percentage
survival_rate = (train_df.Survived == 1).mean() * 100
print(f"Proportion of survived passengers: {survival_rate:.4}%")

In [141]:
# count NaNs in train df (one count per column)
train_df.isna().sum()

In [142]:
# number of distinct ticket values (a missing value would count as one value)
n_unique_tickets = train_df.Ticket.nunique(dropna=False)
print(f"Number of unique tickets: {n_unique_tickets}")

In [143]:
# get number of unique cabins
# nunique() skips NaN, so passengers without a recorded cabin are not counted
# as one extra "cabin" (len(unique()) included NaN in the count)
print(f"Number of unique cabins: {train_df.Cabin.nunique()}")

## 2.1 Data Cleaning and Feature Engineering

### get metrics of all data

In [144]:
# merge train + validation df
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# The original row labels are kept on purpose (no ignore_index): later cells
# split this frame back by position and still rely on the per-file labels.
whole_df = pd.concat([train_df, validation_df])

In [145]:
# extract titles from names (the first run of letters that is followed by a '.')
whole_df['Title'] = whole_df.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
Common_Title = ['Mr', 'Miss', 'Mrs', 'Master']
# group similar titles
# the result is assigned back: inplace=True on a column selection is not
# guaranteed to write through to the frame (copy-on-write)
whole_df['Title'] = whole_df['Title'].replace({
    'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Miss',
    'Lady': 'Mrs',
    'Sir': 'Mr', 'Rev': 'Mr',
})
# group rest to 'Others' (this includes names without a recognisable title)
is_common = whole_df['Title'].isin(Common_Title).to_numpy()
whole_df['Title'] = whole_df['Title'].where(is_common, 'Others')

# calculate median age of each title; used later to impute missing ages
median_titles = whole_df.groupby('Title')['Age'].median()

In [146]:
# create family ids: passengers sharing surname + ticket string form one family
whole_df['Surname'] = whole_df.Name.str.extract(r'([A-Za-z]+),', expand=False)
whole_df['TixPref'] = whole_df.Ticket.str.extract(r'(.*\d)', expand=False)
whole_df['SurTix'] = whole_df['Surname'] + whole_df['TixPref']
whole_df['IsFamily'] = whole_df.SurTix.duplicated(keep=False)*1

# a missing age compares False, so passengers of unknown age are not children
whole_df['Child'] = (whole_df.Age <= 16).astype(int)
in_family_with_child = ((whole_df.IsFamily == 1) & (whole_df.Child == 1)).to_numpy()
# a missing SurTix cannot identify a family, so it is dropped here
FamilyWithChild = whole_df.loc[in_family_with_child, 'SurTix'].dropna().unique()

# FamilyId 0 = no family with a child; ids 1..n follow the order of FamilyWithChild
family_ids = {sur_tix: family_id for family_id, sur_tix in enumerate(FamilyWithChild, start=1)}
whole_df['FamilyId'] = whole_df.SurTix.map(family_ids).fillna(0).astype(int)

# new feature: connected survival
#   1   -> at least one family member with a known label survived
#   0   -> labels are known and nobody survived
#   0.5 -> not in a family with a child, or no label known for the family
# min_count=1 keeps the sum NaN when every Survived value in the family is
# missing (family present only in the validation rows); a plain sum() returns 0
# there and would wrongly mark such families as "nobody survived".
# NOTE(review): a passenger's own Survived value is part of the family sum.
Survived_by_FamilyId = whole_df.groupby('FamilyId').Survived.sum(min_count=1)
family_survivors = whole_df.FamilyId.map(Survived_by_FamilyId).where(
    (whole_df.FamilyId > 0).to_numpy()
)
whole_df['ConnectedSurvival'] = 0.5
whole_df.loc[(family_survivors >= 1).to_numpy(), 'ConnectedSurvival'] = 1
whole_df.loc[(family_survivors == 0).to_numpy(), 'ConnectedSurvival'] = 0

# drop helper columns that were only needed to build the feature
whole_df = whole_df.drop(columns=['Surname', 'TixPref', 'SurTix', 'IsFamily', 'Child', 'FamilyId'])

# split into train, validation df (by position; copies keep later edits independent)
n_train = len(train_df)
train_df = whole_df.iloc[:n_train].copy()
validation_df = whole_df.iloc[n_train:].copy()

In [147]:
def data_cleaning(x):
    '''
    INPUT
    x - pandas data frame with the raw passenger columns
        Name, Sex, Age, Ticket, Fare, Cabin and Embarked

    OUTPUT
    x - new pandas data frame; the input frame is not modified

    This function cleans data frame and prepares features for modeling:
    fills missing Embarked / Cabin / Fare / Age, encodes Title, Sex and the
    cabin letter as numbers, extracts the ticket and cabin numbers, adds
    Embarked dummies and drops Name, Sex, Ticket, Embarked and Cabin.

    Relies on the notebook-level names Common_Title and median_titles.
    All integer codes come from fixed tables, so two frames cleaned separately
    (train / validation) get the same code for the same value.

    NOTE(review): the Embarked dummies still use drop_first=True, so which
    Emb_* columns are returned depends on the values present in x; callers
    that combine several cleaned frames have to reconcile these columns.
    '''
    # work on a copy so the caller's frame is never changed as a side effect
    x = x.copy()

    # replace NaN cabin and embarked with 0 (cabin remains a string)
    x['Embarked'] = x['Embarked'].astype(object).fillna(0)
    x['Cabin'] = x['Cabin'].fillna('0')

    # replace Fare NaN with median
    x['Fare'] = x['Fare'].fillna(x['Fare'].median())

    # --- Feature Engineering ---

    # create feature 'Title' and group similar titles
    title = x['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    title = title.replace({
        'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Miss',
        'Lady': 'Mrs',
        'Sir': 'Mr', 'Rev': 'Mr',
    })
    # group rest to 'Others'
    title = title.where(title.isin(Common_Title).to_numpy(), 'Others')

    # replace NaN ages with the median age of the passenger's title;
    # titles without an entry in median_titles keep their NaN age
    x['Age'] = x['Age'].fillna(title.map(median_titles))

    # encode 'Title' to numerical data: alphabetical order over the known
    # titles, which is what a LabelEncoder yields when all of them occur
    title_codes = {name: code for code, name in enumerate(sorted(set(Common_Title) | {'Others'}))}
    x['Title'] = title.map(title_codes).astype(int)

    # 1 for male passengers, 0 otherwise
    x['Male'] = (x['Sex'] == 'male').astype(int)

    # first purely numeric token of the ticket, 0 when there is none
    def first_number(ticket):
        for token in str(ticket).split():
            if token.isdigit():
                return int(token)
        return 0

    x['Ticket_num'] = [first_number(ticket) for ticket in x['Ticket']]

    # first capital letter of Cabin as new feature; missing cabins and letters
    # outside the table share the last code
    deck_codes = {deck: code for code, deck in enumerate('ABCDEFGT')}
    deck = x['Cabin'].str.extract(r'([A-Z])', expand=False)
    x['Cabin_C'] = deck.map(deck_codes).fillna(len(deck_codes)).astype(int)

    # first run of digits in Cabin as integer, 0 when there is none
    cabin_digits = pd.to_numeric(x['Cabin'].str.extract(r'(\d+)', expand=False))
    x['Cabin_digit'] = pd.to_numeric(cabin_digits.fillna(0), downcast='integer')

    # create dummies for Embarked, drop first=True
    embarked_dummies = pd.get_dummies(x['Embarked'], drop_first=True, prefix='Emb')
    x = pd.concat([x, embarked_dummies], axis=1)

    # drop unnecessary columns
    return x.drop(columns=['Name', 'Sex', 'Ticket', 'Embarked', 'Cabin'])

In [148]:
# clean and feature-engineer a copy of the train data, leaving train_df as it is
clean_train_df = data_cleaning(train_df.copy())

# show cleaned df
clean_train_df.head()

In [149]:
# create new df for validation data
clean_val_df = validation_df.copy()

# clean and feature engineering
clean_val_df = data_cleaning(clean_val_df)

# insert Emb_C column
# data_cleaning builds the Embarked dummies with drop_first=True, so the Emb_C
# column can be absent from this frame. It is rebuilt from the raw Embarked
# values; a constant 0 would flag every passenger who embarked at 'C' as not
# having done so.
if 'Emb_C' not in clean_val_df.columns:
    # reuse the dtype of the existing dummies so the merged column stays uniform
    dummy_dtype = next(
        (clean_val_df[col].dtype for col in clean_val_df.columns if col.startswith('Emb_')),
        int,
    )
    emb_c = (validation_df['Embarked'] == 'C').to_numpy().astype(dummy_dtype)
    clean_val_df.insert(min(13, clean_val_df.shape[1]), 'Emb_C', emb_c)

# show cleaned df
clean_val_df.head()

In [150]:
# Age, Fare bins
# merge data so both frames share the same bin edges
# (DataFrame.append was removed in pandas 2.0; pd.concat replaces it)
whole_df = pd.concat([clean_train_df, clean_val_df])

# quintile bins; labels=False returns the ordinal bin number directly, which is
# the code a LabelEncoder assigns to the sorted intervals
whole_df['Age_bin5'] = pd.qcut(whole_df.Age, 5, labels=False).to_numpy()
whole_df['Fare_bin5'] = pd.qcut(whole_df.Fare, 5, labels=False).to_numpy()
# drop features that are now represented by their bins
whole_df = whole_df.drop(columns=['Age', 'Fare'])

# split into train, validation df (by position; copies so later edits on one
# frame never touch the merged frame)
n_clean_train = len(clean_train_df)
clean_train_df = whole_df.iloc[:n_clean_train].copy()
clean_val_df = whole_df.iloc[n_clean_train:].copy()

In [151]:
# show the train frame after binning
clean_train_df.head()

In [152]:
# drop survived feature from validation df
# a new frame is assigned instead of dropping in place on a slice, and
# errors='ignore' lets the cell be run again without a KeyError
clean_val_df = clean_val_df.drop(columns=['Survived'], errors='ignore')

clean_val_df.head()

In [153]:
# pair plot of a handful of features, coloured by the target
pairplot_columns = ['Survived', 'Male', 'Ticket_num', 'ConnectedSurvival', 'Pclass', 'Cabin_digit']
sns.pairplot(clean_train_df[pairplot_columns], hue="Survived")

## 2.2 Feature Reduction

In [154]:
X = clean_train_df.drop(['PassengerId', 'Survived'], axis=1)
y = clean_train_df.Survived

print(f"X shape before feature selection: {X.shape}")

# feature selection: chi2 test, keep best 70%
# NOTE(review): the selector is fitted on all training rows before the
# train/test split below, so the held-out rows influence the selection.
sel = SelectPercentile(chi2, percentile=70)
# keep X's row labels so X stays aligned with y whatever index the frame carries
X = pd.DataFrame(
    sel.fit_transform(X, y),
    columns=sel.get_feature_names_out(),
    index=X.index,
)

print(f"X shape after feature selection: {X.shape}")

In [155]:
# train data with reduced features (only the columns kept by the selector)
X.head()

## 3. Split Train Data

In [156]:
# hold out 30% of the training rows; seeded like the models below so every
# score in the notebook is reproducible after a kernel restart
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## 4. Testing Different Models on all Train Data

In [157]:
# Logistic regression: mean score over 5-fold cross validation on the train split
clf = LogisticRegression(random_state=42)
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
np.mean(scores)

In [158]:
# Random forest: mean score over 5-fold cross validation on the train split
clf = RandomForestClassifier(random_state=42)
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
np.mean(scores)

# optional follow-up, kept disabled: fit on all data and inspect importances
#clf.fit(X, y)
#clf.feature_importances_

In [159]:
# Gradient boosting: mean score over 5-fold cross validation on the train split
clf = GradientBoostingClassifier(random_state=42)
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
np.mean(scores)

In [160]:
# SVC with Standard Scaler
# the scaler is a pipeline step, so each CV fold fits it on that fold's
# training part only; scaling X_train up front lets the validation part of
# every fold leak into the scaling statistics
scaler = StandardScaler()
clf = make_pipeline(scaler, SVC(random_state=42))
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores.mean()

In [161]:
# NuSVC with StandardScaler
# the scaler is a pipeline step, so each CV fold fits it on that fold's
# training part only instead of on the whole of X_train
scaler = StandardScaler()
clf = make_pipeline(scaler, NuSVC(gamma='scale', kernel='poly', random_state=42))
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores.mean()

In [162]:
# Linear SVC with StandardScaler
# the scaler is a pipeline step, so each CV fold fits it on that fold's
# training part only; random_state matches the other models in this section
scaler = StandardScaler()
clf = make_pipeline(scaler, LinearSVC(max_iter=100_000, random_state=42))
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores.mean()

### 4.2 Neural Networks

In [163]:
import tensorflow as tf
# fix TensorFlow's global random seed before any layer is created
tf.random.set_seed(42)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [164]:
# one-hot encode y so it matches the two-unit output layer of the network
y_nn = to_categorical(y, num_classes=2)
# new split for the network; seeded so the run is reproducible
X_nn_train, X_nn_test, y_nn_train, y_nn_test = train_test_split(
    X, y_nn, test_size=0.3, random_state=42
)

In [165]:
# sequential model: two hidden ReLU blocks, then one sigmoid unit per column of y_nn
model = Sequential([
    # input layer
    Dense(units=512, input_dim=X.shape[1]),
    Activation("relu"),
    # hidden layer
    Dense(units=256),
    Activation("relu"),
    # output layer
    Dense(units=y_nn.shape[1]),
    Activation("sigmoid"),
])

In [166]:
# callback watching the validation loss: after 5 epochs without improvement the
# learning rate is multiplied by 0.2, but never pushed below 0.001
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=0.001,
)

# add loss, optimizer and metric(s)
model.compile(loss="binary_crossentropy", optimizer=SGD(), metrics=["accuracy"])

In [167]:
# train + validate
# features and labels must come from the SAME split: y_nn_train / y_nn_test
# belong to X_nn_train / X_nn_test. X_train / X_test stem from the earlier,
# independently shuffled split, so pairing them with these labels would train
# on mismatched rows.
model.fit(
    x = X_nn_train,
    y = y_nn_train,
    epochs = 30, # num. of iterations
    validation_data = (X_nn_test, y_nn_test),
    callbacks = [reduce_lr]
)

## 5. Tune Best Model

In [168]:
# base estimator for the grid search; list the names of its tunable parameters
clf = GradientBoostingClassifier(random_state=42)
clf.get_params().keys()

In [178]:
# hyperparameter grid for the gradient boosting model (3 x 3 x 3 candidates)
parameters = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1],
    max_depth=[1, 2, 3],
)

# exhaustive search, 5-fold cross validation on the train split, ranked by accuracy
cv = GridSearchCV(clf, parameters, scoring='accuracy', cv=5).fit(X_train, y_train)

# show results of grid search, best candidates first
result_columns = ['mean_fit_time', 'params', 'mean_test_score', 'rank_test_score']
cv_df = pd.DataFrame(cv.cv_results_)
cv_df.sort_values('rank_test_score')[result_columns].head()

In [179]:
# show the best estimator found by the grid search (refitted with the best parameters)
cv.best_estimator_

In [180]:
# plot feature importances of the best estimator, one horizontal bar per feature
feat_imp = cv.best_estimator_.feature_importances_
feat_names = cv.best_estimator_.feature_names_in_

# x / y are passed by keyword: seaborn 0.12+ no longer accepts them positionally
sns.barplot(x=feat_imp, y=feat_names)

In [181]:
# get score ('accuracy', the scoring passed to the grid search) on the held-out
# test split with the best estimator
cv.score(X_test, y_test)

## 6. Predict on Validation Data

### 6.1 Train best model on all train data

In [173]:
# refit on ALL training rows with the hyperparameters chosen by the grid
# search, instead of values copied by hand from one particular run
clf = GradientBoostingClassifier(random_state=42, **cv.best_params_)
clf.fit(X, y)

### 6.2 Get prediction

In [174]:
# drop unused features
# selecting by X.columns guarantees the validation matrix has exactly the
# training columns in the training order, and fails loudly if one is missing
unused_feat = [col for col in clean_val_df.columns if col not in X.columns]
X_val = clean_val_df.loc[:, list(X.columns)]

In [175]:
# predict results with the model that was trained on all train data;
# the grid-search object `cv` was only fitted on the X_train part
y_result = clf.predict(X_val).astype(int)

In [176]:
# combine passenger ids and predictions by position; going through plain arrays
# avoids any dependence on the row labels of validation_df
id_series = pd.Series(validation_df.PassengerId.to_numpy(), name="PassengerId")
y_series = pd.Series(y_result, name="Survived")
result = pd.DataFrame({"PassengerId": id_series, "Survived": y_series})
result.head()

In [177]:
# write to csv file in the working directory; index=False leaves out the row labels
result.to_csv('titanic_prediction.csv', index=False)