In [1]:
# load packages

import numpy as np
import pandas as pd
import re
from matplotlib import cm
import seaborn as sns

# import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn import svm

# preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

# model / feature selecting
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.feature_selection import SelectPercentile, chi2

''' ROAD MAP
0. save test data as validation data
1. data exploration
2. write function to clean, prepare all features
3. split train data into train/test
4. test different models using cv
5. tune best model using hyper parameter
6. predict Y with validation data
'''

' ROAD MAP\n0. save test data as validation data\n1. data exploration\n2. write function to clean, prepare all features\n3. split train data into train/test\n4. test different models using cv\n5. tune best model using hyper parameter\n6. predict Y with validation data\n'

## 0. Read Data

In [2]:
#  read csv files
train_df = pd.read_csv('../input/titanic/train.csv')
validation_df = pd.read_csv('../input/titanic/test.csv')

## 1. Data Exploration

In [3]:
# get shape and head of train df
print(f"train_df shape: {train_df.shape}")
train_df.head()

train_df shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# get shape and head of test df
print(f"train_df shape: {validation_df.shape}")
validation_df.head()

train_df shape: (418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# get proportion of survived passengers (train)
print(f"Proportion of survived passengers: {len(train_df.Survived[train_df.Survived==1]) / len(train_df.Survived)*100:.4}%")

Proportion of survived passengers: 38.38%


In [6]:
# count NaNs in train df
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# get number of unique ticket numbers
print(f"Number of unique tickets: {len(train_df.Ticket.unique())}")

Number of unique tickets: 681


In [8]:
# get number of unique cabins
print(f"Number of unique cabins: {len(train_df.Cabin.unique())}")

Number of unique cabins: 148


## 2.1 Data Cleaning and Feature Engineering

In [9]:
def data_cleaning(x):
    '''
    INPUT
    x - pandas data frame
    
    OUTPUT
    None
    
    This function cleans data frame and prepares feature for modeling.
    '''
    
    # replace NaN ages with mean age
    x.Age.fillna(x.Age.mean(), inplace=True)
    
    # replace NaN cabin and embarked with 0
    # cabin remains as string
    x.Embarked.fillna(0, inplace=True)
    x.Cabin.fillna('0', inplace=True)
    
    ''' Feature Engineering '''
    
    # create dummies for sex, drop first=True
    x['Male'] = pd.get_dummies(x.Sex, drop_first=True)

    # get only digits of tickets
    ticket_list = []
    for txt in x.Ticket:
        tlist = [int(s) for s in txt.split() if s.isdigit()]
        if len(tlist) > 0:
            ticket_list.append(tlist[0])
        else:
            ticket_list.append(0)
    x['Ticket_num'] = ticket_list
      
    # extract characters from Cabin as new feature
    Cabin_char = x.Cabin.str.extract(pat="([A-Z])", expand=False)
    # label encode 'Cabin_char'
    le = LabelEncoder()
    Cabin_encode = le.fit_transform(Cabin_char)
    # add encoded chars as new column
    x['Cabin_C'] = Cabin_encode
    
    # extract digits from Cabin
    cabin_digits = x.Cabin.str.extract(pat="(\d+)", expand=False)
    # fill Nan with 0
    cabin_digits.fillna(0, inplace=True)
    # convert to integer as new column
    x['Cabin_digit'] = pd.to_numeric(cabin_digits, downcast='integer')
    
    # creat dummies for Embarked, drop first=True
    x = pd.concat([x, pd.get_dummies(x.Embarked, drop_first=True, prefix='Emb')], axis=1)
    
    # drop unnecessary columns
    x.drop(['Name', 'Sex', 'Ticket', 'Embarked', 'Cabin'], axis=1, inplace=True)
    
    return x

In [10]:
# create new df for train data
clean_train_df = train_df.copy()

# clean and feature engineering
clean_train_df = data_cleaning(clean_train_df)

# show cleaned df
clean_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Male,Ticket_num,Cabin_C,Cabin_digit,Emb_C,Emb_Q,Emb_S
0,1,0,3,22.0,1,0,7.25,1,21171,8,0,0,0,1
1,2,1,1,38.0,1,0,71.2833,0,17599,2,85,1,0,0
2,3,1,3,26.0,0,0,7.925,0,3101282,8,0,0,0,1
3,4,1,1,35.0,1,0,53.1,0,113803,2,123,0,0,1
4,5,0,3,35.0,0,0,8.05,1,373450,8,0,0,0,1


In [11]:
# create new df for validation data
clean_val_df = validation_df.copy()

# clean and feature engineering
clean_val_df = data_cleaning(clean_val_df)

# insert Emb_C column
clean_val_df.insert(10, 'Emb_C', 0)

# show cleaned df
clean_val_df.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Male,Ticket_num,Cabin_C,Cabin_digit,Emb_C,Emb_Q,Emb_S
0,892,3,34.5,0,0,7.8292,1,330911,7,0,0,1,0
1,893,3,47.0,1,0,7.0,0,363272,7,0,0,0,1
2,894,2,62.0,0,0,9.6875,1,240276,7,0,0,1,0
3,895,3,27.0,0,0,8.6625,1,315154,7,0,0,0,1
4,896,3,22.0,1,1,12.2875,0,3101298,7,0,0,0,1


In [12]:
clean_train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Male', 'Ticket_num', 'Cabin_C', 'Cabin_digit', 'Emb_C', 'Emb_Q',
       'Emb_S'],
      dtype='object')

In [13]:
# plot some Features as pair plot
# sns.pairplot(clean_train_df[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin_C', 'Cabin_digit']], hue="Survived")

## 2.2 Feature Reduction

In [14]:
X = clean_train_df.drop(['PassengerId', 'Survived'], axis=1)
y = clean_train_df.Survived

print(f"X shape before feature selection: {X.shape}")

# feature selection: chi2 test, keep best 70%
sel = SelectPercentile(chi2, percentile=70)
X = sel.fit_transform(X, y)

print(f"X shape after feature selection: {X.shape}")

X shape before feature selection: (891, 12)
X shape after feature selection: (891, 8)


## 3. Split Train Data

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## 4. Testing Different Models on all Train Data

In [16]:
# logistic Regression
clf = LogisticRegression(random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
scores.mean()

0.6296277697570772

In [17]:
# Random Forest
clf = RandomForestClassifier(random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
scores.mean()

0.8350386039796623

In [18]:
# SVC with Standard Scaler
scaler = StandardScaler()
clf = SVC(random_state=42)
scores = cross_val_score(clf, scaler.fit_transform(X), y, cv=5)
scores.mean()

0.7879229175820727

In [19]:
# NuSVC with StandardScaler
scaler = StandardScaler()
clf = NuSVC(gamma='scale', kernel='poly', random_state=42)
scores = cross_val_score(clf, scaler.fit_transform(X), y, cv=5)
scores.mean()

0.7980227229929069

In [20]:
# Linear SVC with StandardScaler
scaler = StandardScaler()
clf = LinearSVC(max_iter=100_000)
scores = cross_val_score(clf, scaler.fit_transform(X), y, cv=5)
scores.mean()

0.7788839369782186

### 4.2 Neuronal Networks

In [21]:
import tensorflow as tf
tf.random.set_seed(42)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [22]:
# reshape y for model
# y_nn = np.reshape(y.values, newshape=(-1, 1))
y_nn = to_categorical(y, num_classes=2)
# new split
X_nn_train, X_nn_test, y_nn_train, y_nn_test = train_test_split(X, y_nn, test_size=0.3)

In [23]:
# create sequential model
model = Sequential()

# input layer
model.add(Dense(units=512, input_dim=X.shape[1]))
model.add(Activation("relu"))
# hidden layer
model.add(Dense(units=256))
model.add(Activation("relu"))
# output layer
model.add(Dense(units=y_nn.shape[1]))
model.add(Activation("sigmoid"))

2022-03-20 17:01:58.810294: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [24]:
# add loss, optimizer and metric(s)
model.compile(
    loss = "binary_crossentropy",
    optimizer = SGD(),
    metrics = ["accuracy"],
)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=0.001)

In [25]:
# train + validate
model.fit(
    x = X_train,
    y = y_nn_train,
    epochs = 30, # num. of iterations
    validation_data = (X_test, y_nn_test),
    callbacks = [reduce_lr]
)

2022-03-20 17:01:59.152019: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f7bdf4fff10>

## 5. Tune Best Model

In [26]:
clf = RandomForestClassifier(random_state=42)
clf.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [27]:
parameters = {
    'n_estimators': [100, 200, 500],
    'min_samples_leaf': [2, 3],
}

cv = GridSearchCV(estimator=clf, param_grid=parameters, scoring='accuracy', cv=5)
cv.fit(X_train, y_train)

# show results of grid search
cv_df = pd.DataFrame.from_dict(cv.cv_results_)
cv_df[['mean_fit_time', 'params', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,params,mean_test_score,rank_test_score
3,0.141935,"{'min_samples_leaf': 3, 'n_estimators': 100}",0.812258,1
4,0.282455,"{'min_samples_leaf': 3, 'n_estimators': 200}",0.810658,2
5,0.699632,"{'min_samples_leaf': 3, 'n_estimators': 500}",0.809032,3
0,0.14524,"{'min_samples_leaf': 2, 'n_estimators': 100}",0.805806,4
2,0.713247,"{'min_samples_leaf': 2, 'n_estimators': 500}",0.804219,5


In [28]:
# get score ('accuracy') on test data with best estimator
cv.score(X_test, y_test)

0.8432835820895522

## 6. Predict on Validation Data

In [29]:
X_val = clean_val_df.drop(['PassengerId'], axis=1)

In [30]:
# clean NaN values
X_val.Fare.fillna(0, inplace=True)
X_val.isna().sum()

Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Male           0
Ticket_num     0
Cabin_C        0
Cabin_digit    0
Emb_C          0
Emb_Q          0
Emb_S          0
dtype: int64

In [31]:
print(f"X shape before feature selection: {X_val.shape}")

X_val = sel.transform(X_val)

print(f"X shape after feature selection: {X_val.shape}")

X shape before feature selection: (418, 12)
X shape after feature selection: (418, 8)


In [32]:
# predict results
y_result = cv.predict(X_val)

In [33]:
# concatenate y_results with passenger ids
id_series = pd.Series(validation_df.PassengerId)
y_series = pd.Series(y_result, name="Survived" )
result = pd.concat([id_series, y_series], axis=1)
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [34]:
# write to csv file
result.to_csv('titanic_prediction.csv', index=False)