In [30]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.base import clone

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


# Problem Definition & Goal

The task is to predict if a passenger can survive the disaster. Since there are only 2 possible outcomes: alive and dead, I believe this is a classification problem, or more specifically, a binary classification one.      

# Exploratory Data Analysis & Data Cleaning

First, let's load the dataset and import all necessary modules

In [31]:
train = pd.read_csv('../input/titanic/train.csv')
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [57]:
test = pd.read_csv('../input/titanic/test.csv')
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


Some general statistics to have an overview of the dataset:

In [32]:
train.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Mannion, Miss. Margareth",male,,,,347082.0,,C23 C25 C27,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [58]:
test.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
unique,,,418,2,,,,363,,76,3
top,,,"Carlsson, Mr. Carl Robert",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,1,266,,,,5,,3,270
mean,1100.5,2.26555,,,30.27259,0.447368,0.392344,,35.627188,,
std,120.810458,0.841838,,,14.181209,0.89676,0.981429,,55.907576,,
min,892.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,1.0,,,21.0,0.0,0.0,,7.8958,,
50%,1100.5,3.0,,,27.0,0.0,0.0,,14.4542,,
75%,1204.75,3.0,,,39.0,1.0,0.0,,31.5,,


Some insights observed from the stats table above:  
- There are 3 different classes of ticket on the Titanic, but the mean of the train set is above 2.3, which means the number of First class passengers is far lower than that of those in Second and Third class. Fortunately, this is also the case in the test set, which means the train-test splitting was done pretty well, so we do not need to care about skewed train set.
- Column PassengerId is just a sequence of number to distinguish between each row, so this can be dropped without affecting the model performance.
- Columns with categorical values such as Pclass, Sex, and Embarked can be applied with one hot encoding to ensure that the model does not mistake them with continuous values


Some other additional observations:  
- It is not trivial to impute missing values in column Cabin as there are plenty of such instances and the correlations between this column and other features avaiable in our datasets are not that explicit. One possible way to tackle this would be to combine this dataset with some other dataset which has passenger names and the cabin they were in. Another the way is to find which cabins each Pclass were assigned in, then fill in the missing values in column Cabin based on our available data on ticket class. 
- There are only two gender values present in this dataset, under the format of string "male" and "female". We can use a one-hot encoder for this.

In [33]:
train.Sex.unique()

array(['male', 'female'], dtype=object)

Another interesting information is that the number of ticket (681) is fewer than the number of passengers (891). This is supposedly because the children used the same tickets with their adult companions?

In [34]:
len(train.Ticket.unique())

681

In [35]:
np.nan in train.Ticket.unique()

False

# Handling missing data

## Place of embarkation

There are two missing values in column Embarked, so let's have a look at them to see if we can find a way to handle this

In [36]:
train[ train.Embarked.isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


When attempting to find information about these two people, I came across this [link](https://www.encyclopedia-titanica.org/titanic-survivor/amelia-icard.html), which is quite useful. From this, we can conclude that these people embarked from Southampton, so these 2 values should be imputed with "S"

In [37]:
train_cleaned = train.copy()
train_cleaned.at[train_cleaned.Embarked.isna(), "Embarked"] = "S"

## Age

There are almost 200 data points with missing age values. There are 3 possible ways which I suppose can be used to handle this problem:  
- Replace all of them with the mean value: this is probably the easiest way to go, but since the standard deviation is quite large, this might not be the most ideal option.
- Replace with the median
- Use a regression model to impute those missing values

For now I will use the first method because of its simplicity, then if there is enough time I will attempt with the third option.

In [38]:
train_cleaned.at[train_cleaned.Age.isna(), "Age"] = train_cleaned.Age.mean()

Let's see what we have gotten so far:

In [39]:
train_cleaned.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,891.0,891.0,891.0,891.0,891.0,204,891
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Mannion, Miss. Margareth",male,,,,347082.0,,C23 C25 C27,S
freq,,,,1,577,,,,7.0,,4,646
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,13.002015,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,22.0,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,29.699118,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,35.0,1.0,0.0,,31.0,,


Now that the dataset is much cleaner now with only one column with missing data, we can move on to prepare our data and train models

# Data Preparation

First, we can start with encoding columns with categorical data types, which are Pclass, Embarked and Sex.

## Pclass encoding

In [40]:
train_prepared = train_cleaned.copy()
pclass_onehot_encoder = OneHotEncoder()
pclass_onehot_encode = pclass_onehot_encoder.fit_transform(train_prepared[['Pclass']]).toarray()

In [41]:
for cat in pclass_onehot_encoder.categories_[0]:
    train_prepared.insert(2, 'Pclass_'+str(cat), pclass_onehot_encode[:,cat-1])

## Embarked encoding

Next, we also need to encode the Embarked column:

In [42]:
embarked_1hot_encoder = OneHotEncoder()
embarked_1hot_encode = embarked_1hot_encoder.fit_transform(train_prepared[['Embarked']]).toarray()

In [43]:
for cat in embarked_1hot_encoder.categories_[0]:
    insert_index = len(train_prepared.columns)-1
    value_index = np.where(embarked_1hot_encoder.categories_[0] == cat)[0][0]
    train_prepared.insert(insert_index, "Embarked_"+cat, embarked_1hot_encode[:, value_index])

## Sex encoding

And last but not least, the sex of each passenger:

In [44]:
sex_1hot_encoder = OneHotEncoder()
sex_1hot_encode = sex_1hot_encoder.fit_transform(train_prepared[["Sex"]]).toarray()

In [45]:
for cat in sex_1hot_encoder.categories_[0]:
    insert_index = len(train_prepared.columns)-1
    value_index = np.where(sex_1hot_encoder.categories_[0] == cat)[0][0]
    train_prepared.insert(insert_index, "Sex_"+cat, sex_1hot_encode[:, value_index])

Now, we can drop all the unnecessary columns and ones which takes long time or more effort to clean, then we can start training some models and see what works best in this case.

In [46]:
drop_columns = ['PassengerId', 'Name', 'Ticket','Cabin', 'Pclass', 'Sex', 'Embarked']
train_prepared = train_prepared.drop(drop_columns, axis=1)
train_prepared

Unnamed: 0,Survived,Pclass_3,Pclass_2,Pclass_1,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,0,1.0,0.0,0.0,22.000000,1,0,7.2500,0.0,0.0,1.0,0.0,1.0
1,1,0.0,0.0,1.0,38.000000,1,0,71.2833,1.0,0.0,0.0,1.0,0.0
2,1,1.0,0.0,0.0,26.000000,0,0,7.9250,0.0,0.0,1.0,1.0,0.0
3,1,0.0,0.0,1.0,35.000000,1,0,53.1000,0.0,0.0,1.0,1.0,0.0
4,0,1.0,0.0,0.0,35.000000,0,0,8.0500,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0.0,1.0,0.0,27.000000,0,0,13.0000,0.0,0.0,1.0,0.0,1.0
887,1,0.0,0.0,1.0,19.000000,0,0,30.0000,0.0,0.0,1.0,1.0,0.0
888,0,1.0,0.0,0.0,29.699118,1,2,23.4500,0.0,0.0,1.0,1.0,0.0
889,1,0.0,0.0,1.0,26.000000,0,0,30.0000,1.0,0.0,0.0,0.0,1.0


## Split predictors and target

In [47]:
X_train = train_prepared.drop('Survived', axis=1)
y_train = train_prepared.Survived

# Model selection and training

Now that we have our data cleaned, encoded into numerical values and split into predictors and target, it's time to train our classification models. I will start with a single Decision Tree

In [48]:
tree_grid_search_params = {
    'criterion': ['gini','entropy'],
    'max_depth': [3, 4, None],
}
tree_grid_search = GridSearchCV(DecisionTreeClassifier(), tree_grid_search_params, scoring='accuracy', cv=4, return_train_score=True)
tree_grid_search.fit(X_train, y_train)
tree_grid_search.best_score_

0.8193198804185352

The result is not that impressive, but hopefully a collection of trees can do better. To save some training time, I will take advantage of the parameters of the best tree:

In [49]:
tree_grid_search.best_params_

{'criterion': 'entropy', 'max_depth': 3}

In [50]:
rfr_grid_search_params = {
    'n_estimators': [10, 50, 100],
    'criterion': ['entropy'],
    'max_depth': np.arange(3,20)
}
rfr_grid_search = GridSearchCV(RandomForestClassifier(), rfr_grid_search_params, scoring='accuracy', cv=4, return_train_score=True)
rfr_grid_search.fit(X_train, y_train)
rfr_grid_search.best_score_

0.8339039712358098

In [51]:
rfr_grid_search.best_params_

{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 50}

As can be seen, the performance is slightly improved, but not that great. Let's try with the K Neighbors algorithms to see how it does: 

In [52]:
knn_clf = KNeighborsClassifier()
knn_scores = cross_val_score(knn_clf, X_train, y_train, scoring='accuracy')
knn_scores.mean()

0.7037474107086812

So it performs even worse. One of the possible reasons for why these models fall short of my expectations is that all the features are being used. Maybe if only the ones with high correlation is chosen, our models may perform better. Let's find out:

In [53]:
for f in sorted(zip(X_train.columns, rfr_grid_search.best_estimator_.feature_importances_),key=lambda x: x[1]):
    print(f)

('Embarked_Q', 0.009104966811795201)
('Embarked_C', 0.01376986882301593)
('Embarked_S', 0.017351584834613312)
('Pclass_2', 0.030923310352367395)
('Parch', 0.044292743768548686)
('Pclass_1', 0.044825994146182685)
('SibSp', 0.054221402278952696)
('Pclass_3', 0.062183070027481246)
('Sex_male', 0.15976383823778695)
('Age', 0.1665879523748859)
('Fare', 0.19783860102161563)
('Sex_female', 0.19913666732275442)


For the sake of experimentality, I will only train the model with the top 5 most important features and see if that has any considerable impact on the model performance in general. Once again, I will use the parameters of the best estimators:

In [54]:
X_train_selected = X_train[['Sex_male', 'Fare', 'Age', 'Pclass_3']]

In [55]:
rfr_clf2 = RandomForestClassifier(criterion='entropy', max_depth=8)
rfr_clf2_scores = cross_val_score(rfr_clf2, X_train_selected, y_train, cv=5, scoring='accuracy')
rfr_clf2_scores.mean()

0.8305316678174629

In [56]:
knn_clf2 = KNeighborsClassifier()
knn_clf2_scores = cross_val_score(knn_clf2, X_train_selected, y_train, cv=5, scoring='accuracy')
knn_clf2_scores.mean()

0.6891657774151027

The performance did not seem to improve that much with a smaller set of features, but I think it was worth trying. I also reran the above code several times and in some cases, the accuracy of our Random Forest Classifier was slightly improved with only the 5 most important features, but in general they are roughly the same. The K Neighbors classifier performs even worse for some reasons. Unfortunately, the column Survived was nowhere to be found in the test set, so the above results are only on the train set. 