## Imports

In [33]:
import re

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
%matplotlib inline


## Read Data

In [34]:
# Load the passenger records from the CSV file (path is relative to the working directory).
csv_path = "titanic_dataset.csv"
data_df = pd.read_csv(csv_path)
data_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### Drop PassengerID  
This feature does not contain any valuable information

In [35]:
# Remove the PassengerId column from the frame.
data_df = data_df.drop(columns=['PassengerId'])
data_df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### Drop Ticket
This feature will most likely not contain any valuable information, since almost every passenger has a unique ticket.

In [36]:
# Remove the Ticket column from the frame.
data_df = data_df.drop(columns=['Ticket'])
data_df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C148,C


#### Missing Data  
From data exploration we learnt that:
* Feature `Cabin` is missing 687/891 values
* Feature `Embarked` is missing 2/891 values
* Feature `Age` is missing 177/891 values

##### Cabin Feature
The cabin feature has the structure of "C123" where the letter ("C") is the deck and the number ("123") is the cabin number. The letter can be extracted as a new feature and missing values can be its own category of this feature.

In [37]:
# Integer code for each deck letter; "U" is the placeholder letter used for missing cabins.
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}

# Missing cabins become "U0" so that they fall into their own "U" category.
data_df['Cabin'] = data_df['Cabin'].fillna("U0")
# The deck is the first run of letters in the cabin string. str.extract yields NaN
# (rather than raising) for a cabin without any letter; the fillna(0) below covers that case.
data_df['Deck'] = data_df['Cabin'].str.extract(r"([a-zA-Z]+)", expand=False)
# Letters that are not keys of `deck` map to NaN and are coded as 0.
data_df['Deck'] = data_df['Deck'].map(deck).fillna(0).astype(int)
# we can now drop the cabin feature
data_df = data_df.drop(['Cabin'], axis=1)
data_df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,8
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,3
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,8
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,3
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,8
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,8
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,2
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,8
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,3


##### Age Feature
We can fill the missing `Age` values with random integers drawn from the range between (mean − standard deviation) and (mean + standard deviation) of the existing ages.

In [38]:
# Fixed seed so that the imputed ages are identical on every run of the notebook.
AGE_IMPUTE_SEED = 42

mean = data_df["Age"].mean()
std = data_df["Age"].std()
is_null = data_df["Age"].isnull().sum()

# Draw one integer per missing age from [mean - std, mean + std).
# randint is documented for integer bounds, so the float bounds are truncated explicitly.
# A dedicated RandomState keeps the draw reproducible without reseeding NumPy's global generator.
age_rng = np.random.RandomState(AGE_IMPUTE_SEED)
rand_age = age_rng.randint(int(mean - std), int(mean + std), size=is_null)

# fill NaN values in Age column with random values generated
age_slice = data_df["Age"].copy()
age_slice[age_slice.isnull()] = rand_age
data_df["Age"] = age_slice.astype(int)
print('Number of null values in age feature:', data_df["Age"].isnull().sum())
data_df

Number of null values in age feature: 0


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0,3,"Braund, Mr. Owen Harris",male,22,1,0,7.2500,S,8
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,71.2833,C,3
2,1,3,"Heikkinen, Miss. Laina",female,26,0,0,7.9250,S,8
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,53.1000,S,3
4,0,3,"Allen, Mr. William Henry",male,35,0,0,8.0500,S,8
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27,0,0,13.0000,S,8
887,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,30.0000,S,2
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,38,1,2,23.4500,S,8
889,1,1,"Behr, Mr. Karl Howell",male,26,0,0,30.0000,C,3


##### Embarked Feature
Fill the 2 missing values with the most common one

In [39]:
# Most frequent port. The value is read by its 'top' label instead of by position,
# so the lookup does not depend on the ordering of describe()'s result or on
# positional integer indexing of a label-indexed Series.
most_common_val = data_df['Embarked'].describe()['top']
data_df['Embarked'] = data_df['Embarked'].fillna(most_common_val)
print('Number of null values in embarked feature:', data_df["Embarked"].isnull().sum())
data_df

Number of null values in embarked feature: 0


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0,3,"Braund, Mr. Owen Harris",male,22,1,0,7.2500,S,8
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,71.2833,C,3
2,1,3,"Heikkinen, Miss. Laina",female,26,0,0,7.9250,S,8
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,53.1000,S,3
4,0,3,"Allen, Mr. William Henry",male,35,0,0,8.0500,S,8
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27,0,0,13.0000,S,8
887,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,30.0000,S,2
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,38,1,2,23.4500,S,8
889,1,1,"Behr, Mr. Karl Howell",male,26,0,0,30.0000,C,3


## Transforming/Converting Features

#### Convert `Fare` Feature to INT (from float)

In [40]:
# Cast the Fare column to int; every other column is left untouched.
data_df = data_df.astype({'Fare': int})
data_df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0,3,"Braund, Mr. Owen Harris",male,22,1,0,7,S,8
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,71,C,3
2,1,3,"Heikkinen, Miss. Laina",female,26,0,0,7,S,8
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,53,S,3
4,0,3,"Allen, Mr. William Henry",male,35,0,0,8,S,8
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27,0,0,13,S,8
887,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,30,S,2
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,38,1,2,23,S,8
889,1,1,"Behr, Mr. Karl Howell",male,26,0,0,30,C,3


#### Extract Titles from `Name` Feature

Names are mostly unique to each passenger and do not provide any extra information by themselves. However, the names contain titles such as Mr/Miss/Mrs/Master/Lady/Capt/Col etc., which can provide extra information.

In [41]:
# Integer code for each title group; titles outside these groups end up as 0 below.
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# extract titles: the run of letters that follows a space and directly precedes a period
data_df['Title'] = data_df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# replace titles with a more common title or as Rare
data_df['Title'] = data_df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                                             'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
data_df['Title'] = data_df['Title'].replace('Mlle', 'Miss')
data_df['Title'] = data_df['Title'].replace('Ms', 'Miss')
data_df['Title'] = data_df['Title'].replace('Mme', 'Mrs')
# convert titles into numbers
data_df['Title'] = data_df['Title'].map(titles)
# any title that did not map to a code becomes 0; cast to int so the column stays integer-typed
data_df['Title'] = data_df['Title'].fillna(0).astype(int)
# the name itself is no longer needed once the title has been extracted
data_df = data_df.drop(['Name'], axis=1)
data_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title
0,0,3,male,22,1,0,7,S,8,1
1,1,1,female,38,1,0,71,C,3,3
2,1,3,female,26,0,0,7,S,8,2
3,1,1,female,35,1,0,53,S,3,3
4,0,3,male,35,0,0,8,S,8,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27,0,0,13,S,8,5
887,1,1,female,19,0,0,30,S,2,2
888,0,3,female,38,1,2,23,S,8,2
889,1,1,male,26,0,0,30,C,3,1


#### Convert `Sex` Feature to numeric

In [42]:
genders = {"male": 0, "female": 1}
data_df['Sex'] = data_df['Sex'].map(genders)
data_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title
0,0,3,0,22,1,0,7,S,8,1
1,1,1,1,38,1,0,71,C,3,3
2,1,3,1,26,0,0,7,S,8,2
3,1,1,1,35,1,0,53,S,3,3
4,0,3,0,35,0,0,8,S,8,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27,0,0,13,S,8,5
887,1,1,1,19,0,0,30,S,2,2
888,0,3,1,38,1,2,23,S,8,2
889,1,1,0,26,0,0,30,C,3,1


#### Convert `Embarked` Feature to numeric

In [43]:
# Encode the port of embarkation as an integer: S -> 0, C -> 1, Q -> 2.
ports = dict(S=0, C=1, Q=2)
embarked_codes = data_df['Embarked'].map(ports)
data_df = data_df.assign(Embarked=embarked_codes)
data_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title
0,0,3,0,22,1,0,7,0,8,1
1,1,1,1,38,1,0,71,1,3,3
2,1,3,1,26,0,0,7,0,8,2
3,1,1,1,35,1,0,53,0,3,3
4,0,3,0,35,0,0,8,0,8,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27,0,0,13,0,8,5
887,1,1,1,19,0,0,30,0,2,2
888,0,3,1,38,1,2,23,0,8,2
889,1,1,0,26,0,0,30,1,3,1
