## Imports

In [97]:
import pandas as pd

## Read Data

In [98]:
# Load the preprocessed CSV into the working frame and show it as the cell output
PREPROCESSED_CSV = "titanic_preprocessed.csv"
data_df = pd.read_csv(PREPROCESSED_CSV)
data_df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0,3,"Braund, Mr. Owen Harris",0,2,1,0,0,0,8
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,5,1,0,5,1,3
2,1,3,"Heikkinen, Miss. Laina",1,3,0,0,0,0,8
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,5,1,0,5,0,3
4,0,3,"Allen, Mr. William Henry",0,5,0,0,1,0,8
...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",0,3,0,0,2,0,8
887,1,1,"Graham, Miss. Margaret Edith",1,2,0,0,4,0,2
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,4,1,2,3,0,8
889,1,1,"Behr, Mr. Karl Howell",0,3,0,0,4,1,3


## Feature Engineering

#### Extract `Title` from the `Name` Feature

Names are mostly unique to each passenger and do not provide any extra information on their own. However, the names contain titles such as Mr/Miss/Mrs/Master/Lady/Capt/Col etc., which can provide extra information.

In [99]:
# Numeric code for each title group; 0 is used below for any title that is
# not a key of this mapping.
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# Extract the title: the run of letters that follows a space and ends with a
# period. The pattern is a raw string so that `\.` reaches the regex engine as
# an escaped dot instead of being parsed as an invalid Python string escape.
data_df['Title'] = data_df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Fold the uncommon titles into a single "Rare" group
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
data_df['Title'] = data_df['Title'].replace(rare_titles, 'Rare')
# Map spelling variants onto the common title they stand for
data_df['Title'] = data_df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Convert titles into numbers; anything still unmapped (or missing) becomes 0.
# The final cast keeps the column integer-typed even when NaNs had to be
# filled (map() yields floats as soon as one value is unmapped).
data_df['Title'] = data_df['Title'].map(titles).fillna(0).astype(int)

# Name is no longer needed once the title has been extracted
data_df = data_df.drop(['Name'], axis=1)
data_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title
0,0,3,0,2,1,0,0,0,8,1
1,1,1,1,5,1,0,5,1,3,3
2,1,3,1,3,0,0,0,0,8,2
3,1,1,1,5,1,0,5,0,3,3
4,0,3,0,5,0,0,1,0,8,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,3,0,0,2,0,8,5
887,1,1,1,2,0,0,4,0,2,2
888,0,3,1,4,1,2,3,0,8,2
889,1,1,0,3,0,0,4,1,3,1


#### Crossing Feature `Age` and `Pclass`

In [100]:
# Interaction feature: element-wise product of the Age and Pclass columns
data_df['Age_Class'] = data_df['Age'].mul(data_df['Pclass'])
data_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Age_Class
0,0,3,0,2,1,0,0,0,8,1,6
1,1,1,1,5,1,0,5,1,3,3,5
2,1,3,1,3,0,0,0,0,8,2,9
3,1,1,1,5,1,0,5,0,3,3,5
4,0,3,0,5,0,0,1,0,8,1,15
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,3,0,0,2,0,8,5,6
887,1,1,1,2,0,0,4,0,2,2,2
888,0,3,1,4,1,2,3,0,8,2,12
889,1,1,0,3,0,0,4,1,3,1,3


#### Combining Feature `SibSp` and `Parch`

In [101]:

# Combined count: sum of the SibSp and Parch columns for each passenger
data_df['Relatives'] = data_df['SibSp'] + data_df['Parch']

# NOTE(review): despite its name, Not_alone is 1 when Relatives == 0 (nobody
# travelling with the passenger) and 0 when Relatives > 0. The encoding is
# kept exactly as it was so the saved dataset does not change; consider
# renaming the column or flipping the flag together with its consumers.
# Built with one vectorized comparison instead of two masked assignments, so
# no intermediate NaN/float column is created before the integer cast.
data_df['Not_alone'] = (data_df['Relatives'] == 0).astype(int)

# SibSp and Parch are left in the frame; whether to drop them
# (data_df.drop(['SibSp', 'Parch'], axis=1)) is still an open question.

# Check the new ones
print(data_df['Not_alone'].value_counts())
data_df

1    537
0    354
Name: Not_alone, dtype: int64


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Age_Class,Relatives,Not_alone
0,0,3,0,2,1,0,0,0,8,1,6,1,0
1,1,1,1,5,1,0,5,1,3,3,5,1,0
2,1,3,1,3,0,0,0,0,8,2,9,0,1
3,1,1,1,5,1,0,5,0,3,3,5,1,0
4,0,3,0,5,0,0,1,0,8,1,15,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,3,0,0,2,0,8,5,6,0,1
887,1,1,1,2,0,0,4,0,2,2,2,0,1
888,0,3,1,4,1,2,3,0,8,2,12,3,0
889,1,1,0,3,0,0,4,1,3,1,3,0,1


#### Crossing Features `Relatives` and `Fare` (Fare per Person)

In [102]:
# Split Fare across the passenger plus their relatives (Relatives + 1),
# then truncate the quotient to an integer
party_size = data_df['Relatives'].add(1)
fare_share = data_df['Fare'].div(party_size)
data_df['Fare_Per_Person'] = fare_share.astype(int)
data_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Age_Class,Relatives,Not_alone,Fare_Per_Person
0,0,3,0,2,1,0,0,0,8,1,6,1,0,0
1,1,1,1,5,1,0,5,1,3,3,5,1,0,2
2,1,3,1,3,0,0,0,0,8,2,9,0,1,0
3,1,1,1,5,1,0,5,0,3,3,5,1,0,2
4,0,3,0,5,0,0,1,0,8,1,15,0,1,1
5,0,3,0,3,0,0,1,2,8,1,9,0,1,1
6,0,1,0,6,0,0,4,0,5,1,6,0,1,4
7,0,3,0,0,3,1,3,0,8,4,0,4,0,0
8,1,3,1,3,0,2,2,0,8,3,9,2,0,0
9,1,2,1,1,1,0,4,1,8,3,2,1,0,2


## Summarize and Save Engineered Dataset

#### Look at that nice dataset :)

In [104]:
# Show the first and last ten rows, the whole frame, and summary statistics
for view in (data_df.head(10), data_df.tail(10), data_df, data_df.describe()):
    display(view)

# Missing-value report: absolute count and percentage of nulls per column,
# each sorted in descending order before being joined side by side
null_mask = data_df.isnull()
null_counts = null_mask.sum().sort_values(ascending=False)
null_pct = (null_mask.sum() / null_mask.count() * 100).round(1).sort_values(ascending=False)
missing_data = pd.concat([null_counts, null_pct], axis=1, keys=['Total', '%'])
display(missing_data)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Age_Class,Relatives,Not_alone,Fare_Per_Person
0,0,3,0,2,1,0,0,0,8,1,6,1,0,0
1,1,1,1,5,1,0,5,1,3,3,5,1,0,2
2,1,3,1,3,0,0,0,0,8,2,9,0,1,0
3,1,1,1,5,1,0,5,0,3,3,5,1,0,2
4,0,3,0,5,0,0,1,0,8,1,15,0,1,1
5,0,3,0,3,0,0,1,2,8,1,9,0,1,1
6,0,1,0,6,0,0,4,0,5,1,6,0,1,4
7,0,3,0,0,3,1,3,0,8,4,0,4,0,0
8,1,3,1,3,0,2,2,0,8,3,9,2,0,0
9,1,2,1,1,1,0,4,1,8,3,2,1,0,2


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Age_Class,Relatives,Not_alone,Fare_Per_Person
881,0,3,0,4,0,0,0,0,8,1,12,0,1,0
882,0,3,1,2,0,0,2,0,8,2,6,0,1,2
883,0,2,0,4,0,0,2,0,8,1,8,0,1,2
884,0,3,0,3,0,0,0,0,8,1,9,0,1,0
885,0,3,1,5,0,5,4,2,8,3,15,5,0,0
886,0,2,0,3,0,0,2,0,8,5,6,0,1,2
887,1,1,1,2,0,0,4,0,2,2,2,0,1,4
888,0,3,1,4,1,2,3,0,8,2,12,3,0,0
889,1,1,0,3,0,0,4,1,3,1,3,0,1,4
890,0,3,0,4,0,0,0,2,8,1,12,0,1,0


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Age_Class,Relatives,Not_alone,Fare_Per_Person
0,0,3,0,2,1,0,0,0,8,1,6,1,0,0
1,1,1,1,5,1,0,5,1,3,3,5,1,0,2
2,1,3,1,3,0,0,0,0,8,2,9,0,1,0
3,1,1,1,5,1,0,5,0,3,3,5,1,0,2
4,0,3,0,5,0,0,1,0,8,1,15,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,3,0,0,2,0,8,5,6,0,1,2
887,1,1,1,2,0,0,4,0,2,2,2,0,1,4
888,0,3,1,4,1,2,3,0,8,2,12,3,0,0
889,1,1,0,3,0,0,4,1,3,1,3,0,1,4


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,Age_Class,Relatives,Not_alone,Fare_Per_Person
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,3.503928,0.523008,0.381594,2.333333,0.361392,6.936027,1.728395,7.650954,0.904602,0.602694,1.295174
std,0.486592,0.836071,0.47799,1.87195,1.102743,0.806057,1.806978,0.635673,2.074282,1.030039,4.873737,1.613459,0.489615,1.385871
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,1.0,4.0,0.0,0.0,0.0
50%,0.0,3.0,0.0,4.0,0.0,0.0,2.0,0.0,8.0,1.0,6.0,0.0,1.0,1.0
75%,1.0,3.0,1.0,5.0,1.0,0.0,4.0,1.0,8.0,2.0,12.0,1.0,1.0,2.0
max,1.0,3.0,1.0,6.0,8.0,6.0,5.0,2.0,8.0,5.0,18.0,10.0,1.0,5.0


Unnamed: 0,Total,%
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
Age,0,0.0
SibSp,0,0.0
Parch,0,0.0
Fare,0,0.0
Embarked,0,0.0
Deck,0,0.0
Title,0,0.0


#### Save the feature engineered dataset

In [105]:
# Write the engineered frame to disk without the row index
# (index_label is left at its default of None)
data_df.to_csv('titanic_engineered.csv', index=False)