# Preprocessing - Predicting Survival on the Titanic

In [None]:
import re

import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Global plot styling: seaborn theme defaults, then a white-grid background
# and a custom five-colour palette used by every figure below.
sns.set_theme()
sns.set_style('whitegrid')
sns.set_palette(['#FF5F5D', '#3F7C85', '#00CCBF', '#72F2EB', '#747E7E'])

# import warnings
# warnings.filterwarnings('ignore')

In [None]:
# Read the Titanic train and test CSVs from the local datasets folder.
train_csv, test_csv = 'datasets/titanic/train.csv', 'datasets/titanic/test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [None]:
# Column names, dtypes and non-null counts of the raw training frame.
df_train.info()

## 1. Feature Engineering

### 1.1 What's in a name?

In [None]:
# Peek at the raw name strings to see how titles are embedded in them.
df_train['Name'].head(10)

In [None]:
# Extract the honorific from each passenger name: the capitalised word that
# follows a space and is terminated by a period.
# The pattern is a raw string so that `\.` is a regex escape rather than an
# invalid Python string escape, and it includes the leading space so that the
# training set uses exactly the same pattern as the test set.
# (`re` is imported in the notebook's top import cell.)
title_pattern = re.compile(r' ([A-Z][a-z]+)\.')
df_train['Title'] = df_train['Name'].apply(lambda name: title_pattern.search(name).group(1))

In [None]:
# Spot-check the extracted titles against the original names.
df_train[['Name', 'Title']].head(10)

In [None]:
# Frequency of each extracted title before any grouping.
df_train['Title'].value_counts()

In France, one traditionally calls a young, unmarried woman _Mademoiselle_ – **Mlle** for short – and an older, married woman _Madame_, whose abbreviation is **Mme**. We therefore map **Mlle** to **Miss** and **Mme** to **Mrs**.

In [None]:
# Fold the French honorifics into their English counterparts, then collapse
# every title outside the four common ones into a single 'Rare Title' bucket.
df_train['Title'] = df_train['Title'].replace({'Mlle': 'Miss', 'Mme': 'Mrs'})
common_titles = ['Mr', 'Mrs', 'Miss', 'Master']
is_rare_title = ~df_train['Title'].isin(common_titles)
df_train.loc[is_rare_title, 'Title'] = 'Rare Title'

In [None]:
# Title frequencies after merging Mlle/Mme and bucketing the rare titles.
df_train['Title'].value_counts()

In [None]:
# Perform the same title extraction and grouping on the test dataset.
# The regex is written as a raw string so that `\.` reaches the regex engine
# as an escaped period instead of being an invalid Python string escape.
df_test['Title'] = df_test['Name'].apply(lambda x: re.search(r' ([A-Z][a-z]+)\.', x).group(1))
# Map the French honorifics to their English equivalents.
df_test['Title'] = df_test['Title'].replace('Mlle', 'Miss')
df_test['Title'] = df_test['Title'].replace('Mme', 'Mrs')
# Everything outside the four common titles becomes 'Rare Title'.
df_test.loc[(~df_test['Title'].isin(['Mr', 'Mrs', 'Miss', 'Master'])), 'Title'] = 'Rare Title'

In [None]:
# Distinct titles left in the test set after grouping.
df_test['Title'].unique()

### 1.2 Travelling Alone vs. Travelling in groups

#### 1.2.1 Family Size

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself.
df_train['Fsize'] = df_train['SibSp'].add(df_train['Parch']).add(1)

In [None]:
# Passenger counts per family size, split by survival outcome.
sns.countplot(data=df_train, x='Fsize', hue='Survived');

In [None]:
# t_ = df_train.groupby('Fsize')['Survived'].value_counts(normalize=True)
# t_

In [None]:
# Share of survivors / non-survivors within each family size.
temp = (
    df_train.groupby('Fsize')['Survived']
    .value_counts(normalize=True)
    .reset_index(name='Perc')
)
sns.barplot(data=temp, x='Fsize', y='Perc', hue='Survived');

#### 1.2.2 Apart from families, there could be groups of friends travelling together.

In [None]:
# Number of distinct ticket values (a missing value, if any, counts once).
df_train['Ticket'].nunique(dropna=False)

In [None]:
# Count how many passengers share each ticket and attach that count to every
# row as `Tsize` by joining on the ticket value.
temp = (
    df_train['Ticket']
    .value_counts()
    .reset_index(name='Tsize')
)
df_train = df_train.merge(temp, on='Ticket', how='inner')

In [None]:
# Inspect the first 20 rows now that `Tsize` has been merged in.
df_train.head(20)

In [None]:
# Passenger counts per ticket-group size, split by survival outcome.
sns.countplot(data=df_train, x='Tsize', hue='Survived');

In [None]:
# Share of survivors / non-survivors within each ticket-group size.
temp = (
    df_train.groupby('Tsize')['Survived']
    .value_counts(normalize=True)
    .reset_index(name='Perc')
)
sns.barplot(data=temp, x='Tsize', y='Perc', hue='Survived');

#### 1.2.3 Consolidating Family and Friends

In [None]:
# Party size: whichever is larger, the ticket-group size or the family size.
party_columns = ['Tsize', 'Fsize']
df_train['Group'] = df_train[party_columns].max(axis='columns')
sns.countplot(data=df_train, x='Group', hue='Survived');

In [None]:
# Bucket the party size into four labels. Rows matching none of the
# conditions keep the empty-string placeholder.
df_train['GrpSize'] = ''
party_size = df_train['Group']
df_train.loc[party_size == 1, 'GrpSize'] = 'solo'
df_train.loc[party_size == 2, 'GrpSize'] = 'couple'
df_train.loc[party_size.between(3, 4), 'GrpSize'] = 'group'  # inclusive on both ends
df_train.loc[party_size > 4, 'GrpSize'] = 'large group'
df_train.head()

In [None]:
# Passenger counts per party-size bucket, split by survival outcome.
# The trailing semicolon suppresses the Axes repr, as in the other plot cells.
sns.countplot(data=df_train, x='GrpSize', hue='Survived', order=['solo', 'couple', 'group', 'large group']);

In [None]:
# Applying same transformation to test dataset

# Family size, ticket-group size, and the larger of the two as party size.
df_test['Fsize'] = df_test['SibSp'].add(df_test['Parch']).add(1)
temp = df_test['Ticket'].value_counts().reset_index(name='Tsize')
# NOTE(review): PassengerId is dropped further down, so test rows are only
# identifiable by position afterwards; the row order produced by this merge
# may differ between pandas versions — confirm before relying on it.
df_test = df_test.merge(temp, on='Ticket', how='inner')
df_test['Group'] = df_test[['Tsize', 'Fsize']].max(axis='columns')

# Same four buckets as the training set; unmatched rows keep ''.
df_test['GrpSize'] = ''
test_party_size = df_test['Group']
df_test.loc[test_party_size == 1, 'GrpSize'] = 'solo'
df_test.loc[test_party_size == 2, 'GrpSize'] = 'couple'
df_test.loc[test_party_size.between(3, 4), 'GrpSize'] = 'group'
df_test.loc[test_party_size > 4, 'GrpSize'] = 'large group'
df_test.head()

### 1.3 Effect of Fare

In [None]:
# Number of missing fares in the training set.
df_train['Fare'].isnull().sum()

In [None]:
# Kernel density estimate of the fare distribution.
# The trailing semicolon suppresses the FacetGrid repr, as in the other plot cells.
sns.displot(data=df_train, x='Fare', kind="kde");

In [None]:
# How many training rows have a negative fare?
int((df_train['Fare'] < 0).sum())

In [None]:
# How many training rows have a fare of exactly zero?
int((df_train['Fare'] == 0).sum())

In [None]:
# Replace zero fares with the median fare of the passenger's class.
# The median is taken over every fare currently in that class, including the
# zero entries being replaced.
for pclass in (1, 2, 3):
    in_class = df_train['Pclass'] == pclass
    class_median_fare = df_train.loc[in_class, 'Fare'].median()
    df_train.loc[(df_train['Fare'] == 0) & in_class, 'Fare'] = class_median_fare

In [None]:
# Summary statistics of Fare after the zero-fare replacement.
df_train['Fare'].describe()

In [None]:
# Band the fare into six right-closed intervals. np.select takes the first
# condition that holds, so the cumulative `<=` tests yield the bands
# (..10], (10..25], (25..40], (40..70], (70..100], (100..).
# A missing fare satisfies no condition and gets the '' default.
train_fare = df_train['Fare']
df_train['FareCat'] = np.select(
    [
        train_fare <= 10,
        train_fare <= 25,
        train_fare <= 40,
        train_fare <= 70,
        train_fare <= 100,
        train_fare > 100,
    ],
    ['0-10', '10-25', '25-40', '40-70', '70-100', '100+'],
    default='',
)

In [None]:
# Spot-check the fare bands against the underlying fares.
df_train[['Fare', 'FareCat']].head(10)

In [None]:
# Passenger counts per fare band (cheapest to most expensive), by outcome.
fare_band_order = ['0-10', '10-25', '25-40', '40-70', '70-100', '100+']
sns.countplot(data=df_train, x='FareCat', hue='Survived', order=fare_band_order);

In [None]:
# Survivor / non-survivor counts per fare band, drawn as lines.
# The bands are plain strings, so groupby returns them in lexicographic order
# ('100+' would land between '10-25' and '25-40'). Converting to an ordered
# categorical and sorting puts the x-axis in ascending fare order; sort=False
# stops lineplot from re-sorting the rows.
fare_band_order = ['0-10', '10-25', '25-40', '40-70', '70-100', '100+']
temp = df_train.groupby(['FareCat', 'Survived'])['Survived'].count().reset_index(name='count')
temp['FareCat'] = pd.Categorical(temp['FareCat'], categories=fare_band_order, ordered=True)
temp = temp.sort_values(['FareCat', 'Survived'])
sns.lineplot(data=temp, x='FareCat', y='count', hue='Survived', sort=False);

In [None]:
# Applying same changes on test data set
# Zero fares are replaced with the per-class *median* so that the test set is
# treated exactly like the training set.
df_test.loc[(df_test['Fare'] == 0) & (df_test['Pclass'] == 1), 'Fare'] = df_test[df_test['Pclass'] == 1]['Fare'].median()
df_test.loc[(df_test['Fare'] == 0) & (df_test['Pclass'] == 2), 'Fare'] = df_test[df_test['Pclass'] == 2]['Fare'].median()
df_test.loc[(df_test['Fare'] == 0) & (df_test['Pclass'] == 3), 'Fare'] = df_test[df_test['Pclass'] == 3]['Fare'].median()
# Same six fare bands as the training set. A row whose Fare is missing matches
# none of the comparisons and keeps the '' placeholder.
df_test['FareCat'] = ''
df_test.loc[df_test['Fare']<=10, 'FareCat'] = '0-10'
df_test.loc[(df_test['Fare']>10) & (df_test['Fare']<=25), 'FareCat'] = '10-25'
df_test.loc[(df_test['Fare']>25) & (df_test['Fare']<=40), 'FareCat'] = '25-40'
df_test.loc[(df_test['Fare']>40) & (df_test['Fare']<=70), 'FareCat'] = '40-70'
df_test.loc[(df_test['Fare']>70) & (df_test['Fare']<=100), 'FareCat'] = '70-100'
df_test.loc[df_test['Fare']>100, 'FareCat'] = '100+'

In [None]:
# df_test[['Fare', 'FareCat']].head(10)

## 2 Missing Values

In [None]:
# Missing-value count for every column of the training set.
df_train.isnull().sum()

In [None]:
# Visual map of where values are missing (one row per passenger).
# The trailing semicolon suppresses the Axes repr, as in the other plot cells.
sns.heatmap(df_train.isnull(), cbar=False);

### 2.1 Embarked Variable

In [None]:
# Frequency of each embarkation port (missing values are not counted).
df_train['Embarked'].value_counts()

In [None]:
# Fill missing embarkation ports with 'S'
# (presumably the most frequent port in the counts above — confirm).
df_train['Embarked'] = df_train['Embarked'].fillna('S')

### 2.2 Age

In [None]:
# Overlay the age densities of non-survivors and survivors.
for outcome, outcome_label in ((0, "Not survived"), (1, "Survived")):
    ages_for_outcome = df_train.loc[df_train['Survived'] == outcome, 'Age']
    sns.kdeplot(ages_for_outcome, fill=True, label=outcome_label);

In [None]:
# Relative share of each outcome across the age range (stacked to 100%).
sns.kdeplot(data=df_train, x="Age", hue="Survived", multiple="fill");

In [None]:
# Current state of the training frame before imputing Age.
df_train.head()

In [None]:
# Age distribution for every Title (columns) x Pclass (rows) combination.
sns.catplot(data=df_train, kind='box', x='Age', col='Title', row='Pclass');

In [None]:
# Fill each missing age with the median age of passengers sharing the same
# Title and Pclass. A group with no known age yields NaN and stays missing.
title_class_median_age = df_train.groupby(['Title', 'Pclass'])['Age'].transform('median')
df_train['Age'] = df_train['Age'].fillna(title_class_median_age)

In [None]:
# Ages still missing after the Title/Pclass median imputation.
df_train["Age"].isnull().sum()

In [None]:
# Band the age into five right-closed 16-year intervals. np.select takes the
# first condition that holds, so the cumulative `<=` tests give
# (..16], (16..32], (32..48], (48..64], (64..). A missing age gets ''.
train_age = df_train['Age']
df_train['AgeCat'] = np.select(
    [train_age <= 16, train_age <= 32, train_age <= 48, train_age <= 64, train_age > 64],
    ['0-16', '16-32', '32-48', '48-64', '64+'],
    default='',
)

### 2.3 Cabin

In [None]:
# Remove the Cabin column from the training set.
df_train = df_train.drop(columns='Cabin')

In [None]:
# Remaining missing values per column after dropping Cabin.
df_train.isnull().sum()

In [None]:
# Drop Cabin from the test data as well.
df_test = df_test.drop(columns='Cabin')

### 2.4 Filling missing values in the test data

In [None]:
# Missing-value count for every column of the test set.
df_test.isnull().sum()

In [None]:
# Fill each missing test age with the median age of test passengers sharing
# the same Title and Pclass. A group with no known age stays missing.
test_title_class_median_age = df_test.groupby(['Title', 'Pclass'])['Age'].transform('median')
df_test['Age'] = df_test['Age'].fillna(test_title_class_median_age)

In [None]:
# Fallback for ages that are still missing (their Title/Pclass group had no
# known age): use the median age of the passenger's own class, rather than
# assuming every leftover row belongs to third class.
test_class_median_age = df_test.groupby('Pclass')['Age'].transform('median')
df_test['Age'] = df_test['Age'].fillna(test_class_median_age)

In [None]:
# Same five age bands as the training set: (..16], (16..32], (32..48],
# (48..64], (64..). np.select takes the first condition that holds, and a
# missing age gets the '' default.
test_age = df_test['Age']
df_test['AgeCat'] = np.select(
    [test_age <= 16, test_age <= 32, test_age <= 48, test_age <= 64, test_age > 64],
    ['0-16', '16-32', '32-48', '48-64', '64+'],
    default='',
)

In [None]:
# Show the test rows whose Fare is missing.
df_test[df_test["Fare"].isnull()]

In [None]:
# Fill missing fares with the median fare of the passenger's own class,
# rather than assuming every such row belongs to third class.
test_class_median_fare = df_test.groupby('Pclass')['Fare'].transform('median')
df_test['Fare'] = df_test['Fare'].fillna(test_class_median_fare)

In [None]:
# Show the rows that still have no fare band: these are the rows whose Fare
# was missing when the bands were assigned. They are selected by condition
# rather than by a fixed row position, because the row order depends on how
# the earlier merge ordered its output.
df_test.loc[df_test['FareCat'] == '', ['Pclass', 'Fare', 'FareCat']]

In [None]:
# Assign a fare band to every row that still has none (rows whose Fare was
# missing when the bands were first computed). Rows are found by their empty
# FareCat instead of a hard-coded index, and the band is derived from the
# row's Fare instead of being fixed to '0-10'.
# np.select takes the first condition that holds, so the cumulative `<=` tests
# reproduce the right-closed bands used elsewhere in the notebook.
unbanded = df_test['FareCat'] == ''
if unbanded.any():
    unbanded_fare = df_test.loc[unbanded, 'Fare']
    df_test.loc[unbanded, 'FareCat'] = np.select(
        [
            unbanded_fare <= 10,
            unbanded_fare <= 25,
            unbanded_fare <= 40,
            unbanded_fare <= 70,
            unbanded_fare <= 100,
            unbanded_fare > 100,
        ],
        ['0-10', '10-25', '25-40', '40-70', '70-100', '100+'],
        default='',  # a fare that is still missing stays unbanded
    )

In [None]:
# Sanity check: list any test rows that are still without a fare band.
# An empty result means every row has been banded. Selecting by condition
# avoids depending on a fixed row label, which varies with the row order.
df_test[df_test['FareCat'] == '']

## 3. Dropping irrelevant/redundant attributes

In [None]:
# Review the engineered columns before deciding what to drop.
df_train.head()

In [None]:
# Columns removed from both frames before export: identifiers plus the raw
# inputs and intermediate columns that the engineered bands were built from.
drop_features = [
    'PassengerId',
    'Name',
    'Age',
    'SibSp',
    'Parch',
    'Ticket',
    'Fare',
    'Fsize',
    'Tsize',
    'Group',
]

In [None]:
# Remove the listed columns from both the training and the test frame.
df_train = df_train.drop(columns=drop_features)
df_test = df_test.drop(columns=drop_features)

In [None]:
# Final (rows, columns) of the train frame, then the test frame.
for cleaned_frame in (df_train, df_test):
    print(cleaned_frame.shape)

In [None]:
# Write both cleaned frames next to the raw data, without the index column.
exports = (
    (df_train, 'datasets/titanic/df_train_clean.csv'),
    (df_test, 'datasets/titanic/df_test_clean.csv'),
)
for cleaned_frame, out_path in exports:
    cleaned_frame.to_csv(out_path, index=False)

**References**
- https://www.kaggle.com/code/startupsci/titanic-data-science-solutions/notebook
- https://www.kaggle.com/code/surajkumar88/titanic-machine-learning-from-disaster-eda/notebook
- https://www.kaggle.com/code/pythonafroz/titanic-survival-prediction-with-11-algorithm
- https://www.kaggle.com/code/subinium/awesome-visualization-with-titanic-dataset