# Feature Engineering Tasks

From the tasks, it is apparent that the data is being prepared for some ML analysis; the tasks are therefore performed with this objective in mind. However, the dataset is not partitioned into training and testing sets, as that is not a requirement of the task.

## Importing Relevant Libraries and Dataset

In [18]:
# Importing Libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

In [19]:
# Read the Titanic passenger table from the CSV file in the data directory
data = pd.read_csv(filepath_or_buffer="../data/titanic.csv")

# Keep an independent snapshot of the frame as loaded, unaffected by edits made to `data`
copy = data.copy(deep=True)

## Sub-Task 1: Data Cleaning

### Basic Cleaning

In [20]:
# Considering the final use case of the dataset, I will not attach a primary key to the dataset (which is generally the first step otherwise)

In [21]:
# Remove the trailing row, which contains only null values.
# dropna(how="all") selects fully-empty rows by content instead of by position, so
# re-running this cell is a no-op (iloc[:-1] would delete one more real passenger on
# every execution). It also returns a new frame, so later column assignments on `data`
# are not made on a slice of the loaded table.

data = data.dropna(how="all")

In [22]:
# Remove columns that should not be used to explain survival.
# 'boat' and 'body' describe what happened after the accident.
# NOTE(review): 'home.dest' does not look like a post-accident attribute - confirm the
# reason for dropping it together with the other two.

drop_atts = ['boat', 'body', 'home.dest']

# Reassign instead of using inplace=True, and ignore labels that are already gone, so the
# cell can be re-run without raising KeyError and never mutates a slice of another frame.
data = data.drop(columns=drop_atts, errors='ignore')
data.head(10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S
5,1.0,1.0,"Anderson, Mr. Harry",male,48.0,0.0,0.0,19952,26.55,E12,S
6,1.0,1.0,"Andrews, Miss. Kornelia Theodosia",female,63.0,1.0,0.0,13502,77.9583,D7,S
7,1.0,0.0,"Andrews, Mr. Thomas Jr",male,39.0,0.0,0.0,112050,0.0,A36,S
8,1.0,1.0,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2.0,0.0,11769,51.4792,C101,S
9,1.0,0.0,"Artagaveytia, Mr. Ramon",male,71.0,0.0,0.0,PC 17609,49.5042,,C


In [23]:
# Report, column by column, how many entries are missing

missing_per_column = data.isnull().sum()
for column_name, n_missing in missing_per_column.items():
    print('{} column missing values: {}'.format(column_name, n_missing))
print('\n')

pclass column missing values: 0
survived column missing values: 0
name column missing values: 0
sex column missing values: 0
age column missing values: 263
sibsp column missing values: 0
parch column missing values: 0
ticket column missing values: 0
fare column missing values: 1
cabin column missing values: 1014
embarked column missing values: 2




### Age

In [24]:
# Rank the absolute pairwise correlations of the numerical features and show the rows for age

numeric_cols = ['pclass', 'survived', 'sibsp', 'parch', 'fare', 'age']
abs_corr = data[numeric_cols].corr().abs()

# One row per (feature, feature) pair, strongest correlation first
data_corr = (
    abs_corr.unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
    .rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'})
)
data_corr.loc[data_corr['Feature 1'] == 'age']

Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
5,age,age,1.0
8,age,pclass,0.408106
17,age,sibsp,0.243699
21,age,fare,0.178739
24,age,parch,0.150917
30,age,survived,0.055513


In [25]:
# Impute missing ages with the median age of the passengers sharing the same sex and pclass.
# pclass is the feature most correlated with age; sex is used as a secondary grouping feature.
# The median is used instead of the mean because age has a right skew.

age_groups = data.groupby(['sex', 'pclass'])['age'].median()

# Printing the per-group medians confirms that using sex as a second grouping feature has impact
for pclass in range(1, 4):
    for sex in ['female', 'male']:
        print('Median age of pclass {} {}s: {}'.format(pclass, sex, age_groups.loc[(sex, pclass)]))
print('Median age of all passengers: {}'.format(data['age'].median()))

# transform('median') broadcasts every group's median back onto the rows of that group, so
# the gaps are filled in one vectorised step - no row-wise apply, lookup dict or temporary
# 'age_fill' column is needed. Rows whose age is already known are left untouched by fillna.
group_median_age = data.groupby(['sex', 'pclass'])['age'].transform('median')
data['age'] = data['age'].fillna(group_median_age)

# Confirm that there are no more missing values in the 'age' column
print("\nMissing ages: ", data['age'].isnull().sum())

Median age of pclass 1 females: 36.0
Median age of pclass 1 males: 42.0
Median age of pclass 2 females: 28.0
Median age of pclass 2 males: 29.5
Median age of pclass 3 females: 22.0
Median age of pclass 3 males: 25.0
Median age of all passengers: 28.0

Missing ages:  0


### Embarked

In [26]:
# Show the passengers whose port of embarkation is missing

embarked_missing = data['embarked'].isna()
data.loc[embarked_missing]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
168,1.0,1.0,"Icard, Miss. Amelie",female,38.0,0.0,0.0,113572,80.0,B28,
284,1.0,1.0,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0.0,0.0,113572,80.0,B28,


In [27]:
# The missing ports are filled with the actual values found by research (which coincidentally
# are also the most common value).

# No other column gives a logical basis for predicting the port of embarkation, as the port
# does not depend on any of them. Because this is Titanic data, the missing values can be
# looked up: a quick search shows that both passengers embarked at Southampton.

# Replace the missing 'embarked' entries with 'S'
embarked_filled = data['embarked'].fillna('S')
data['embarked'] = embarked_filled

# Verify that the 'embarked' column has no missing values left
n_missing_embarked = data['embarked'].isnull().sum()
print("\nMissing embarked: ", n_missing_embarked)


Missing embarked:  0


### Fare

In [28]:
# Rank the absolute pairwise correlations of the numerical features and show the rows for fare

corr_pairs = data[['pclass', 'survived', 'sibsp', 'parch', 'fare', 'age']].corr().abs().unstack()
corr_sorted = corr_pairs.sort_values(kind="quicksort", ascending=False)

# Flatten to one row per feature pair and give the columns readable names
data_corr = corr_sorted.reset_index().rename(
    columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}
)
data_corr.loc[data_corr['Feature 1'].eq('fare')]

Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
2,fare,fare,1.0
7,fare,pclass,0.558629
15,fare,survived,0.244265
17,fare,parch,0.221539
20,fare,age,0.200038
22,fare,sibsp,0.160238


In [29]:
# Apart from pclass (as expected), fare shows no clear correlation with any other feature, yet
# passengers of the same class paid different fares. This may be due to different cabins and a
# different number of people per cabin. The missing values will therefore be filled with the
# median fare of the pclass, using the accompanying family as a second grouping feature.

fare_missing = data['fare'].isna()
data.loc[fare_missing]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
1225,3.0,0.0,"Storey, Mr. Thomas",male,60.5,0.0,0.0,3701,,,S


In [30]:
# Impute missing fares with the median fare of the passengers sharing the same pclass, parch
# and sibsp. pclass is the feature most correlated with fare; parch and sibsp are used as
# secondary grouping features. The median is used instead of the mean because fare has a
# right skew.

# Each row is matched to the median of its own group instead of hard-coding the
# (pclass=3, parch=0, sibsp=0) group, so the fill stays correct if a passenger of another
# class or family size ever lacks a fare. For a 3rd class passenger travelling alone the
# value is the same as that group's median.
# NOTE: a group in which every fare is missing has no median and would stay unfilled; the
# check below makes that visible.
group_median_fare = data.groupby(['pclass', 'parch', 'sibsp'])['fare'].transform('median')
data['fare'] = data['fare'].fillna(group_median_fare)

# Confirm that there are no more missing values in the 'fare' column
print("\nMissing fares: ", data['fare'].isnull().sum())


Missing fares:  0


### Cabin

In [31]:
# Reasoning:

# Such a column having this many missing values (> 70%) is usually dropped. Either way, I saw the following options:
# 1. Drop the column
# 2. Turn the column into a binary column (1 if cabin is present, 0 if not)
# 3. Fill some values according to people having the same ticket number, and the rest with N/A (or something similar)
# 4. Research the titanic for hints about the cabin numbers and how to fill them

# According to EDA, the column is important. So dropping it is not an option. 
# The fourth option is not feasible in real life problems, it is only possible in this case because the dataset is about the Titanic (which is a well researched topic). 
# The third option fills only about 1/3rd of the values; categorizing the rest as the same category creates a highly skewed distribution.
# The second option is the best option in this case, as it fills all the values and does not create a highly skewed distribution, ensuring we do not lose an important feature while retaining its usability.

# Since the selected option falls under the category of feature extraction, for now we will just fill the missing values with a new category NA.

In [32]:
# Replace missing cabin entries with the placeholder category 'NA'
# NOTE(review): pandas.read_csv treats the string 'NA' as missing by default, so this
# placeholder would turn back into NaN if the frame is written to CSV and read again
# with default settings.
cabin_filled = data['cabin'].fillna('NA')
data['cabin'] = cabin_filled

# Verify that the 'cabin' column has no missing values left
n_missing_cabin = data['cabin'].isnull().sum()
print("\nMissing cabin: ", n_missing_cabin)


Missing cabin:  0


### Final Confirmation

In [33]:
# Total count of missing entries across every column of the dataset
remaining_missing = data.isnull().sum().sum()
print("\nRemaining missing values in complete dataset: ", remaining_missing)


Remaining missing values in complete dataset:  0


## Sub-Task 2: Feature Extraction

### Title

In [None]:
# As hinted in the task description too



### Family Size

In [35]:
# Family size = siblings/spouses + parents/children + 1 (the passenger themselves).
# The two source columns are removed once they have been combined.
family_size = data['sibsp'] + data['parch'] + 1
data = data.assign(family_size=family_size).drop(columns=['sibsp', 'parch'])

data.head(10)

Unnamed: 0,pclass,survived,name,sex,age,ticket,fare,cabin,embarked,family_size
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,24160,211.3375,B5,S,1.0
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,113781,151.55,C22 C26,S,4.0
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,113781,151.55,C22 C26,S,4.0
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,113781,151.55,C22 C26,S,4.0
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,113781,151.55,C22 C26,S,4.0
5,1.0,1.0,"Anderson, Mr. Harry",male,48.0,19952,26.55,E12,S,1.0
6,1.0,1.0,"Andrews, Miss. Kornelia Theodosia",female,63.0,13502,77.9583,D7,S,2.0
7,1.0,0.0,"Andrews, Mr. Thomas Jr",male,39.0,112050,0.0,A36,S,1.0
8,1.0,1.0,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,11769,51.4792,C101,S,3.0
9,1.0,0.0,"Artagaveytia, Mr. Ramon",male,71.0,PC 17609,49.5042,,C,1.0


### Has Cabin Number

In [None]:
# Extension of cleaning of the cabin column

## Sub-Task 3: Age Grouping

## Sub-Task 4: Fare Binning

## Sub-Task 5: Encoding Categorical Variables

## Sub-Task 6: Correlation Analysis