In [23]:
# Blanket filter: every warning category is suppressed for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

# None lifts the limit on the number of columns shown when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [24]:
# Data exploration: Variable identification, feature exploration, missing values treatment.

In [25]:
# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('Data/train.csv')

In [26]:
# First look at the data: (rows, columns) of the frame, then its first rows.
print(df.shape)

display(df.head())

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Empty values

In [27]:
# Display the percentage of missing values per column, restricted to columns
# that actually have some.
# isna() yields booleans, so the column-wise mean is the missing fraction;
# this replaces the manual count / row-total lambda with the vectorized built-in.
percentage_missing_values = df.isna().mean() * 100
display(percentage_missing_values[percentage_missing_values > 0])

Age         19.865320
Cabin       77.104377
Embarked     0.224467
dtype: float64

In [28]:
# Age seems usable -> fill with the mean, or with a more advanced imputation strategy.
# Cabin -> percentage of missing values is too high -> not usable.
# Embarked -> categorical data, fill with the mode (starting harbor of the Titanic).

In [29]:
# Feature exploration

In [30]:
# Name column

# Feature engineering: The name column seems to contain a "Rank/Grade" component (Mr, Miss...).

In [31]:
# Extract the title from each name: keep the text before the first '.', take
# its last space-separated word, then count how often each title occurs.
df['Name'].map(
    lambda full_name: full_name.partition('.')[0].rsplit(' ', 1)[-1]
).value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: Name, dtype: int64

In [32]:
# A few titles appear very often, while the remaining ones are rare or unique.
# To turn this into a valid feature: define a fixed set of categories and group the rare values (an 'Other' category, for example).

In [33]:
# Categories accepted for the rank feature, including the catch-all 'Other'.
accepted_ranks = [
    'Mr',
    'Mrs',
    'Miss',
    'Master',
    'Other',
]
# Alternative spellings that are rewritten to one of the accepted categories.
rank_transform = dict(Mlle='Miss', Ms='Mrs', Mme='Mrs')

In [34]:
# Build the 'Rank' feature: extract the title from each name, rewrite spelling
# variants through rank_transform, restrict the values to accepted_ranks, and
# map everything outside that set (NaN after categorisation) to 'Other'.
df['Rank'] = pd.Categorical(
    df['Name']
    .map(lambda full_name: full_name.partition('.')[0].rsplit(' ', 1)[-1])
    .replace(rank_transform),
    categories=accepted_ranks,
).fillna('Other')

In [35]:
# Count the passengers in each Rank category to check the grouping.
df['Rank'].value_counts()

Mr        517
Miss      184
Mrs       127
Master     40
Other      23
Name: Rank, dtype: int64

In [36]:
# Ticket column: check how many distinct values it holds and look at a few of them.

print(f"Number of unique values: {df['Ticket'].nunique()}")
print(df['Ticket'].head())

# The samples show no clear, consistent structure, so the column seems
# unusable and unlikely to add value.

Number of unique values: 681
0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object


In [37]:
# Preprocessing for ML:

In [39]:
# Drop the columns that will not be used as features, then impute missing values.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone, and a plain drop would raise KeyError.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

# Basic data imputation: Age from the column mean (NaN values are skipped when
# computing it), Embarked from its most frequent value.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [40]:
# Preview the first rows of the frame after dropping columns and imputing.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Rank
0,1,0,3,male,22.0,1,0,7.25,S,Mr
1,2,1,1,female,38.0,1,0,71.2833,C,Mrs
2,3,1,3,female,26.0,0,0,7.925,S,Miss
3,4,1,1,female,35.0,1,0,53.1,S,Mrs
4,5,0,3,male,35.0,0,0,8.05,S,Mr
