In [189]:
# The NameLength, TicketLength, AgeMissing and TicketFirstCharacter features were inspired by
# https://www.kaggle.com/code/zlatankr/titanic-random-forest-82-78

# The FamilySize feature was inspired by
# https://www.kaggle.com/code/vincentlugat/titanic-neural-networks-keras-81-8

In [190]:
# Imports

import pandas as pd
import numpy as np

In [191]:
# Show up to 100 columns whenever a DataFrame is displayed

pd.options.display.max_columns = 100

In [192]:
# Read the training and test sets from the inputs folder

train_path = 'inputs/train.csv'
test_path = 'inputs/test.csv'

train = pd.read_csv(filepath_or_buffer=train_path)
test = pd.read_csv(filepath_or_buffer=test_path)

In [193]:
# Tag every row with the data set it came from, so the combined frame can be
# split back into train and test later

train = train.assign(Label='train')
test = test.assign(Label='test')

# Stack train on top of test. Each frame keeps its own row labels (no
# ignore_index), so the combined index can contain repeated labels.

df = pd.concat(objs=[train, test])

In [194]:
# Build the features matrix: the combined frame without the Survived and
# PassengerId columns

X = df.drop(columns=['Survived', 'PassengerId'])

X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Label
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train


In [195]:
# Feature engineering - Class
# 0/1 indicators for Pclass 1 and Pclass 2

X['FirstClass'] = X['Pclass'].eq(1).astype(int)
X['SecondClass'] = X['Pclass'].eq(2).astype(int)

X[['Pclass', 'FirstClass']].head()

Unnamed: 0,Pclass,FirstClass
0,3,0
1,1,1
2,3,0
3,1,1
4,3,0


In [196]:
# Feature engineering - Title
#
# The title is the first run of letters that is immediately followed by a full
# stop in the name (e.g. "Mr" in "Braund, Mr. Owen Harris"). The pattern is a
# raw string so that "\." reaches the regex engine as an escaped dot instead of
# relying on Python passing an unrecognised string escape through unchanged
# (which emits a warning on recent Python versions).

X['Title'] = X['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Fold alternative spellings into the common titles

X['Title'] = X['Title'].replace({'Mme': 'Mrs',
                                 'Ms': 'Miss',
                                 'Mlle': 'Miss'})

# 0/1 indicators for the four titles used as features; any other title gets
# 0 in all four columns

X['Mr'] = np.where(X['Title'] == 'Mr', 1, 0)
X['Miss'] = np.where(X['Title'] == 'Miss', 1, 0)
X['Mrs'] = np.where(X['Title'] == 'Mrs', 1, 0)
X['Master'] = np.where(X['Title'] == 'Master', 1, 0)

X[['Name', 'Title', 'Mr']].head()

Unnamed: 0,Name,Title,Mr
0,"Braund, Mr. Owen Harris",Mr,1
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs,0
2,"Heikkinen, Miss. Laina",Miss,0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs,0
4,"Allen, Mr. William Henry",Mr,1


In [197]:
# Feature engineering - Name length
# Number of characters in the full Name string

name_lengths = X['Name'].str.len()
X['NameLength'] = name_lengths

X[['Name', 'NameLength']].head()

Unnamed: 0,Name,NameLength
0,"Braund, Mr. Owen Harris",23
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",51
2,"Heikkinen, Miss. Laina",22
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",44
4,"Allen, Mr. William Henry",24


In [198]:
# Feature engineering - Sex
# 1 when Sex is 'female', 0 otherwise

X['Female'] = X['Sex'].eq('female').astype(int)

X[['Sex', 'Female']].head()

Unnamed: 0,Sex,Female
0,male,0
1,female,1
2,female,1
3,female,1
4,male,0


In [199]:
# Feature engineering - Age data missing
# 1 when Age is missing, 0 otherwise

X['AgeMissing'] = X['Age'].isna().astype(int)

X[['Age', 'AgeMissing']].head(7)

Unnamed: 0,Age,AgeMissing
0,22.0,0
1,38.0,0
2,26.0,0
3,35.0,0
4,35.0,0
5,,1
6,54.0,0


In [200]:
# Feature engineering - Child
# 1 when Age is below 18; rows with a missing Age compare as False and get 0

X['Child'] = X['Age'].lt(18).astype(int)

X[['Age', 'Child']].head(9)

Unnamed: 0,Age,Child
0,22.0,0
1,38.0,0
2,26.0,0
3,35.0,0
4,35.0,0
5,,0
6,54.0,0
7,2.0,1
8,27.0,0


In [201]:
# Feature engineering - Family size
# FamilySize is SibSp + Parch plus one for the passenger themselves

X['FamilySize'] = 1 + X['SibSp'] + X['Parch']

# Bucket the size: more than 4 -> 'Big', more than 1 -> 'Small', else 'Alone'
# (conditions are checked in order, so 'Big' takes precedence over 'Small')

family_size = X['FamilySize']
X['FamilySizeCat'] = np.select(
    [family_size > 4, family_size > 1],
    ['Big', 'Small'],
    default='Alone',
)

# 0/1 indicators for the two non-'Alone' buckets

X['FamilySizeSmall'] = X['FamilySizeCat'].eq('Small').astype(int)
X['FamilySizeBig'] = X['FamilySizeCat'].eq('Big').astype(int)

X[['Parch', 'SibSp', 'FamilySize', 'FamilySizeCat', 'FamilySizeSmall']].head(9)

Unnamed: 0,Parch,SibSp,FamilySize,FamilySizeCat,FamilySizeSmall
0,0,1,2,Small,1
1,0,1,2,Small,1
2,0,0,1,Alone,0
3,0,1,2,Small,1
4,0,0,1,Alone,0
5,0,0,1,Alone,0
6,0,0,1,Alone,0
7,1,3,5,Big,0
8,2,0,3,Small,1


In [202]:
# Feature engineering - Ticket number length
# Number of characters in the Ticket string

ticket_lengths = X['Ticket'].str.len()
X['TicketLength'] = ticket_lengths

X[['Ticket', 'TicketLength']].head()

Unnamed: 0,Ticket,TicketLength
0,A/5 21171,9
1,PC 17599,8
2,STON/O2. 3101282,16
3,113803,6
4,373450,6


In [203]:
# Feature engineering - First character of ticket number

X['TicketFirstCharacter'] = X['Ticket'].str.get(0)

# Indicator column -> the first character it flags (insertion order is the
# order in which the columns are added to X)

ticket_flag_columns = {
    'TicketFirstCharacter3': '3',
    'TicketFirstCharacter2': '2',
    'TicketFirstCharacter1': '1',
    'TicketFirstCharacterP': 'P',
    'TicketFirstCharacterS': 'S',
    'TicketFirstCharacterC': 'C',
    'TicketFirstCharacterA': 'A',
    'TicketFirstCharacterW': 'W',
    'TicketFirstCharacter4': '4',
}

for flag_column, first_char in ticket_flag_columns.items():
    X[flag_column] = np.where(X['TicketFirstCharacter'] == first_char, 1, 0)

X[['Ticket', 'TicketFirstCharacter', 'TicketFirstCharacter3']].head()

Unnamed: 0,Ticket,TicketFirstCharacter,TicketFirstCharacter3
0,A/5 21171,A,0
1,PC 17599,P,0
2,STON/O2. 3101282,S,0
3,113803,1,0
4,373450,3,1


In [204]:
# Feature engineering - Cabin letter
#
# Take the first character of the cabin's string form. A missing cabin
# stringifies to 'nan', so its first character 'n' is mapped to ''.

X['CabinCategory'] = (
    X['Cabin']
    .map(lambda cabin: str(cabin)[0])
    .replace({'n': ''})
)

# Indicator column -> the cabin letter it flags

cabin_flag_columns = {
    'CabinA': 'A',
    'CabinB': 'B',
    'CabinC': 'C',
    'CabinD': 'D',
    'CabinE': 'E',
    'CabinF': 'F',
}

for flag_column, letter in cabin_flag_columns.items():
    X[flag_column] = np.where(X['CabinCategory'] == letter, 1, 0)

# 1 when no cabin was recorded

X['CabinMissing'] = X['Cabin'].isna().astype(int)

X[['Cabin', 'CabinCategory', 'CabinC', 'CabinMissing']].head(7)

Unnamed: 0,Cabin,CabinCategory,CabinC,CabinMissing
0,,,0,1
1,C85,C,1,0
2,,,0,1
3,C123,C,1,0
4,,,0,1
5,,,0,1
6,E46,E,0,0


In [205]:
# Feature engineering - Cabin number
#
# Use the last space-separated token of the cabin string, drop its first
# character and parse the rest as a number. Missing cabins stringify to 'nan'
# and are kept as NaN.

last_cabin = X['Cabin'].apply(lambda cabin: str(cabin).split(' ')[-1])
cabin_digits = np.where(last_cabin == 'nan', np.nan, last_cabin.str[1:])
X['CabinNumber'] = pd.to_numeric(cabin_digits)

# Three bands: <= 30, (30, 63], > 63. NaN compares as False everywhere, so
# rows without a cabin number get 0 in all three columns.

cabin_no = X['CabinNumber']
X['CabinNumberLow'] = cabin_no.le(30).astype(int)
X['CabinNumberMed'] = (cabin_no.gt(30) & cabin_no.le(63)).astype(int)
X['CabinNumberHigh'] = cabin_no.gt(63).astype(int)

X[['Cabin', 'CabinNumber', 'CabinNumberMed', 'CabinNumberHigh']].head(7)

Unnamed: 0,Cabin,CabinNumber,CabinNumberMed,CabinNumberHigh
0,,,0,0
1,C85,85.0,0,1
2,,,0,0
3,C123,123.0,0,1
4,,,0,0
5,,,0,0
6,E46,46.0,1,0


In [206]:
# Fill missing Embarked values with 'S' (the value used here as the mode)

embarked_missing_before = X[['Embarked']].isna().sum()
print('Missing values before:', embarked_missing_before)

X['Embarked'] = X['Embarked'].where(X['Embarked'].notna(), 'S')

embarked_missing_after = X[['Embarked']].isna().sum()
print('Missing values after:', embarked_missing_after)

Missing values before: Embarked    2
dtype: int64
Missing values after: Embarked    0
dtype: int64


In [207]:
# Feature engineering - Embarked
# One 0/1 indicator per Embarked code

embarked_flag_columns = {
    'EmbarkedC': 'C',
    'EmbarkedQ': 'Q',
    'EmbarkedS': 'S',
}

for flag_column, port in embarked_flag_columns.items():
    X[flag_column] = np.where(X['Embarked'] == port, 1, 0)

X[['Embarked', 'EmbarkedC']].head()

Unnamed: 0,Embarked,EmbarkedC
0,S,0
1,C,1
2,S,0
3,S,0
4,S,0


In [208]:
# Replace missing age with training set median (by title and class)
#
# Medians are computed from training rows only and are used to fill ONLY the
# rows whose Age is missing, in both train and test; known ages are left
# untouched. Values are combined positionally (as NumPy arrays) rather than by
# index label: X is built by concatenating train and test while keeping each
# frame's own row labels, so labels repeat, and assigning a train-indexed
# Series back to X['Age'] would copy train ages onto the test rows that share
# a label.

print('Missing values before:', X[['Age']].isna().sum())

train_rows = X[X['Label'] == 'train']

# Training-set medians: per (Title, Pclass), with per-Title and overall
# medians as fallbacks for combinations that have no known age in train

age_by_title_class = (
    train_rows.groupby(['Title', 'Pclass'])['Age'].median().rename('AgeFill')
)
age_by_title = train_rows.groupby('Title')['Age'].median()
age_overall = train_rows['Age'].median()

# Fill value for every row of X, in X's row order (left join keeps that order)

age_fill = (
    X[['Title', 'Pclass']]
    .join(age_by_title_class, on=['Title', 'Pclass'])['AgeFill']
    .to_numpy(dtype=float)
)
title_fill = X['Title'].map(age_by_title).to_numpy(dtype=float)
age_fill = np.where(np.isnan(age_fill), title_fill, age_fill)
age_fill = np.where(np.isnan(age_fill), age_overall, age_fill)

X['Age'] = np.where(X['Age'].isna(), age_fill, X['Age'])

print('Missing values after:', X[['Age']].isna().sum())

Missing values before: Age    263
dtype: int64
Missing values after: Age    0
dtype: int64


In [209]:
# Replace missing fare with training set median (by class)
#
# Each missing Fare is filled with the training-set median fare of that row's
# own Pclass, rather than a median hard-coded to third class. Values are
# combined positionally (as NumPy arrays), so repeated row labels in X cannot
# misalign them.

print('Missing values before:', X[['Fare']].isna().sum())

fare_by_class = X.loc[X['Label'] == 'train'].groupby('Pclass')['Fare'].median()
fare_fill = X['Pclass'].map(fare_by_class).to_numpy(dtype=float)

X['Fare'] = np.where(X['Fare'].isna(), fare_fill, X['Fare'])

print('Missing values after:', X[['Fare']].isna().sum())

Missing values before: Fare    1
dtype: int64
Missing values after: Fare    0
dtype: int64


In [210]:
# Split the combined matrix back into train and test using the Label column

label = X['Label']

X_train = X.loc[label == 'train']
X_test = X.loc[label == 'test']

In [211]:
# Write the full training matrix (before unused features are dropped) to csv;
# the row index is written as well (to_csv default)

all_features_path = 'intermediary_outputs/X_train_all.csv'
X_train.to_csv(all_features_path)

X_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Label,FirstClass,SecondClass,Title,Mr,Miss,Mrs,Master,NameLength,Female,AgeMissing,Child,FamilySize,FamilySizeCat,FamilySizeSmall,FamilySizeBig,TicketLength,TicketFirstCharacter,TicketFirstCharacter3,TicketFirstCharacter2,TicketFirstCharacter1,TicketFirstCharacterP,TicketFirstCharacterS,TicketFirstCharacterC,TicketFirstCharacterA,TicketFirstCharacterW,TicketFirstCharacter4,CabinCategory,CabinA,CabinB,CabinC,CabinD,CabinE,CabinF,CabinMissing,CabinNumber,CabinNumberLow,CabinNumberMed,CabinNumberHigh,EmbarkedC,EmbarkedQ,EmbarkedS
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,0,0,Mr,1,0,0,0,23,0,0,0,2,Small,1,0,9,A,0,0,0,0,0,0,1,0,0,,0,0,0,0,0,0,1,,0,0,0,0,0,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,1,0,Mrs,0,0,1,0,51,1,0,0,2,Small,1,0,8,P,0,0,0,1,0,0,0,0,0,C,0,0,1,0,0,0,0,85.0,0,0,1,1,0,0
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,0,0,Miss,0,1,0,0,22,1,0,0,1,Alone,0,0,16,S,0,0,0,0,1,0,0,0,0,,0,0,0,0,0,0,1,,0,0,0,0,0,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,1,0,Mrs,0,0,1,0,44,1,0,0,2,Small,1,0,6,1,0,0,1,0,0,0,0,0,0,C,0,0,1,0,0,0,0,123.0,0,0,1,0,0,1
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,0,0,Mr,1,0,0,0,24,0,0,0,1,Alone,0,0,6,3,1,0,0,0,0,0,0,0,0,,0,0,0,0,0,0,1,,0,0,0,0,0,1


In [212]:
# Drop unused features
# The same columns are removed from both the train and the test matrix

unused_features = [
    'Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked',
    'Label', 'Title', 'FamilySize', 'FamilySizeCat', 'TicketFirstCharacter',
    'CabinCategory', 'CabinNumber',
]

X_train = X_train.drop(columns=unused_features)
X_test = X_test.drop(columns=unused_features)

In [213]:
# List the columns that remain in the training matrix and report how many
# there are

features_used = list(X_train.columns)
n_features_used = len(features_used)

print('Number of features used:', n_features_used)
print('Features used:', features_used)

Number of features used: 37
Features used: ['Age', 'Fare', 'FirstClass', 'SecondClass', 'Mr', 'Miss', 'Mrs', 'Master', 'NameLength', 'Female', 'AgeMissing', 'Child', 'FamilySizeSmall', 'FamilySizeBig', 'TicketLength', 'TicketFirstCharacter3', 'TicketFirstCharacter2', 'TicketFirstCharacter1', 'TicketFirstCharacterP', 'TicketFirstCharacterS', 'TicketFirstCharacterC', 'TicketFirstCharacterA', 'TicketFirstCharacterW', 'TicketFirstCharacter4', 'CabinA', 'CabinB', 'CabinC', 'CabinD', 'CabinE', 'CabinF', 'CabinMissing', 'CabinNumberLow', 'CabinNumberMed', 'CabinNumberHigh', 'EmbarkedC', 'EmbarkedQ', 'EmbarkedS']


In [214]:
# Write the final train and test matrices (after dropping unused features) to
# csv, without the row index

for frame, csv_path in ((X_train, 'intermediary_outputs/X_train.csv'),
                        (X_test, 'intermediary_outputs/X_test.csv')):
    frame.to_csv(csv_path, index=False)

X_train.head()

Unnamed: 0,Age,Fare,FirstClass,SecondClass,Mr,Miss,Mrs,Master,NameLength,Female,AgeMissing,Child,FamilySizeSmall,FamilySizeBig,TicketLength,TicketFirstCharacter3,TicketFirstCharacter2,TicketFirstCharacter1,TicketFirstCharacterP,TicketFirstCharacterS,TicketFirstCharacterC,TicketFirstCharacterA,TicketFirstCharacterW,TicketFirstCharacter4,CabinA,CabinB,CabinC,CabinD,CabinE,CabinF,CabinMissing,CabinNumberLow,CabinNumberMed,CabinNumberHigh,EmbarkedC,EmbarkedQ,EmbarkedS
0,22.0,7.25,0,0,1,0,0,0,23,0,0,0,1,0,9,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,38.0,71.2833,1,0,0,0,1,0,51,1,0,0,1,0,8,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0
2,26.0,7.925,0,0,0,1,0,0,22,1,0,0,0,0,16,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,35.0,53.1,1,0,0,0,1,0,44,1,0,0,1,0,6,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1
4,35.0,8.05,0,0,1,0,0,0,24,0,0,0,0,0,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
