In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the cleaned Titanic passenger table into `df`, the frame every later cell works on.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs
# as written on this machine — consider a path relative to a configurable data directory.
df = pd.read_csv("/Users/joshndirangu/Documents/titanic-eda/data/cleaned_titanic.csv")

FEATURE ENGINEERING 

In [12]:
# FamilySize = relatives counted in SibSp and Parch, plus one for the passenger themselves
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

In [4]:
df.info() # verify the new 'FamilySize' column is present with no missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Fare         891 non-null    float64
 9   Embarked     891 non-null    object 
 10  FamilySize   891 non-null    int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 76.7+ KB


In [13]:
# Flag passengers travelling without family: 1 = alone (FamilySize == 1), 0 = with family
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

In [14]:
# Pull each passenger's honorific out of Name: the run of letters that follows a
# space and ends with a period (e.g. "Mr" in "Berriman, Mr. William John").
# The pattern is a raw string so that `\.` reaches the regex engine as an escaped dot;
# in a normal string literal `\.` is an invalid escape sequence that Python warns about.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [15]:
# Collapse the uncommon honorifics into a single 'Rare' bucket
df['Title'] = df['Title'].replace(
    to_replace=['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                'Rev', 'Sir', 'Jonkheer', 'Dona'],
    value='Rare',
)

In [16]:
# Map variant spellings of a title onto the standard form used in the rest of the data
df['Title'] = df['Title'].replace({
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
})

In [17]:
# Spot-check 10 random rows; no random_state is set, so the rows differ on every run
df.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title
733,734,0,2,"Berriman, Mr. William John",1,23.0,0,0,13.0,2,1,1,Mr
601,602,0,3,"Slabenoff, Mr. Petco",1,28.0,0,0,7.8958,2,1,1,Mr
691,692,1,3,"Karun, Miss. Manca",0,4.0,0,1,13.4167,0,2,0,Miss
872,873,0,1,"Carlsson, Mr. Frans Olof",1,33.0,0,0,5.0,2,1,1,Mr
371,372,0,3,"Wiklund, Mr. Jakob Alfred",1,18.0,1,0,6.4958,2,2,0,Mr
333,334,0,3,"Vander Planke, Mr. Leo Edmondus",1,16.0,2,0,18.0,2,3,0,Mr
763,764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",0,36.0,1,2,120.0,2,4,0,Mrs
86,87,0,3,"Ford, Mr. William Neal",1,16.0,1,3,34.375,2,5,0,Mr
749,750,0,3,"Connaghton, Mr. Michael",1,31.0,0,0,7.75,1,1,1,Mr
309,310,1,1,"Francatelli, Miss. Laura Mabel",0,30.0,0,0,56.9292,0,1,1,Miss


In [11]:
# Count missing values per column
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
FamilySize     0
IsAlone        0
Title          0
dtype: int64

In [18]:
# Bucket the continuous Age values into labelled age bands.
# Bins are right-inclusive: (0, 12], (12, 18], (18, 35], (35, 60], (60, 80];
# an age outside (0, 80] falls in no bin and gets NaN.
df['AgeGroup'] = pd.cut(
    x=df['Age'],
    bins=[0, 12, 18, 35, 60, 80],
    labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior'],
)

In [21]:
# Split Fare at its quartiles into four labelled bands of roughly equal passenger count
df['FareBand'] = pd.qcut(
    x=df['Fare'],
    q=4,
    labels=['Low', 'Medium', 'High', 'VeryHigh'],
)

In [22]:
# Check the column list and dtypes after adding the binned features
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    int64   
 3   Name         891 non-null    object  
 4   Sex          891 non-null    int64   
 5   Age          891 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int64   
 8   Fare         891 non-null    float64 
 9   Embarked     891 non-null    int64   
 10  FamilySize   891 non-null    int64   
 11  IsAlone      891 non-null    int64   
 12  Title        891 non-null    int64   
 13  AgeGroup     891 non-null    int64   
 14  FareBand     891 non-null    category
dtypes: category(1), float64(2), int64(11), object(1)
memory usage: 98.6+ KB


In [23]:
# Installing sci-kit learn for the following functinaities
# Splitting data, Modeling, Evaluation metrics

!pip install --upgrade --timeout=300 --retries=5 --no-cache-dir scikit-learn



In [24]:
# Turn the string / categorical columns into integer codes for modelling
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()

# LabelEncoder assigns codes in sorted label order,
# e.g. Sex: female=0, male=1 and Embarked: C=0, Q=1, S=2.
for column in ('Sex', 'Embarked', 'Title'):
    df[column] = label.fit_transform(df[column])

# The binned columns are categoricals, so cast them to str before encoding.
# NOTE(review): the codes follow alphabetical label order, not the natural order of the
# bands (e.g. FareBand: High=0, Low=1, Medium=2, VeryHigh=3) — confirm this is acceptable
# for the downstream model.
for column in ('AgeGroup', 'FareBand'):
    df[column] = label.fit_transform(df[column].astype(str))

In [25]:
# Spot-check 10 random rows after encoding; unseeded, so the rows differ on every run
df.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,AgeGroup,FareBand
303,304,1,2,"Keane, Miss. Nora A",0,28.0,0,0,12.35,1,1,1,1,4,2
779,780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton Mc...",0,43.0,0,1,211.3375,2,2,0,3,0,3
16,17,0,3,"Rice, Master. Eugene",1,2.0,4,1,29.125,1,6,0,0,1,0
345,346,1,2,"Brown, Miss. Amelia ""Mildred""",0,24.0,0,0,13.0,2,1,1,1,4,2
653,654,1,3,"O'Leary, Miss. Hanora ""Norah""",0,28.0,0,0,7.8292,1,1,1,1,4,1
813,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",0,6.0,4,2,31.275,2,7,0,1,1,3
369,370,1,1,"Aubart, Mme. Leontine Pauline",0,24.0,0,0,69.3,0,1,1,3,4,3
616,617,0,3,"Danbom, Mr. Ernst Gilbert",1,34.0,1,1,14.4,2,3,0,2,4,2
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,13.0,2,1,1,4,4,2
32,33,1,3,"Glynn, Miss. Mary Agatha",0,28.0,0,0,7.75,1,1,1,1,4,1


In [26]:
# Check the dtypes after label encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Fare         891 non-null    float64
 9   Embarked     891 non-null    int64  
 10  FamilySize   891 non-null    int64  
 11  IsAlone      891 non-null    int64  
 12  Title        891 non-null    int64  
 13  AgeGroup     891 non-null    int64  
 14  FareBand     891 non-null    int64  
dtypes: float64(2), int64(12), object(1)
memory usage: 104.5+ KB


In [28]:
# Drop the free-text Name column now that Title has been extracted from it.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell idempotent:
# re-running it is a no-op rather than a KeyError on the already-removed column.
df = df.drop(columns=['Name'], errors='ignore')

In [29]:
# Check the remaining columns after dropping Name
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    int64  
 9   FamilySize   891 non-null    int64  
 10  IsAlone      891 non-null    int64  
 11  Title        891 non-null    int64  
 12  AgeGroup     891 non-null    int64  
 13  FareBand     891 non-null    int64  
dtypes: float64(2), int64(12)
memory usage: 97.6 KB


In [30]:
# Spot-check 10 random rows of the final frame; unseeded, so the rows differ on every run
df.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,AgeGroup,FareBand
498,499,0,1,0,25.0,1,2,151.55,2,4,0,3,4,3
18,19,0,3,0,31.0,1,0,18.0,2,2,0,3,4,0
400,401,1,3,1,39.0,0,0,7.925,2,1,1,2,0,2
516,517,1,2,0,34.0,0,0,10.5,2,1,1,3,4,2
878,879,0,3,1,28.0,0,0,7.8958,2,1,1,2,4,1
528,529,0,3,1,39.0,0,0,7.925,2,1,1,2,0,2
626,627,0,2,1,57.0,0,0,12.35,1,1,1,4,0,2
490,491,0,3,1,28.0,1,0,19.9667,2,2,0,2,4,0
155,156,0,1,1,51.0,0,1,61.3792,0,2,0,2,0,3
689,690,1,1,0,15.0,0,1,211.3375,2,2,0,1,3,3


In [32]:
# Save the engineered data to a new, separate CSV file (index=False omits the row index).
# NOTE(review): this is an absolute, machine-specific path — consider a path relative
# to a configurable data directory so the notebook runs elsewhere.

df.to_csv('/Users/joshndirangu/Documents/titanic-eda/data/engineered_titanic.csv', index=False)