In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

## Titanic Dataset

The aim of this workbook is to analyse the impact that the "first class" and "women and children" evacuation policy may have had on the survivorship of boarders on the Titanic.

## 1. Data Preprocessing

In [3]:
columns_to_remove=[]

### 1.1 Data Loading And Analysis

In [4]:
train_df=pd.read_csv('train.csv')
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### 1.2 Analyze and Replace / Remove Missing Data

In [5]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
train_df.isna().sum() * 100 / len(train_df)

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

Three columns are mising values. Most significant missing variables are the below:

* 1. Cabin: over 77% missing. is it even worth keeping given how many are missing.
* 2. Age: this is quite a significant missing dataset, given that age (combined with gender) were a big factor in whether you were evacuated first.
* 3. Embarked: less than 1%. Extrapolate.

Makes sense to just remove the cabin columns, so will add it to the list of columns_to_remove.

In [7]:
columns_to_remove.append('cabin')

### 1.3 Replace Missing Age Values

https://www.kaggle.com/code/allohvk/titanic-missing-age-imputation-tutorial-advanced

In [16]:
no_age = train_df.loc[train_df['Age'].isna()]
no_age

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


There are a number of ways to do this. 

* 1. Just apply a blanket average to the missing values. However, given that I intend to investigate the role that the "women and children" evacuation policy had, I will need to come up with a better approach.
* 2. Take averages based on a number of columns: use gender, titles, parch to narrow down the expected age of the missing values.

What factors can help to determine the age of the missing data set:

* 1. Sibsp / Parch
* 2. Title?
* 3. Ticket number

In [24]:
missing_age_ticket_numbers_set=set(no_age['Ticket'].to_list())
missing_age_ticket_numbers_set

{'110465',
 '111427',
 '112052',
 '112058',
 '112379',
 '113028',
 '113056',
 '113505',
 '113510',
 '113767',
 '113796',
 '113798',
 '11774',
 '12460',
 '14311',
 '14312',
 '14313',
 '1601',
 '16988',
 '17421',
 '17453',
 '17464',
 '19947',
 '19988',
 '19996',
 '226593',
 '239853',
 '239854',
 '239855',
 '239856',
 '244373',
 '248727',
 '2624',
 '2626',
 '2627',
 '2629',
 '2631',
 '2641',
 '2647',
 '2649',
 '2661',
 '2662',
 '2664',
 '2665',
 '2668',
 '2671',
 '2674',
 '2677',
 '2678',
 '2686',
 '2689',
 '2700',
 '312991',
 '312993',
 '315037',
 '323592',
 '330877',
 '330909',
 '330919',
 '330931',
 '330932',
 '330935',
 '330959',
 '330979',
 '330980',
 '334912',
 '335677',
 '3411',
 '343095',
 '345777',
 '349201',
 '349208',
 '349214',
 '349215',
 '349216',
 '349217',
 '349218',
 '349221',
 '349222',
 '349223',
 '349225',
 '349227',
 '349234',
 '349253',
 '349254',
 '35852',
 '358585',
 '36209',
 '362316',
 '364498',
 '364848',
 '364851',
 '36568',
 '367226',
 '367228',
 '367229',
 '3

In [31]:
print(f"len {len(missing_age_ticket_numbers_set)}")

len 155


In [30]:
ticket = train_df.loc[train_df['Ticket'].isin(missing_age_ticket_numbers_set)]
ticket

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [33]:
non_na_ticket=ticket.dropna()
non_na_ticket

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
110,111,0,1,"Porter, Mr. Walter Chamberlain",male,47.0,0,0,110465,52.0,C110,S
195,196,1,1,"Lurette, Miss. Elise",female,58.0,0,0,PC 17569,146.5208,B80,C
356,357,1,1,"Bowerman, Miss. Elsie Edith",female,22.0,0,1,113505,55.0,E33,S
453,454,1,1,"Goldenberg, Mr. Samuel L",male,49.0,1,0,17453,89.1042,C92,C
550,551,1,1,"Thayer, Mr. John Borland Jr",male,17.0,0,2,17421,110.8833,C70,C
581,582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth M...",female,39.0,1,1,17421,110.8833,C68,C
698,699,0,1,"Thayer, Mr. John Borland",male,49.0,1,1,17421,110.8833,C68,C
700,701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18.0,1,0,PC 17757,227.525,C62 C64,C
712,713,1,1,"Taylor, Mr. Elmer Zebley",male,48.0,1,0,19996,52.0,C126,S
716,717,1,1,"Endres, Miss. Caroline Louise",female,38.0,0,0,PC 17757,227.525,C45,C


In [34]:
print(len(non_na_ticket))

10


In [9]:
parch = train_df.loc[(train_df['Parch'] > 0) | (train_df['SibSp'] > 0)]
parch

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.0,1,0,P/PP 3381,24.0000,,C
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q


In [10]:
parch['Age'].isna().sum()

44

In [11]:
none_parch = train_df.loc[((train_df['Parch'] == 0) & (train_df['SibSp'] == 0))]

In [12]:
none_parch['Age'].isna().sum()

133