In [99]:
# Inspired by https://www.kaggle.com/code/zlatankr/titanic-random-forest-82-78

In [100]:
# Imports

import pandas as pd
import numpy as np

In [101]:
# Read the training and test sets from CSV files in the working directory

train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [102]:
# Summary of training data: one line per column with its non-null count and
# dtype. memory_usage=False leaves the memory-usage line out of the report.

train.info(memory_usage=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)

In [103]:
# Show the first ten rows of the training data

train.iloc[:10]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [104]:
# Share of passengers per Survived value (0 / 1), rounded to two decimals

(
    train['Survived']
    .value_counts(normalize=True)
    .round(2)
)

Survived
0    0.62
1    0.38
Name: proportion, dtype: float64

In [105]:
# Mean of Survived (i.e. survival rate) for each passenger class

train.groupby('Pclass')['Survived'].mean().round(2)

Pclass
1    0.63
2    0.47
3    0.24
Name: Survived, dtype: float64

In [106]:
# Survival by title
#
# The title is taken as the first run of letters in Name that is immediately
# followed by a period (e.g. "Mr" in "Braund, Mr. Owen Harris"). The pattern
# is a raw string so that `\.` reaches the regex engine as an escaped dot
# rather than being parsed as an (invalid) Python string escape.

train['Title'] = train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Passenger count and survival rate per title, most frequent titles first
(
    train.groupby('Title')['Survived']
    .agg(['count', 'mean'])
    .round(2)
    .reset_index()
    .sort_values('count', ascending=False)
)

Unnamed: 0,Title,count,mean
12,Mr,517,0.16
9,Miss,182,0.7
13,Mrs,125,0.79
8,Master,40,0.57
4,Dr,7,0.43
15,Rev,6,0.0
7,Major,2,0.5
1,Col,2,0.5
10,Mlle,2,1.0
11,Mme,1,1.0


In [107]:
# Survival by name length
#
# NameLength is the character count of Name. Survival rate is then averaged
# within five equal-frequency bins of that length.

train['NameLength'] = train['Name'].str.len()

# pd.qcut yields a categorical grouper; observed=False is passed explicitly so
# the result does not depend on pandas' changing default for that option.
(
    train.groupby(pd.qcut(train['NameLength'], 5), observed=False)['Survived']
    .mean()
    .round(2)
)

NameLength
(11.999, 19.0]    0.22
(19.0, 23.0]      0.30
(23.0, 27.0]      0.32
(27.0, 32.0]      0.44
(32.0, 82.0]      0.67
Name: Survived, dtype: float64

In [108]:
# Survival rate for each value of Sex

train.groupby('Sex')['Survived'].mean().round(2)

Sex
female    0.74
male      0.19
Name: Survived, dtype: float64

In [109]:
# Survival rate split by whether Age is recorded (True) or missing (False)

train.groupby(train['Age'].notna())['Survived'].mean().round(2)

Age
False    0.29
True     0.41
Name: Survived, dtype: float64

In [110]:
# Survival by age
#
# Age is cut into five equal-frequency bins. pd.qcut yields a categorical
# grouper, so observed=False is passed explicitly to keep the result
# independent of pandas' changing default for that option.

(
    train.groupby(pd.qcut(train['Age'], 5), observed=False)['Survived']
    .mean()
    .round(2)
)

Age
(0.419, 19.0]    0.48
(19.0, 25.0]     0.33
(25.0, 31.8]     0.39
(31.8, 41.0]     0.44
(41.0, 80.0]     0.37
Name: Survived, dtype: float64

In [111]:
# Passenger count and survival rate for each SibSp value
# (number of siblings on board)

train.groupby('SibSp')['Survived'].agg(['count', 'mean']).round(2)

Unnamed: 0_level_0,count,mean
SibSp,Unnamed: 1_level_1,Unnamed: 2_level_1
0,608,0.35
1,209,0.54
2,28,0.46
3,16,0.25
4,18,0.17
5,5,0.0
8,7,0.0


In [112]:
# Passenger count and survival rate for each Parch value
# (number of parents/children on board)

train.groupby('Parch')['Survived'].agg(['count', 'mean']).round(2)

Unnamed: 0_level_0,count,mean
Parch,Unnamed: 1_level_1,Unnamed: 2_level_1
0,678,0.34
1,118,0.55
2,80,0.5
3,5,0.6
4,4,0.0
5,5,0.2
6,1,0.0


In [113]:
# Survival by family size
#
# FamilySize counts the passenger (the leading 1) plus their SibSp and Parch.

train['FamilySize'] = 1 + train['SibSp'] + train['Parch']

# Passenger count and survival rate per family size
train.groupby('FamilySize')['Survived'].agg(['count', 'mean']).round(2)

Unnamed: 0_level_0,count,mean
FamilySize,Unnamed: 1_level_1,Unnamed: 2_level_1
1,537,0.3
2,161,0.55
3,102,0.58
4,29,0.72
5,15,0.2
6,22,0.14
7,12,0.33
8,6,0.0
11,7,0.0


In [114]:
# Survival by ticket length
#
# TicketLength is the number of characters in the Ticket string.

train['TicketLength'] = train['Ticket'].str.len()

# Passenger count and survival rate per ticket length
(
    train.groupby('TicketLength')['Survived']
    .agg(['count', 'mean'])
    .round(2)
)

Unnamed: 0_level_0,count,mean
TicketLength,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2,0.0
4,101,0.37
5,131,0.62
6,419,0.32
7,27,0.3
8,76,0.54
9,26,0.19
10,41,0.34
11,8,0.25
12,10,0.4


In [115]:
# Survival by the first character of the Ticket string

train['TicketFirstCharacter'] = train['Ticket'].str[0]

# Passenger count and survival rate per leading ticket character
(
    train.groupby('TicketFirstCharacter')['Survived']
    .agg(['count', 'mean'])
    .round(2)
)

Unnamed: 0_level_0,count,mean
TicketFirstCharacter,Unnamed: 1_level_1,Unnamed: 2_level_1
1,146,0.63
2,183,0.46
3,301,0.24
4,10,0.2
5,3,0.0
6,6,0.17
7,9,0.11
8,2,0.0
9,1,1.0
A,29,0.07


In [116]:
# Survival by fare
#
# Fare is cut into three equal-frequency bins. pd.qcut yields a categorical
# grouper, so observed=False is passed explicitly to keep the result
# independent of pandas' changing default for that option.

(
    train.groupby(pd.qcut(train['Fare'], 3), observed=False)['Survived']
    .mean()
    .round(2)
)

Fare
(-0.001, 8.662]    0.20
(8.662, 26.0]      0.40
(26.0, 512.329]    0.56
Name: Survived, dtype: float64

In [117]:
# Survival rate split by whether Cabin is recorded (True) or missing (False)

train.groupby(train['Cabin'].notna())['Survived'].mean().round(2)

Cabin
False    0.30
True     0.67
Name: Survived, dtype: float64

In [118]:
# Survival by cabin letter
#
# Only rows with a recorded Cabin are used. The copy keeps the new
# CabinCategory column off the main `train` frame.

train_not_na = train.dropna(subset=['Cabin']).copy()

# First character of the Cabin string, via the vectorised .str accessor
train_not_na['CabinCategory'] = train_not_na['Cabin'].str[0]

# Passenger count and survival rate per cabin letter
(
    train_not_na.groupby('CabinCategory')['Survived']
    .agg(['count', 'mean'])
    .round(2)
)

Unnamed: 0_level_0,count,mean
CabinCategory,Unnamed: 1_level_1,Unnamed: 2_level_1
A,15,0.47
B,47,0.74
C,59,0.59
D,33,0.76
E,32,0.75
F,13,0.62
G,4,0.5
T,1,0.0


In [119]:
# Survival by cabin number
#
# Cabin is split on spaces and the last piece is kept; dropping that piece's
# first character leaves the part that is parsed as a number. Missing Cabin
# values propagate as NaN through the .str accessor, so there is no need to
# stringify them to 'nan' and convert them back.

train['CabinNumber'] = pd.to_numeric(
    train['Cabin'].str.split(' ').str[-1].str[1:]
)

# Survival rate within three equal-frequency bins of the cabin number.
# pd.qcut yields a categorical grouper; observed=False is passed explicitly so
# the result does not depend on pandas' changing default for that option.
(
    train.groupby(pd.qcut(train['CabinNumber'], 3), observed=False)['Survived']
    .mean()
    .round(2)
)

CabinNumber
(1.999, 28.667]     0.72
(28.667, 65.667]    0.65
(65.667, 148.0]     0.64
Name: Survived, dtype: float64

In [120]:
# Survival rate for each value of Embarked

train.groupby('Embarked')['Survived'].mean().round(2)

Embarked
C    0.55
Q    0.39
S    0.34
Name: Survived, dtype: float64

In [121]:
# Mean Age within each passenger class, rounded to one decimal

train.groupby('Pclass')['Age'].mean().round(1)

Pclass
1    38.2
2    29.9
3    25.1
Name: Age, dtype: float64

In [122]:
# Mean Age for each value of Sex, rounded to one decimal

train.groupby('Sex')['Age'].mean().round(1)

Sex
female    27.9
male      30.7
Name: Age, dtype: float64

In [123]:
# Number of recorded ages and mean Age for each Title

train.groupby('Title')['Age'].agg(['count', 'mean']).round(1)

Unnamed: 0_level_0,count,mean
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,1,70.0
Col,2,58.0
Countess,1,33.0
Don,1,40.0
Dr,6,42.0
Jonkheer,1,38.0
Lady,1,48.0
Major,2,48.5
Master,36,4.6
Miss,146,21.8


In [124]:
# Number of recorded ages and mean Age for each (Title, Pclass) pair,
# ordered by Title and then Pclass

(
    train['Age']
    .groupby([train['Title'], train['Pclass']])
    .agg(['count', 'mean'])
    .round(1)
    .sort_values(['Title', 'Pclass'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean
Title,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
Capt,1,1,70.0
Col,1,2,58.0
Countess,1,1,33.0
Don,1,1,40.0
Dr,1,4,43.8
Dr,2,2,38.5
Jonkheer,1,1,38.0
Lady,1,1,48.0
Major,1,2,48.5
Master,1,3,5.3


In [125]:
# Mean Fare within each passenger class, rounded to one decimal

train.groupby('Pclass')['Fare'].mean().round(1)

Pclass
1    84.2
2    20.7
3    13.7
Name: Fare, dtype: float64

In [126]:
# Number of passengers for each Embarked value (missing values not counted)

(
    train['Embarked']
    .value_counts()
)

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64