## Titanic Survival Rate Analysis

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the Titanic training data from the CSV hosted on GitHub into a DataFrame
url = 'https://raw.githubusercontent.com/LarryChenCode/uwaterloo/main/train.csv'
tr = pd.read_csv(url)
# Show the loaded frame as the cell output
tr

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Summarize each column's non-null count and dtype to spot columns with missing values
tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Three columns contain missing values: Age, Cabin, and Embarked.

# Age: 177 values are missing. Age is needed to study its relationship with
# survival, so the gaps are filled with the mean age instead of being dropped.
# The filled Series is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on tr['Age'] is chained assignment, which warns on
# pandas 2.x and does not modify `tr` at all once Copy-on-Write is enabled.
mean_age = tr['Age'].mean()
tr['Age'] = tr['Age'].fillna(mean_age)

# Embarked: only 2 values are missing, so drop those 2 rows.
tr.dropna(subset=['Embarked'], inplace=True)

# Cabin: too many values are missing to be useful, so drop the whole column.
tr.drop('Cabin', axis=1, inplace=True)

In [None]:
# Remove the columns that are not expected to help this analysis:
# PassengerId, Name, Fare, and Ticket.
unused_columns = ['PassengerId', 'Name', 'Fare', 'Ticket']
tr.drop(columns=unused_columns, inplace=True)

In [None]:
# Family_Size is the element-wise sum of the SibSp and Parch columns
tr['Family_Size'] = tr['SibSp'].add(tr['Parch'])

In [None]:
# Encode "Sex" as a binary indicator column: 1 when the value is 'female', 0 otherwise
tr['Female'] = (tr['Sex'] == 'female').astype('int64')

# Move the original 'Sex' column to the end of the frame
sex_col = tr.pop('Sex')
tr['Sex'] = sex_col

In [None]:
# Remove 'Embarked' from its current position and re-append it as the last column
emb_col = tr.pop('Embarked')
tr['Embarked'] = emb_col

In [None]:
#Create a new column age_type to classify the passenger by age
#Children (0-12), Teenager(13-18), Adult(19-65), and elder(66+)
def age_type(x):
    """Map an age to an integer age-bracket code.

    Codes:
        0 -- child    (x <= 12)
        1 -- teenager (12 < x <= 18)
        2 -- adult    (18 < x <= 65)
        3 -- elder    (everything else)

    Any value for which none of the upper-bound comparisons holds
    (ages above 65, and also NaN) falls through to code 3.
    """
    # Inclusive upper bound of each bracket, in ascending order; the position
    # of the first bound that x does not exceed is the bracket code.
    upper_bounds = (12, 18, 65)
    for code, bound in enumerate(upper_bounds):
        if x <= bound:
            return code
    return len(upper_bounds)

# Label every passenger with an age-bracket code, then move 'Age' to the last column
tr['Age_type'] = tr['Age'].map(age_type)
age_col = tr.pop('Age')
tr['Age'] = age_col

In [None]:
# Re-check the columns, non-null counts, and dtypes after the cleaning and feature steps above
tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Survived     889 non-null    int64  
 1   Pclass       889 non-null    int64  
 2   SibSp        889 non-null    int64  
 3   Parch        889 non-null    int64  
 4   Family_Size  889 non-null    int64  
 5   Female       889 non-null    int64  
 6   Sex          889 non-null    object 
 7   Embarked     889 non-null    object 
 8   Age_type     889 non-null    int64  
 9   Age          889 non-null    float64
dtypes: float64(1), int64(7), object(2)
memory usage: 76.4+ KB


In [None]:
# Display the transformed frame as the cell output
tr

Unnamed: 0,Survived,Pclass,SibSp,Parch,Family_Size,Female,Sex,Embarked,Age_type,Age
0,0,3,1,0,1,0,male,S,2,22.000000
1,1,1,1,0,1,1,female,C,2,38.000000
2,1,3,0,0,0,1,female,S,2,26.000000
3,1,1,1,0,1,1,female,S,2,35.000000
4,0,3,0,0,0,0,male,S,2,35.000000
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,0,0,0,male,S,2,27.000000
887,1,1,0,0,0,1,female,S,2,19.000000
888,0,3,1,2,3,1,female,S,2,29.699118
889,1,1,0,0,0,0,male,C,2,26.000000


In [None]:
# Pairwise correlation between the numeric variables.
# 'Sex' and 'Embarked' hold strings. Older pandas silently skipped such columns,
# but since pandas 2.0 DataFrame.corr() defaults to numeric_only=False and raises
# on them, so the numeric columns are selected explicitly (works on every version).
tr.select_dtypes(include='number').corr()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Family_Size,Female,Age_type,Age
Survived,1.0,-0.335549,-0.03404,0.083151,0.018277,0.541585,-0.12763,-0.074673
Pclass,-0.335549,1.0,0.081656,0.016824,0.064221,-0.127741,-0.141073,-0.327954
SibSp,-0.03404,0.081656,1.0,0.414542,0.890654,0.116348,-0.342752,-0.231875
Parch,0.083151,0.016824,0.414542,1.0,0.782988,0.247508,-0.353781,-0.178232
Family_Size,0.018277,0.064221,0.890654,0.782988,1.0,0.203191,-0.411044,-0.247546
Female,0.541585,-0.127741,0.116348,0.247508,0.203191,1.0,-0.119164,-0.089434
Age_type,-0.12763,-0.141073,-0.342752,-0.353781,-0.411044,-0.119164,1.0,0.689564
Age,-0.074673,-0.327954,-0.231875,-0.178232,-0.247546,-0.089434,0.689564,1.0


In [None]:
# Survival rate by Pclass: mean of the Survived column within each class
pclass_survival_rates = tr['Survived'].groupby(tr['Pclass']).mean()
pclass_survival_rates

Pclass
1    0.626168
2    0.472826
3    0.242363
Name: Survived, dtype: float64

In [None]:
# Survival rate by Sex: mean of the Survived column within each sex
sex_survival_rates = tr['Survived'].groupby(tr['Sex']).mean()
sex_survival_rates

Sex
female    0.740385
male      0.188908
Name: Survived, dtype: float64

In [None]:
# Survival rate by Embarked: mean of the Survived column within each Embarked value
emb_survival_rates = tr['Survived'].groupby(tr['Embarked']).mean()
emb_survival_rates

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64

In [None]:
# Survival rate by SibSp: mean of the Survived column for each SibSp value
sib_survival_rates = tr['Survived'].groupby(tr['SibSp']).mean()
sib_survival_rates

SibSp
0    0.343234
1    0.535885
2    0.464286
3    0.250000
4    0.166667
5    0.000000
8    0.000000
Name: Survived, dtype: float64

In [None]:
# Survival rate by Parch: mean of the Survived column for each Parch value
parch_survival_rates = tr['Survived'].groupby(tr['Parch']).mean()
parch_survival_rates

Parch
0    0.341716
1    0.550847
2    0.500000
3    0.600000
4    0.000000
5    0.200000
6    0.000000
Name: Survived, dtype: float64

In [None]:
# Survival rate by Family_Size: mean of the Survived column for each family size
famsiz_survival_rates = tr['Survived'].groupby(tr['Family_Size']).mean()
famsiz_survival_rates

Family_Size
0     0.300935
1     0.552795
2     0.578431
3     0.724138
4     0.200000
5     0.136364
6     0.333333
7     0.000000
10    0.000000
Name: Survived, dtype: float64

In [None]:
# Survival rate by Age_type: mean of the Survived column for each age-bracket code
# (0 = children, 1 = teenager, 2 = adult, 3 = elder)
agety_survival_rates = tr['Survived'].groupby(tr['Age_type']).mean()
agety_survival_rates

Age_type
0    0.579710
1    0.428571
2    0.362534
3    0.125000
Name: Survived, dtype: float64