In [114]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 


In [115]:
df = pd.read_csv("/Users/aryan/Desktop/Titanic-Dataset.csv")

In [116]:
print(df.shape)

(891, 12)


In [117]:
### Feature Description
# - Survived: Binary outcome (0 = No, 1 = Yes)
# - Pclass: Passenger class (1 = First, 3 = Third)
# - Sex: Encoded variable (0 = Male, 1 = Female)
# - Age: Passenger age in years
# - SibSp: Number of siblings/spouses aboard
# - Parch: Number of parents/children aboard
# - Fare: Ticket price
# - Embarked: Port of embarkation (0 = S, 1 = C, 2 = Q)

In [5]:
df['Survived'].value_counts()
df['Survived'].value_counts(normalize=True)

Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64

In [118]:
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df = df.drop(columns=['Cabin', 'PassengerId', 'Name', 'Ticket'], errors='ignore')
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

In [119]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,29.361582,0.523008,0.381594,32.204208,0.361392
std,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429,0.635673
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,0.0
50%,0.0,3.0,0.0,28.0,0.0,0.0,14.4542,0.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,1.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


In [120]:
df['Survived'].value_counts()


Survived
0    549
1    342
Name: count, dtype: int64

In [131]:
overall_survival = (
    df['Survived']
    .value_counts(normalize=True)
    .mul(100)
    .rename(index={0: 'Did Not Survive', 1: 'Survived'})
    .to_frame(name='Percentage (%)')
    .round(2)
)

overall_survival

Unnamed: 0_level_0,Percentage (%)
Survived,Unnamed: 1_level_1
Did Not Survive,61.62
Survived,38.38


In [121]:
pd.crosstab(df['Sex'], df['Survived'])


Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,468,109
1,81,233


In [122]:
pd.crosstab(df['Sex'], df['Survived'], normalize='index') * 100

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81.109185,18.890815
1,25.796178,74.203822


In [123]:
pd.crosstab(df['Pclass'], df['Survived'], normalize='index') * 100

Survived,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,37.037037,62.962963
2,52.717391,47.282609
3,75.763747,24.236253


In [124]:
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=[0, 12, 18, 35, 60, 100],
    labels=['Child', 'Teen', 'Young Adult', 'Adult', 'Senior'])
pd.crosstab(df['AgeGroup'], df['Survived'], normalize='index') * 100

Survived,0,1
AgeGroup,Unnamed: 1_level_1,Unnamed: 2_level_1
Child,42.028986,57.971014
Teen,57.142857,42.857143
Young Adult,64.672897,35.327103
Adult,60.0,40.0
Senior,77.272727,22.727273


In [125]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
pd.crosstab(df['FamilySize'], df['Survived'], normalize='index') * 10


Survived,0,1
FamilySize,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6.964618,3.035382
2,4.47205,5.52795
3,4.215686,5.784314
4,2.758621,7.241379
5,8.0,2.0
6,8.636364,1.363636
7,6.666667,3.333333
8,10.0,0.0
11,10.0,0.0


In [126]:
summary = df.groupby('Pclass').agg({
    'Survived': 'mean',
    'Age': 'mean',
    'Fare': 'mean'})

summary

Unnamed: 0_level_0,Survived,Age,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.62963,36.81213,84.154687
2,0.472826,29.76538,20.662183
3,0.242363,25.932627,13.67555


In [127]:
survival_by_sex = (
    pd.crosstab(df['Sex'], df['Survived'], normalize='index') * 100
).round(2)

survival_by_sex.index = ['Male', 'Female']
survival_by_sex.columns = ['Did Not Survive (%)', 'Survived (%)']

survival_by_sex

Unnamed: 0,Did Not Survive (%),Survived (%)
Male,81.11,18.89
Female,25.8,74.2


In [128]:
Summary_stats = df[['Age', 'Fare']].describe().round(2)
Summary_stats

Unnamed: 0,Age,Fare
count,891.0,891.0
mean,29.36,32.2
std,13.02,49.69
min,0.42,0.0
25%,22.0,7.91
50%,28.0,14.45
75%,35.0,31.0
max,80.0,512.33


In [129]:
key_findings = pd.DataFrame({
    'Insight': ['Overall survival rate','Female survival rate','Male survival rate','First class survival rate','Third class survival rate'],
    'Value (%)': [df['Survived'].mean() * 100, survival_by_sex.loc['Female', 'Survived (%)'], survival_by_sex.loc['Male', 'Survived (%)'],summary.loc[1, 'Survived'] * 100,summary.loc[3, 'Survived'] * 100]}).round(2)

key_findings


Unnamed: 0,Insight,Value (%)
0,Overall survival rate,38.38
1,Female survival rate,74.2
2,Male survival rate,18.89
3,First class survival rate,62.96
4,Third class survival rate,24.24


In [130]:
overall_survival = (
    df['Survived']
    .value_counts(normalize=True)
    .mul(100)
    .rename(index={0: 'Did Not Survive', 1: 'Survived'})
    .to_frame(name='Percentage (%)')
    .round(2)
)

overall_survival

Unnamed: 0_level_0,Percentage (%)
Survived,Unnamed: 1_level_1
Did Not Survive,61.62
Survived,38.38
