In [None]:
# Clone the repository that holds the dataset files read later in this notebook.
!git clone https://github.com/GeeksforgeeksDS/21-Days-21-Projects-Dataset.git

# Data Storytelling: Analysing Survival on the **Titanic**

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Location of the Titanic CSV inside the cloned dataset repository.
TITANIC_CSV = "/content/21-Days-21-Projects-Dataset/Datasets/Titanic-Dataset.csv"

# Load the raw passenger table into the notebook-wide DataFrame.
df = pd.read_csv(TITANIC_CSV)

In [None]:
# Show the loaded DataFrame as the cell output.
df

In [None]:
# Preview the top 5 rows of the DataFrame.
df.head()

In [None]:
# Preview the bottom 5 rows of the DataFrame.
df.tail()

In [None]:
# Summary statistics of the DataFrame.
df.describe()

In [None]:
# Column overview used to spot which columns contain null values.
df.info()

Here, we can see that there are null values in the Age, Cabin, and Embarked columns. We need to fix them before the analysis.

In [None]:
# Count the missing values in each column.
df.isnull().sum()

How missing values will be handled:

- Numerical column: fill with the median
- Categorical column: fill with the mode
- Too many missing values: drop the column or create a new column from it

Cleaning The **Data**

In [None]:
# Cleaning Age: a numerical column, so the gaps are filled with the median.
median = df["Age"].median()
print(median)

# Write the median only into the rows where Age is missing.
df.loc[df["Age"].isna(), "Age"] = median

# Re-count missing values per column; Age has now been filled.
df.isnull().sum()

In [None]:
# Cleaning Embarked: a categorical column, so the gaps are filled with the mode.
mode = df["Embarked"].mode()
print(mode)

# mode() returns a Series; its first entry is used as the fill value.
df.loc[df["Embarked"].isna(), "Embarked"] = mode.iloc[0]

# Re-count missing values per column; Embarked has now been filled.
df.isnull().sum()

In [None]:
# Cleaning the Cabin column.
# Most of the Cabin values are missing, so instead of imputing them we derive
# an indicator column: 1 when a cabin is recorded, 0 when it is missing.
df["Has Cabin"] = (~df["Cabin"].isna()).astype(int)
df.head()

In [None]:
# Drop the original Cabin column now that "Has Cabin" has been derived from it.
# Reassigning the result (rather than inplace=True) together with
# errors="ignore" keeps this cell safe to re-run once "Cabin" is already gone.
df = df.drop(columns="Cabin", errors="ignore")
df.head()

In [None]:
# Count the missing values per column again after the cleaning steps.
df.isnull().sum()

**Now we are done with cleaning : It's Analysis Part**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

First, univariate analysis: analysing a single variable at a time.

In [None]:
# Preview the first 10 rows of the cleaned DataFrame.
df.head(10)

Categorical - Count Plots (lets analyze them in the figures)

In [None]:
print("Univariate Analysis: ")

# Configure the seaborn look BEFORE the figure is created: the style is only
# picked up by axes created afterwards, so setting it after plt.subplots()
# made the first run of this cell look different from later re-runs.
# The end state is the same as before: darkgrid style with the pastel palette.
sns.set_theme(style="darkgrid")
sns.set_palette("pastel")

# 3 x 3 grid of subplots; seven categorical columns are plotted into it.
fig, axes = plt.subplots(3, 3, figsize=(18, 22))
fig.suptitle("Univariate Analysis of the Categorical Columns", fontsize=16)

# (column to count, subplot title) pairs, filled row by row.
count_plots = [
    ("Survived", "Survival Analysis"),
    ("Pclass", "Class Analysis"),
    ("Sex", "Sex Analysis"),
    ("Embarked", "Embarked Analysis"),
    ("Has Cabin", "Has Cabin Analysis"),
    ("SibSp", "SibSp Analysis"),
    ("Parch", "Parch Analysis"),
]

flat_axes = axes.ravel()
for ax, (column, title) in zip(flat_axes, count_plots):
    sns.countplot(ax=ax, x=column, data=df).set_title(title)

# Hide the grid slots that received no plot instead of showing empty axes.
for ax in flat_axes[len(count_plots):]:
    ax.set_visible(False)

# Leave the top 4% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

KEY INSIGHTS:

---
- Fewer passengers survived than did not survive.
- Most people were from the lower class.
- Most passengers were male.
- Most people boarded at S = Southampton.
- Few people had a cabin.
- Most people were travelling alone.


**For Numerical- Histplot**

In [None]:
print("Numerical Unvariate Analysis : ")

# One histogram (with a KDE curve) per numerical column, side by side.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(22, 12))
for ax, column in zip(axes, ("Age", "Fare")):
    hist_ax = sns.histplot(data=df, x=column, bins=40, kde=True, ax=ax)
    hist_ax.set_title(f"{column} Distribution")

plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

**Key Insights**

---
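- The largest number of passengers fall in the 28-29 age range. Note that this peak is inflated: the missing ages were filled with the median during cleaning, so every imputed passenger lands in this one bin.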
- Fare Distribution is right-skewed with maximum people having low fare



Bivariate Analysis.   
Analysis of the relationship between two columns

In [None]:
# Column overview of the DataFrame before the bivariate plots.
df.info()

In [None]:
# Bi-Variate Analysis: Feature vs Survival
# Each bar plot aggregates "Survived" per category of one feature.
fig, axes = plt.subplots(2, 2, figsize=(22, 12))

# Features compared against survival, one per subplot (row by row).
features = ["Sex", "Pclass", "Embarked", "Has Cabin"]
for ax, feature in zip(axes.ravel(), features):
    sns.barplot(ax=ax, x=feature, y="Survived", data=df).set_title(f"{feature} vs Survival")

plt.tight_layout(rect=[0, 0, 1, 0.96])
# Render explicitly, consistent with the other plotting cells.
plt.show()

Insights :


---
- Poor die, rich survive: more than 60% of upper-class passengers survived, while the survival rate of lower-class passengers was only around 25%.
- Most of the survivors were female; women had a much higher survival rate than men (why always men?).
- The survival rate was highest among passengers who boarded at Cherbourg (maybe because most of the upper-class and female passengers boarded there).
- Passengers who had a cabin survived more often (which is plausible, as richer people were the ones with cabins).


In [None]:
# Age vs. Survival: one age histogram per value of "Survived".
g = sns.FacetGrid(data=df, col="Survived", height=6)
g.map(sns.histplot, "Age", bins=25, kde=True)

# Title the whole grid; y slightly above 1 keeps it clear of the facet titles.
g.figure.suptitle("Age Distribution by Survival Status", y=1.02)
plt.show()

Insights:


---
- Survival rate of infant and young children was higher  
- Survival of adults was lower
- Very few passengers older than 65 survived


In [None]:
# Box plot of the ticket price, used to look at outliers in the fare distribution.
fare_fig, fare_ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, y="Fare", ax=fare_ax)
fare_ax.set_title("Box Plot for Ticket Price")
fare_ax.set_ylabel("Fare")
plt.show()


**Feature Engineering**

In [None]:
# 1. FamilySize: siblings/spouses + parents/children + 1.
df["FamilySize"] = df["SibSp"].add(df["Parch"]).add(1)

# 2. IsAlone: 1 when FamilySize is exactly 1, otherwise 0.
df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

print("Created 'FamilySize' and 'IsAlone' features:")
df[["FamilySize", "IsAlone"]].head()

In [None]:
# Preview the DataFrame with the newly added feature columns.
df.head()

Now we can check whether passengers travelling with family or passengers travelling alone died more often.

In [None]:
# Compare the family-related features against survival, side by side.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# (feature on the x-axis, subplot title) for each panel.
family_plots = (
    ("FamilySize", "Survival Rate by Family Size"),
    ("IsAlone", "Survival Rate for Lone Travelers"),
)
for ax, (feature, title) in zip(axes, family_plots):
    sns.barplot(data=df, x=feature, y="Survived", ax=ax).set_title(title)

plt.show()

Insights:


---
- Passengers travelling alone played the hero part and died more often
- Family sizes of 2-4 had the highest survival rate

In [None]:
# 3. Derive 'Title' from 'Name': capture the run of letters that follows a
#    space and is immediately followed by a period.
title_pattern = r' ([A-Za-z]+)\.'
df['Title'] = df['Name'].str.extract(title_pattern, expand=False)

# Show how often each extracted title occurs.
print("Extracted Titles:")
df['Title'].value_counts()

In [None]:
# Collapse the titles: uncommon ones become 'Rare', and spelling variants are
# folded into 'Miss' / 'Mrs'. All replacements are applied in a single pass.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_map = {rare: 'Rare' for rare in rare_titles}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
df['Title'] = df['Title'].replace(title_map)

# Survival rate for each of the cleaned titles.
plt.figure(figsize=(12, 6))
title_ax = sns.barplot(data=df, x='Title', y='Survived')
title_ax.set_title('Survival Rate by Title')
title_ax.set_ylabel('Survival Probability')
plt.show()

Insights :

---
- Married women (Mrs) survived the most, followed by unmarried women (Miss).


Multivariate Analysis

In [None]:
# Survival rate per passenger class, with one bar per sex.
class_sex_grid = sns.catplot(
    data=df, x='Pclass', y='Survived', hue='Sex',
    kind='bar', height=6, aspect=1.5,
)
# The grid holds a single axes here, so title and label go directly on it.
class_sex_grid.ax.set_title('Survival Rate by Pclass and Sex')
class_sex_grid.ax.set_ylabel('Survival Probability')
plt.show()

# Insights: Females in all classes had a significantly higher survival rate than males.

In [None]:
# Split violin plot: age distribution per sex, one half per survival status.
survival_colors = {0: 'blue', 1: 'orange'}
plt.figure(figsize=(14, 8))
violin_ax = sns.violinplot(
    data=df, x='Sex', y='Age', hue='Survived',
    split=True, palette=survival_colors,
)
violin_ax.set_title('Age Distribution by Sex and Survival')
plt.show()

Insights:


---
- Young boys survived more often
- For adult and older males, the survival and non-survival rates were similar
- Young girls survived less often
- For adult and older females, the survival rate was higher than the non-survival rate


Correlation Analysis

In [None]:
# Correlation heatmap for the numerical features.
plt.figure(figsize=(14, 10))

# Keep only numeric columns. "PassengerId" is presumably just a row identifier
# in this dataset, so its correlations carry no meaning; drop it when present
# (errors="ignore" keeps the cell working if the column does not exist).
numeric_cols = df.select_dtypes(include=np.number).drop(columns=["PassengerId"], errors="ignore")
correlation_matrix = numeric_cols.corr()

# Correlations range from -1 to 1, so use a diverging colormap centred on 0
# with a fixed scale; a sequential map makes negative values look like "none".
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

Insights:


---



---
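- As Pclass increases (1st → 3rd class), the chance of having a cabin, the fare, and the survival rate all decrease, e.g. 3rd class: lower fare, lower chance of survival
- Family size is highly correlated with Parch and SibSp, which is expected because FamilySize was computed as SibSp + Parch + 1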



In [None]:
# Install ydata-profiling
!pip install ydata-profiling -q

In [None]:
# Build an automated profiling report for the prepared DataFrame.
from ydata_profiling import ProfileReport

REPORT_TITLE = "Titanic Dataset Profiling Report"
profile = ProfileReport(df, title=REPORT_TITLE)

# Embed the rendered report in the notebook output.
profile.to_notebook_iframe()