In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Global plotting defaults: seaborn's white grid theme and a larger default canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# Fetch the Titanic sample dataset through seaborn's dataset loader
df = sns.load_dataset('titanic')

# Preview the top of the frame to see what the columns look like
print("First 5 rows of the Titanic dataset:")
display(df.head(5))

# Per-column overview (printed directly by DataFrame.info)
print("\nDataset Information:")
df.info()


In [None]:
# Summary statistics for numerical features
# (describe() with no arguments summarises numeric columns only)
print("Descriptive Statistics for Numerical Features:")
display(df.describe())

# Summary statistics for categorical features.
# Boolean columns (e.g. 'alone') are neither numeric nor object/category, so
# they would be missing from BOTH tables unless 'bool' is requested explicitly.
print("\nDescriptive Statistics for Categorical Features:")
display(df.describe(include=['object', 'category', 'bool']))


In [None]:
# Univariate view of the two continuous features: one row per feature,
# histogram (with KDE overlay) on the left and boxplot on the right.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Distribution of Numerical Features', fontsize=16)

# (axis label, values for the histogram, values for the boxplot, histogram bins)
# Age is handed to the histogram with its missing values already removed.
numeric_panels = [
    ('Age', df['age'].dropna(), df['age'], 30),
    ('Fare', df['fare'], df['fare'], 40),
]

# Iterating over `axes` yields one (left, right) pair of Axes per grid row
for (label, hist_values, box_values, n_bins), (hist_ax, box_ax) in zip(numeric_panels, axes):
    sns.histplot(hist_values, kde=True, ax=hist_ax, bins=n_bins)
    hist_ax.set(title=f'{label} Distribution', xlabel=label, ylabel='Frequency')

    sns.boxplot(x=box_values, ax=box_ax)
    box_ax.set(title=f'{label} Boxplot', xlabel=label)

# Reserve a strip at the top of the canvas so the suptitle is not overlapped
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
# Frequency of each level for the key categorical features, one panel per feature
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Count of Categorical Features', fontsize=16)

# (column, panel title, x-axis label) listed in row-major panel order
count_panels = [
    ('survived', 'Survival Count (0 = No, 1 = Yes)', 'Survived'),
    ('pclass', 'Passenger Class Distribution', 'Class'),
    ('sex', 'Gender Distribution', 'Sex'),
    ('embarked', 'Port of Embarkation', 'Port (C=Cherbourg, Q=Queenstown, S=Southampton)'),
]

# axes.flat walks the 2x2 grid left-to-right, top-to-bottom
for ax, (column, title, x_label) in zip(axes.flat, count_panels):
    sns.countplot(x=column, data=df, ax=ax)
    ax.set(title=title, xlabel=x_label, ylabel='Count')

# Reserve a strip at the top of the canvas so the suptitle is not overlapped
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
# Bivariate analysis: Feature vs. Survival
# Each bar aggregates the 0/1 'survived' flag within a category; with seaborn's
# default estimator (the mean) that is the share of passengers who survived.
fig, axes = plt.subplots(1, 3, figsize=(20, 7))
fig.suptitle('Survival Rate by Categorical Features', fontsize=16)

# (column, panel title, x-axis label) — explicit x labels so every panel
# is readable on its own instead of showing the raw column name
survival_panels = [
    ('sex', 'Survival Rate by Sex', 'Sex'),
    ('pclass', 'Survival Rate by Passenger Class', 'Class'),
    ('embarked', 'Survival Rate by Port of Embarkation',
     'Port (C=Cherbourg, Q=Queenstown, S=Southampton)'),
]

for ax, (column, title, x_label) in zip(axes, survival_panels):
    sns.barplot(x=column, y='survived', data=df, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Survival Rate')

# Reserve a strip at the top of the canvas so the suptitle is not overlapped
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# Age distribution vs. Survival: one histogram facet per value of 'survived'
g = sns.FacetGrid(df, col='survived', height=6)
g.map(sns.histplot, 'age', kde=True, bins=25)
# Use the grid's `figure` attribute; `fig` is the deprecated alias for it.
# y > 1 places the title just above the facets.
g.figure.suptitle('Age Distribution by Survival Status', y=1.03)
plt.show()


In [None]:
# Correlation matrix over the numeric features.
# Redundant / string-valued columns are dropped first, and the boolean 'alone'
# flag is cast to int so that the numeric dtype filter below keeps it.
redundant_cols = ['who', 'adult_male', 'deck', 'embark_town', 'alive', 'class']
df_corr = (
    df
    .drop(columns=redundant_cols)
    .assign(alone=lambda frame: frame['alone'].astype(int))
)
numerical_cols = df_corr.select_dtypes(include=np.number)
correlation_matrix = numerical_cols.corr()

# Annotated heatmap of the pairwise correlations (two decimals per cell)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Numerical Features')
plt.show()

# Pairwise relationships between a handful of key variables, coloured by
# survival status. Only a subset of columns is used to keep the grid readable.
# NOTE(review): 'sex' is a string column; pairplot plots numeric columns by
# default, so it presumably does not appear in the grid — confirm whether it
# should be encoded or dropped from this subset.
pairplot_cols = ['survived', 'pclass', 'age', 'fare', 'sex']
pairplot_df = df[pairplot_cols]
sns.pairplot(data=pairplot_df, hue='survived', diag_kind='kde', palette='viridis')
# suptitle targets the current figure, i.e. the pairplot grid just created
plt.suptitle('Pairplot of Key Features by Survival Status', y=1.02)
plt.show()
