In [None]:
# Titanic Dataset - EDA
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load seaborn's bundled "titanic" example dataset into a DataFrame.
# NOTE(review): load_dataset may need network access on first use — see seaborn docs.
titanic_data = sns.load_dataset("titanic")

# Preview the first five rows (head()'s default) as this cell's rich output.
titanic_data.head(n=5)

In [None]:

# Quick structural overview of the raw frame: size, schema, and missingness.
# The per-column null counts are computed once and reused for both the
# percentage and the absolute-count report.
row_count = len(titanic_data)
missing_counts = titanic_data.isnull().sum()
missing_pct = (missing_counts / row_count) * 100

# (label, value) pairs are printed in the order listed below.
overview = [
    ("Shape:", titanic_data.shape),
    ("Columns:", titanic_data.columns),
    ("Missing Percentage:", missing_pct),
    ("Data Types:", titanic_data.dtypes),
    ("Summary Statistics:", titanic_data.describe(include='all')),
    ("Missing Values:", missing_counts),
]
for label, value in overview:
    print(label, value)


In [None]:

# Visualise where values are missing: one heatmap row per DataFrame row, one
# column per feature; the boolean null mask is what gets colour-mapped.
fig, ax = plt.subplots(figsize=(10, 6))
null_mask = titanic_data.isnull()
sns.heatmap(
    null_mask,
    cbar=False,          # no colour bar drawn
    cmap='viridis',
    yticklabels=False,   # suppress per-row tick labels
    ax=ax,
)
ax.set_title('Missing Values Heatmap')
plt.show()

In [None]:

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Impute on a copy so the raw frame stays available for comparison.
titanic_data_filled = titanic_data.copy()
num_cols = titanic_data_filled.select_dtypes(include=['number']).columns
cat_cols = titanic_data_filled.select_dtypes(exclude=['number']).columns

# Numeric columns: model-based imputation, each feature regressed on the others.
# The forest is seeded explicitly: IterativeImputer's own random_state does not
# seed the estimator it clones, so an unseeded forest would give different
# imputed values on every run.
# NOTE(review): every numeric column (including `survived`, if numeric) is used
# as a predictor, so downstream plots of imputed values against those columns
# partly reflect the imputer — confirm this is acceptable for the analysis.
imputer = IterativeImputer(
    estimator=RandomForestRegressor(random_state=0),
    random_state=0,
)
imputed_numeric = pd.DataFrame(
    imputer.fit_transform(titanic_data_filled[num_cols]),
    columns=num_cols,
    index=titanic_data_filled.index,
)

# Write back only the columns that actually had gaps. The imputer returns a
# float array, so overwriting every numeric column would silently turn
# complete integer columns into floats (and e.g. 0/1 labels into 0.0/1.0).
num_cols_with_gaps = [col for col in num_cols if titanic_data_filled[col].isnull().any()]
if num_cols_with_gaps:
    titanic_data_filled[num_cols_with_gaps] = imputed_numeric[num_cols_with_gaps]

# Non-numeric columns: fill gaps with the most frequent value.
# NOTE(review): for a column that is mostly missing, mode-filling will dominate
# its distribution — consider an explicit "Unknown" level instead.
for col in cat_cols:
    col_modes = titanic_data_filled[col].mode()
    if col_modes.empty:
        # Column is entirely missing: there is no mode to fill with, so leave it
        # as-is rather than raising on col_modes[0].
        continue
    titanic_data_filled[col] = titanic_data_filled[col].fillna(col_modes[0])

print("\nMissing Values After Imputation:\n", titanic_data_filled.isnull().sum())


In [None]:

# Univariate distributions on the imputed frame. Each entry is
# (plotting function, column, title, extra keyword arguments); the plots are
# drawn one per figure, in the order listed.
univariate_plots = [
    (sns.countplot, 'survived', 'Survival Count', {}),
    (sns.countplot, 'pclass', 'Passenger Class Distribution', {}),
    (sns.histplot, 'age', 'Age Distribution', {'kde': True}),
    (sns.countplot, 'sex', 'Sex Distribution', {}),
]

for draw, column, title, extra_kwargs in univariate_plots:
    draw(x=column, data=titanic_data_filled, **extra_kwargs)
    plt.title(title)
    plt.show()



In [None]:

# Survival split by a single categorical feature (imputed frame), one figure each.
for column, title in (
    ('sex', 'Survival by Gender'),
    ('pclass', 'Survival by Passenger Class'),
):
    sns.countplot(x=column, hue='survived', data=titanic_data_filled)
    plt.title(title)
    plt.show()

# Coarse three-band age grouping, stored as a new column on the imputed frame.
# pd.cut uses right-closed intervals by default: (0, 18], (18, 65], (65, 100].
age_edges = [0, 18, 65, 100]
age_names = ['Child', 'Adult', 'Senior']
titanic_data_filled['age_group'] = pd.cut(
    titanic_data_filled['age'],
    bins=age_edges,
    labels=age_names,
)
sns.countplot(x='age_group', hue='survived', data=titanic_data_filled)
plt.title('Survival by Age Group')
plt.show()

# Survival by sex, faceted into one panel per passenger class.
sns.catplot(
    data=titanic_data_filled,
    kind='count',
    x='sex',
    hue='survived',
    col='pclass',
)
plt.show()


In [None]:

# Finer five-band age grouping. Each label maps to the inclusive upper edge of
# its band; the lower edge of the first band is 0 (exclusive, since pd.cut is
# right-closed by default).
age_band_upper_edges = {
    'Child': 12,
    'Teenager': 18,
    'Adult': 40,
    'Middle-aged': 60,
    'Senior': 80,
}
bins = [0, *age_band_upper_edges.values()]
labels = list(age_band_upper_edges)

# NOTE(review): this bins the raw `titanic_data` (not `titanic_data_filled`) and
# adds the column to it in place. Rows whose age is missing or outside the bins
# get a NaN age_group and are not counted in the plot — confirm that is intended.
titanic_data['age_group'] = pd.cut(titanic_data['age'], bins=bins, labels=labels)

sns.countplot(data=titanic_data, x='age_group', hue='survived', palette='magma')
plt.title('Survival Based on Age Group')
plt.show()



In [None]:

# Pairwise correlation between the numeric columns of the raw frame, drawn as an
# annotated heatmap with each coefficient shown to two decimals.
fig, ax = plt.subplots(figsize=(12, 8))
numeric_data = titanic_data.select_dtypes(include=np.number)
corr_matrix = numeric_data.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()