# Exploratory Data Analysis (EDA) on the Titanic Dataset
**Author:** Triveni
**Task:** EDA (Data Analyst Internship Task 5)
**Date:** 2025-09-29

**Goal:** Extract insights using visual and statistical exploration.

In [None]:
# Cell: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# visualization settings
%matplotlib inline
sns.set(style="whitegrid", context='notebook')
plt.rcParams['figure.figsize'] = (10,6)

In [None]:
# Cell: Load dataset (seaborn built-in Titanic)
# `titanic` holds the frame as returned by seaborn (similar in structure to the
# Kaggle version); all exploration below runs on the independent copy `df`.
titanic = sns.load_dataset('titanic')
df = titanic.copy(deep=True)
df.head(5)

In [None]:
# Cell: Quick info and summary
# Print the frame's dimensions and its dtype / non-null overview, then show
# describe() tables separately for numeric and for non-numeric columns.
print("Shape:", df.shape)

print("\nInfo:")
df.info()

numeric_kinds = [np.number]
categorical_kinds = ['object', 'category', 'bool']
for heading, kinds in (
    ("\nNumerical Describe:", numeric_kinds),
    ("\nCategorical Describe:", categorical_kinds),
):
    print(heading)
    display(df.describe(include=kinds))

In [None]:
# Cell: Missing values
# Per-column null counts (largest first) next to the same counts expressed as a
# percentage of all rows, rounded to two decimals.
missing = df.isna().sum().sort_values(ascending=False)
missing_pct = missing.div(len(df)).mul(100).round(2)
pd.concat(
    [missing.rename('missing_count'), missing_pct.rename('missing_pct')],
    axis=1,
)

In [None]:
# Cell: Basic counts
# Show the frequency table (NaN included) of each listed column that is
# actually present in the frame; absent columns are skipped silently.
print("Unique values (sample):")
columns_to_count = (
    'survived', 'sex', 'class', 'embarked', 'who',
    'alive', 'deck', 'embark_town', 'alone',
)
for col in columns_to_count:
    if col not in df.columns:
        continue
    print(f"\n{col} value counts:")
    frequencies = df[col].value_counts(dropna=False)
    display(frequencies)

In [None]:
# Cell: Univariate visualizations - numerical
# One histogram (30 bins) with a KDE overlay per numeric column, each drawn on
# its own 8x4 figure.
num_cols = ['age', 'fare', 'sibsp', 'parch']  # available numeric-ish columns
for col in num_cols:
    values = df[col]
    plt.figure(figsize=(8, 4))
    sns.histplot(values, bins=30, kde=True)
    plt.title(f'Distribution of {col}')
    plt.tight_layout()
    plt.show()

In [None]:
# Cell: Univariate - categorical barplots
# Count plot per categorical column, bars ordered from most to least frequent
# level; columns missing from the frame are skipped.
cat_cols = ['sex', 'class', 'embarked', 'who', 'adult_male', 'alone']
for col in cat_cols:
    if col not in df.columns:
        continue
    levels_by_frequency = df[col].value_counts().index
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x=col, order=levels_by_frequency)
    plt.title(f'Count of {col}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Cell: Bivariate - Survival rate by categorical features
# For each categorical feature present in the frame, plot the mean of the 0/1
# `survived` column per level, i.e. the survival rate of that level.
cat_features = ['sex', 'class', 'who', 'alone', 'embarked']
for col in cat_features:
    if col not in df.columns:
        continue
    plt.figure(figsize=(6, 4))
    # errorbar=None suppresses the error bars; it replaces the `ci=None`
    # spelling, which seaborn deprecated in favour of `errorbar`.
    sns.barplot(data=df, x=col, y='survived', errorbar=None)
    plt.title(f'Survival rate by {col}')
    plt.ylabel('Survival rate (0-1)')
    plt.ylim(0, 1)  # fixed scale so the plots are comparable across features
    plt.tight_layout()
    plt.show()

In [None]:
# Cell: Survival vs Age (numerical)
# First figure: box plot of age for each outcome, with the 0/1 tick positions
# relabelled as Died/Survived.
plt.figure(figsize=(8, 5))
sns.boxplot(data=df, x='survived', y='age')
plt.title('Age distribution by Survival')
plt.xticks(ticks=[0, 1], labels=['Died', 'Survived'])
plt.tight_layout()
plt.show()

# Second figure: one age density curve per outcome (survivors drawn first),
# with missing ages dropped before estimating each curve.
plt.figure(figsize=(8, 5))
for outcome, label in ((1, 'Survived'), (0, 'Died')):
    ages = df.loc[df['survived'] == outcome, 'age'].dropna()
    sns.kdeplot(data=ages, label=label)
plt.title('Age KDE by Survival')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Cell: Fare distribution by survival
# Box plot of fare for each outcome; the 0/1 tick positions are relabelled as
# Died/Survived.
outcome_labels = ['Died', 'Survived']
plt.figure(figsize=(8, 5))
sns.boxplot(data=df, x='survived', y='fare')
plt.title('Fare distribution by Survival')
plt.xticks(ticks=[0, 1], labels=outcome_labels)
plt.tight_layout()
plt.show()

In [None]:
# Cell: Correlation heatmap (numerical features)
# Pairwise correlations of the numeric-dtype columns, drawn as an annotated
# heatmap with the colour scale pinned to the full [-1, 1] range.
num_df = df.select_dtypes(include=[np.number])
corr = num_df.corr()

heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
plt.figure(figsize=(8, 6))
sns.heatmap(corr, **heatmap_options)
plt.title('Correlation matrix (numerical)')
plt.tight_layout()
plt.show()

In [None]:
# Cell: Pairplot (subset, to speed up)
# Restrict to a handful of columns and drop rows with any missing value in
# them, then draw the lower-triangle pair plot coloured by outcome.
pair_columns = ['survived', 'age', 'fare', 'sibsp', 'parch']
subset = df.loc[:, pair_columns].dropna()
sns.pairplot(subset, hue='survived', corner=True, diag_kind='kde')
plt.suptitle('Pairplot (subset) by Survival', y=1.02)  # y > 1 lifts the title above the grid
plt.show()

In [None]:
# Cell: Deck & Embarked - missingness insight
# Count plot for each of the two columns (when present), bars ordered from most
# to least frequent level. Each entry pairs a column with its plot title.
missingness_plots = (
    ('deck', 'Count by Deck (many missing = NaN)'),
    ('embark_town', 'Count by Embark Town'),
)
for column, title in missingness_plots:
    if column not in df.columns:
        continue
    levels_by_frequency = df[column].value_counts().index
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x=column, order=levels_by_frequency)
    plt.title(title)
    plt.tight_layout()
    plt.show()

In [None]:
# Cell: Handling missing values example (simple feature engineering)
# Work on a copy so the raw `df` used by the other cells keeps its NaNs.
df_fe = df.copy()

# Age: impute with the median of the observed ages.
df_fe['age'] = df_fe['age'].fillna(df_fe['age'].median())

# Embark town: impute with the most frequent value.
if 'embark_town' in df_fe:
    df_fe['embark_town'] = df_fe['embark_town'].fillna(df_fe['embark_town'].mode()[0])

# Deck: label missing entries 'Unknown'. When the column has a categorical
# dtype, fillna() raises for a value that is not an existing category, so the
# label is registered as a category before filling.
if 'deck' in df_fe:
    deck = df_fe['deck']
    if isinstance(deck.dtype, pd.CategoricalDtype) and 'Unknown' not in deck.cat.categories:
        deck = deck.cat.add_categories('Unknown')
    df_fe['deck'] = deck.fillna('Unknown')

# Create 'family_size' from sibsp + parch (a missing count is treated as 0)
df_fe['family_size'] = df_fe['sibsp'].fillna(0) + df_fe['parch'].fillna(0)
df_fe[['age', 'family_size', 'deck']].head()

In [None]:
# Cell: Outliers check - fare
# Single horizontal box plot of fare from the imputed frame; points beyond the
# whiskers are the candidate outliers.
plt.figure(figsize=(8, 4))
sns.boxplot(data=df_fe, x='fare')
plt.title('Fare boxplot (outliers present)')
plt.tight_layout()
plt.show()

In [None]:
# Cell: Summary statistics and key findings (programmatic)
# Mean of the 0/1 `survived` column per group, i.e. the survival rate, by sex,
# by passenger class and by age band.
#
# observed=False is passed explicitly wherever the grouping key may be
# categorical (`class`, the pd.cut bins): it keeps every category in the
# result, empty ones included, and avoids relying on the implicit default that
# newer pandas versions warn about.
survival_by_sex = (
    df.groupby('sex', observed=False)['survived'].mean().sort_values(ascending=False)
)
survival_by_class = (
    df.groupby('class', observed=False)['survived'].mean().sort_values(ascending=False)
)

# Age bands are right-closed intervals: (0, 12], (12, 18], (18, 35], (35, 60],
# (60, 100]. Rows with a missing age fall in no band and are left out.
age_bands = pd.cut(df['age'], bins=[0, 12, 18, 35, 60, 100])
survival_by_age_group = df.groupby(age_bands, observed=False)['survived'].mean()

print("Survival rate by sex:\n", survival_by_sex)
print("\nSurvival rate by class:\n", survival_by_class)
print("\nSurvival rate by age group:\n", survival_by_age_group)

# Conclusions & Recommendations
- Women had significantly higher survival rates than men.
- Passengers in higher classes (First) had higher survival rates.
- Children (ages 0-12) survived at a higher rate than the adult age groups; among teenagers and adults the rates were mixed, with no consistent trend by age.
- Higher fare correlates weakly with survival (likely proxy for class).
- Missing data: `age`, `deck`, and `embark_town` all contain missing values (`deck` most heavily) — handle them carefully before modeling.
- Recommendations: impute missing values (e.g., the median for age); encode categorical variables (one-hot, or ordinal where appropriate); create features such as family_size and is_alone; and treat fare outliers by removing them, capping them, or applying a log transform.