# 🧪 Classification EDA Master Template
An adaptable, reusable exploratory data analysis (EDA) pipeline for real-world classification datasets, demonstrated here on the Titanic dataset.


In [None]:
# Step 0: Setup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from scipy.stats import iqr

# Notebook-wide configuration: silence all warnings, use seaborn's
# "whitegrid" theme, and make 10x5 inches the default figure size.
# NOTE: the blanket "ignore" filter also hides deprecation warnings.
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 5)})

# Load Dataset
df = sns.load_dataset('titanic')
# Columns left out of this analysis; labels that are not present in the
# loaded frame are ignored rather than raising.
drop_cols = ['deck', 'embark_town', 'class', 'alive', 'who', 'embarked', 'adult_male', 'alone']
df = df.drop(columns=drop_cols, errors="ignore")
df.head()

In [None]:
# Step 1: Basic Info
def basic_info(df):
    """Print a quick structural summary of ``df`` to stdout.

    Emits, in order: the frame's shape, the dtype of every column, the
    number of null values per column, and the rows returned by
    ``df.head()``.

    Args:
        df: The pandas DataFrame to summarize.

    Returns:
        None.
    """
    shape = df.shape
    print("🔹 Shape:", shape)

    column_types = df.dtypes
    print("\n🔹 Data Types:\n", column_types)

    null_counts = df.isnull().sum()
    print("\n🔹 Null Values:\n", null_counts)

    preview = df.head()
    print("\n🔹 Sample Rows:\n", preview)

# Print the structural summary for the frame loaded in Step 0.
basic_info(df)

In [None]:
# Step 2: Target Distribution
def plot_target_distribution(df, target):
    """Show a count plot of the values in the ``target`` column.

    Args:
        df: The pandas DataFrame holding the data.
        target: Name of the column whose value counts are plotted.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x=target)
    heading = f"Target Distribution: {target}"
    plt.title(heading)
    plt.show()

# Plot the class balance of the 'survived' column.
plot_target_distribution(df, 'survived')

In [None]:
# Step 3: Categorical Univariate Analysis
def univariate_categorical(df):
    """Print value counts and draw a count plot for each categorical column.

    Columns with ``object`` or pandas ``category`` dtype are considered.
    Columns with more than 20 distinct values are skipped so the plots stay
    readable.

    Args:
        df: The pandas DataFrame to analyze.

    Returns:
        None. Output goes to stdout and to displayed figures.
    """
    # 'category' is included so that columns stored with pandas' categorical
    # dtype are not silently left out of the categorical analysis.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        if df[col].nunique() > 20:
            continue
        # Computed once and reused for both the printout and the bar order.
        counts = df[col].value_counts()
        print(f"🔹 Value Counts for {col}:\n", counts)
        sns.countplot(x=col, data=df, order=counts.index)
        plt.title(f"Count Plot - {col}")
        plt.xticks(rotation=45)
        plt.show()

# Run the categorical univariate analysis on the frame loaded in Step 0.
univariate_categorical(df)

In [None]:
# Step 4: Numeric Univariate Analysis
def optimal_bins(data, *, min_bins=10, max_bins=100, fallback=30):
    """Choose a histogram bin count using the Freedman-Diaconis bin width.

    The bin width is ``2 * IQR / n ** (1/3)`` computed on the non-null
    values; the bin count is the value range divided by that width,
    truncated to an integer and clamped to ``[min_bins, max_bins]``.

    Args:
        data: A pandas Series of numeric values; nulls are dropped first.
        min_bins: Lower clamp for the returned bin count.
        max_bins: Upper clamp for the returned bin count.
        fallback: Bin count returned when the rule cannot be applied: no
            non-null values, a zero IQR, or a non-finite width or range.

    Returns:
        int: The number of bins to use.
    """
    values = data.dropna()
    # Without this guard an empty series produces NaN and int(NaN) raises.
    if values.empty:
        return fallback

    bin_width = 2 * iqr(values) / np.cbrt(len(values))
    span = values.max() - values.min()
    # A zero IQR would divide by zero; infinite values would overflow int().
    if not np.isfinite(bin_width) or bin_width <= 0 or not np.isfinite(span):
        return fallback

    bins = int(span / bin_width)
    return max(min_bins, min(bins, max_bins))

def plot_numeric_distribution(df):
    """Plot a histogram (with KDE) and a boxplot for each numeric column.

    The histogram marks the mean and median and reports the skewness in its
    title: above 1 is labelled "Right Skewed", below -1 "Left Skewed", and
    anything else "Symmetric". Columns with no non-null values are skipped
    with a printed notice instead of aborting the whole loop.

    Args:
        df: The pandas DataFrame to analyze.

    Returns:
        None. Figures are displayed with ``plt.show()``.
    """
    num_cols = df.select_dtypes(include=np.number).columns
    for col in num_cols:
        series = df[col]
        # An all-null column has no distribution to draw, and the bin
        # computation cannot handle it.
        if series.count() == 0:
            print(f"⚠️ Skipping {col}: no non-null values to plot.")
            continue

        plt.figure(figsize=(12, 6))
        bins = optimal_bins(series)
        sns.histplot(series, kde=True, bins=bins, color='lightblue')
        plt.axvline(series.mean(), color='blue', linestyle='--', linewidth=2, label='Mean')
        plt.axvline(series.median(), color='red', linestyle='-', linewidth=2, label='Median')
        skew = series.skew()
        if skew > 1:
            skew_type = 'Right Skewed'
        elif skew < -1:
            skew_type = 'Left Skewed'
        else:
            skew_type = 'Symmetric'
        plt.title(f"Distribution of {col} | Skew: {skew:.2f} ({skew_type})")
        plt.legend()
        plt.show()

        plt.figure(figsize=(10, 2))
        sns.boxplot(x=series, color='salmon')
        plt.title(f"Boxplot: {col}")
        plt.show()

# Plot the distribution of every numeric column in the frame loaded in Step 0.
plot_numeric_distribution(df)

In [None]:
# Step 5: Categorical vs Target
def categorical_vs_target(df, target):
    """Draw a count plot of each categorical column split by the target.

    Columns with ``object`` or pandas ``category`` dtype are considered.
    The target column itself and columns with more than 20 distinct values
    are skipped.

    Args:
        df: The pandas DataFrame to analyze.
        target: Name of the column used as the ``hue`` of every plot.

    Returns:
        None. Figures are displayed with ``plt.show()``.
    """
    # 'category' is included so that columns stored with pandas' categorical
    # dtype are not silently left out.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        # Plotting the target against itself carries no information.
        if col == target or df[col].nunique() > 20:
            continue
        sns.countplot(x=col, hue=target, data=df)
        plt.title(f"{col} vs {target}")
        plt.xticks(rotation=45)
        plt.legend(title=target)
        plt.show()

# Compare each categorical column against the 'survived' column.
categorical_vs_target(df, 'survived')

In [None]:
# Step 6: Correlation Heatmap
def plot_correlation(df):
    """Show an annotated heatmap of correlations between numeric columns.

    Args:
        df: The pandas DataFrame to analyze; non-numeric columns are
            excluded before the correlation matrix is computed.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    numeric_frame = df.select_dtypes(include=np.number)
    plt.figure(figsize=(12, 8))
    corr_matrix = numeric_frame.corr()
    heatmap_options = dict(annot=True, cmap='coolwarm', fmt='.2f', square=True)
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title("Correlation Heatmap")
    plt.show()

# Draw the correlation heatmap for the frame loaded in Step 0.
plot_correlation(df)

In [None]:
# Step 7: Missing Values
def plot_missing(df):
    """Show a bar chart of missing-value counts per column.

    Only columns with at least one null are plotted, ordered from most to
    fewest missing values. When no column has nulls, a confirmation message
    is printed instead.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        None.
    """
    per_column = df.isnull().sum()
    missing = per_column[per_column > 0].sort_values(ascending=False)

    if missing.empty:
        print("✅ No missing values.")
        return

    plt.figure(figsize=(8, 4))
    sns.barplot(x=missing.index, y=missing.values, palette="viridis")
    plt.title("Missing Values per Column")
    plt.xticks(rotation=45)
    plt.ylabel("Missing Count")
    plt.show()

# Report missing values for the frame loaded in Step 0.
plot_missing(df)