Exploratory Data Analysis (EDA) script that produces:

* Cleaned string columns (whitespace stripped)
* Missing-value report
* Histograms + KDE for numeric features
* Horizontal count-plots for categorical features (top 15 categories)
* Descriptive statistics (`describe()`)
* Correlation heatmap (numeric only)

In [None]:
import math
import warnings
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# parameters
# Module-level settings read by the helper functions defined in this script.

PLOTS_PER_ROW = 3          # number of subplot columns in each plot grid
HIST_BINS = 30             # default `bins` value for plot_numerical
FIGSIZE_BASE = 6           # figure width allotted to each subplot column
ROW_HEIGHT = 4             # figure height allotted to each subplot row
CORR_FIGSIZE = (10, 8)     # default `figsize` for correlation_matrix
HEATMAP_CMAP = "coolwarm"  # colormap handed to sns.heatmap in correlation_matrix
WARN_MISSING = True        # NOTE(review): not referenced in the visible code -- confirm whether it is still needed

In [None]:
def _setup_style():
    """
    Configure global plot appearance.

    Switches seaborn to the "whitegrid" style and sets matplotlib's default
    figure size to one full row of subplots
    (``FIGSIZE_BASE * PLOTS_PER_ROW`` wide, ``ROW_HEIGHT`` tall).
    """
    default_size = (FIGSIZE_BASE * PLOTS_PER_ROW, ROW_HEIGHT)
    sns.set_style("whitegrid")
    plt.rcParams.update({"figure.figsize": default_size})


def clean_strings(
        df: pd.DataFrame,
        inplace: bool = False):
    """
    Strip leading/trailing whitespace from string values in object columns.

    Only elements that are actual ``str`` instances are stripped; every other
    value in an object column (numbers, ``None``, NaN, ...) is passed through
    unchanged. Categorical columns are selected but returned untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean.
    inplace : bool, default False
        When False a copy of ``df`` is cleaned and returned; when True the
        columns of ``df`` itself are overwritten.

    Returns
    -------
    pd.DataFrame
        The cleaned frame (``df`` itself when ``inplace`` is True).
    """
    if not inplace:
        df = df.copy()

    def _strip_text(value):
        return value.strip() if isinstance(value, str) else value

    def _clean_column(col):
        if col.dtype != "object":
            return col
        # Element-wise stripping instead of ``col.str.strip()``: the ``.str``
        # accessor replaces non-string elements with NaN and raises on object
        # columns that contain no strings at all. Forcing ``dtype=object``
        # keeps the column's dtype from being re-inferred.
        return pd.Series(
            [_strip_text(value) for value in col],
            index=col.index,
            name=col.name,
            dtype=object,
        )

    str_cols = df.select_dtypes(include=["object", "category"]).columns
    if len(str_cols):
        df[str_cols] = df[str_cols].apply(_clean_column)

    return df


def split_features(
        df: pd.DataFrame):
    """
    Partition column names by dtype.

    Returns a ``(categorical, numerical)`` tuple of lists: the first holds
    the names of object/category columns, the second the names of numeric
    columns (anything matching ``np.number``). Names keep the frame's
    column order.
    """
    text_like = df.select_dtypes(include=["object", "category"])
    numeric = df.select_dtypes(include=[np.number])
    return text_like.columns.tolist(), numeric.columns.tolist()


def plot_numerical(
    df: pd.DataFrame,
    cols: List[str],
    bins: int = HIST_BINS,
    kde: bool = True,
):
    """
    Draw one histogram per numeric column in a grid of subplots.

    The grid is ``PLOTS_PER_ROW`` wide with as many rows as needed; grid
    cells left over after the last column are hidden. When ``cols`` is empty
    a message is printed and nothing is drawn.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the data.
    cols : List[str]
        Names of the columns to plot.
    bins : int
        Number of histogram bins passed to ``sns.histplot``.
    kde : bool
        Whether ``sns.histplot`` overlays a KDE curve.
    """
    if not cols:
        print("No numerical columns to plot.")
        return

    n_plots = len(cols)
    n_rows = math.ceil(n_plots / PLOTS_PER_ROW)
    # squeeze=False always yields a 2-D array of axes, so a single ravel()
    # gives a flat sequence whatever the grid shape is.
    _, grid = plt.subplots(
        n_rows,
        PLOTS_PER_ROW,
        squeeze=False,
        figsize=(PLOTS_PER_ROW * FIGSIZE_BASE, n_rows * ROW_HEIGHT),
    )
    panels = grid.ravel()

    for panel, feature in zip(panels, cols):
        sns.histplot(data=df, x=feature, kde=kde, bins=bins, ax=panel, color="#4c72b0")
        panel.set_title(f"Distribution of {feature}", fontsize=12, pad=10)
        panel.set_xlabel("")
        panel.set_ylabel("")

    # hide the grid cells that received no plot
    for spare in panels[n_plots:]:
        spare.set_visible(False)

    plt.tight_layout()
    plt.show()


def plot_categorical(
    df: pd.DataFrame,
    cols: List[str],
    top_n: int = 15,
):
    """
    Draw one horizontal count plot per categorical column in a grid.

    Each plot shows only the ``top_n`` most frequent values of its column.
    The grid is ``PLOTS_PER_ROW`` wide; grid cells left over after the last
    column are hidden. When ``cols`` is empty a message is printed and
    nothing is drawn.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the data.
    cols : List[str]
        Names of the columns to plot.
    top_n : int
        Maximum number of categories shown per plot.
    """
    if not cols:
        print("No categorical columns to plot.")
        return

    n_plots = len(cols)
    n_rows = math.ceil(n_plots / PLOTS_PER_ROW)
    # squeeze=False always yields a 2-D array of axes, so a single ravel()
    # gives a flat sequence whatever the grid shape is.
    _, grid = plt.subplots(
        n_rows,
        PLOTS_PER_ROW,
        squeeze=False,
        figsize=(PLOTS_PER_ROW * FIGSIZE_BASE, n_rows * ROW_HEIGHT),
    )
    panels = grid.ravel()

    for panel, feature in zip(panels, cols):
        # keep just the most frequent values so the plot stays readable
        frequencies = df[feature].value_counts()
        shown = frequencies.iloc[:top_n].index
        sns.countplot(y=feature, data=df, order=shown, ax=panel, palette="viridis")
        panel.set_title(f"Count Plot of {feature}", fontsize=12, pad=10)
        panel.set_xlabel("")
        panel.set_ylabel("")

    # hide the grid cells that received no plot
    for spare in panels[n_plots:]:
        spare.set_visible(False)

    plt.tight_layout()
    plt.show()


def summary_stats(
        df: pd.DataFrame,
        cat_cols: List[str],
        num_cols: List[str]):
    """
    Print ``describe()`` tables for the numeric and categorical columns.

    The numeric section is printed first, then the categorical one. A
    section whose column list is empty prints a short "not found" message
    instead of a table.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the data.
    cat_cols : List[str]
        Names of the categorical columns.
    num_cols : List[str]
        Names of the numeric columns.
    """
    sections = (
        (num_cols, "\n=== Numerical Features Summary ===", "\nNo numerical columns found."),
        (cat_cols, "\n=== Categorical Features Summary ===", "\nNo categorical columns found."),
    )

    for columns, heading, fallback in sections:
        if not columns:
            print(fallback)
            continue
        print(heading)
        print(df[columns].describe())


def correlation_matrix(df: pd.DataFrame,
                       figsize: Tuple[int, int] = CORR_FIGSIZE):
    """
    Show an annotated heatmap of the correlations between numeric columns.

    Non-numeric columns are ignored. If the numeric selection is empty, a
    message is printed and no figure is created.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the data.
    figsize : Tuple[int, int]
        Size of the figure handed to ``plt.figure``.
    """
    numeric_only = df.select_dtypes(include=[np.number])
    if numeric_only.empty:
        print("No numeric columns for correlation matrix.")
        return

    heatmap_options = {
        "cmap": HEATMAP_CMAP,
        "annot": True,
        "fmt": ".2f",
        "linewidths": 0.5,
    }

    plt.figure(figsize=figsize)
    sns.heatmap(numeric_only.corr(), **heatmap_options)
    plt.title("Correlation Matrix", fontsize=14, pad=15)
    plt.tight_layout()
    plt.show()


def missing_values_report(df: pd.DataFrame):
    """
    Summarise missing values per column and print the affected-row total.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with the columns ``"Missing"`` (count of
        null values) and ``"Percent"`` (share of rows that are null, 0-100),
        sorted by ``"Percent"`` in descending order. For a frame with no rows
        every percentage is reported as 0.0.
    """
    # Compute the null mask once; it feeds both the per-column counts and the
    # per-row "any missing" total.
    null_mask = df.isnull()
    n_rows = len(df)

    missing = null_mask.sum()
    if n_rows:
        percent = (missing / n_rows) * 100
    else:
        # No rows: avoid dividing by zero and report 0% for every column.
        percent = missing * 0.0

    report = pd.DataFrame({"Missing": missing, "Percent": percent}).sort_values(by="Percent", ascending=False)

    total_missing_rows = int(null_mask.any(axis=1).sum())
    row_percent = total_missing_rows / n_rows * 100 if n_rows else 0.0
    # The format spec must be ".2f"; the previous ".2f5" was not a valid
    # specifier and raised ValueError on every call.
    print(f"\nTotal rows with at least one missing value: {total_missing_rows} ({row_percent:.2f}%)")
    return report


def run_eda(
    df: pd.DataFrame,
    clean_str: bool = True,
    show_missing: bool = True,
):
    """
    Run the whole EDA sequence on ``df`` and print/plot the results.

    Steps, in order: apply the plot style, optionally strip strings (on a
    copy, so the caller's frame is not modified), split columns by type,
    optionally print the missing-value report, then numeric histograms,
    categorical count plots, summary statistics and the correlation matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to analyse.
    clean_str : bool
        When True, ``clean_strings`` is applied before anything else.
    show_missing : bool
        When True, the missing-value report is printed.
    """
    _setup_style()

    frame = clean_strings(df, inplace=False) if clean_str else df
    cat_cols, num_cols = split_features(frame)

    if show_missing:
        print("\n=== Missing Values Report ===")
        report = missing_values_report(frame)
        print(report)

    # Each stage is a banner followed by the call that produces its output.
    stages = (
        ("\n=== Numerical Histograms ===", lambda: plot_numerical(frame, num_cols)),
        ("\n=== Categorical Count-Plots ===", lambda: plot_categorical(frame, cat_cols)),
        ("\n=== Summary Statistics ===", lambda: summary_stats(frame, cat_cols, num_cols)),
        ("\n=== Correlation Matrix ===", lambda: correlation_matrix(frame)),
    )
    for banner, step in stages:
        print(banner)
        step()

    print("\nEDA complete!")

In [None]:
# Run the full EDA pipeline with its default options.
# NOTE(review): `df` is not defined anywhere in the visible source -- presumably
# it is loaded in another cell; confirm before running this one.
run_eda(df)