In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# === Load data ===
# Relative path: resolved against the kernel's current working directory.
df = pd.read_csv("allsources_percentiles.csv")

# === Output directory ===
# Destination folder for the files this notebook saves via `output_dir`.
output_dir = "eda_output"
os.makedirs(output_dir, exist_ok=True)

# === Descriptive statistics ===
print("\n📏 Descriptive Statistics:")
# describe() returns one column per variable; transposing puts each variable on
# its own row so the skew/kurtosis Series (indexed by column name) align on it.
summary = df.describe().T
summary["skew"] = df.skew(numeric_only=True)
summary["kurtosis"] = df.kurtosis(numeric_only=True)
summary.to_csv(f"{output_dir}/descriptive_stats.csv")

# Render the table under the header printed above; without this trailing
# expression the header was followed by no output at all.
summary


📏 Descriptive Statistics:


In [4]:
# === Histograms for all numeric columns ===
print("\n📊 Generating histograms...")
# Path separators and characters rejected in file names on common platforms.
# A column called e.g. "a/b" would otherwise make savefig target a
# sub-directory that does not exist.
unsafe_filename_chars = '/\\:*?"<>|'
for col in df.select_dtypes(include=np.number).columns:
    # Only the file name is sanitised; the plot title/label keep the real name.
    safe_col = "".join("_" if ch in unsafe_filename_chars else ch for ch in str(col))
    fig, ax = plt.subplots()
    sns.histplot(df[col].dropna(), kde=True, bins=30, ax=ax)
    ax.set_title(f"Histogram: {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(f"{output_dir}/hist_{safe_col}.png")
    # Close explicitly so figures do not accumulate across loop iterations.
    plt.close(fig)


📊 Generating histograms...


In [5]:
# === Correlation matrix ===
print("\n📈 Generating correlation heatmap...")
# Pairwise correlations between the numeric columns only.
corr = df.corr(numeric_only=True)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    cmap="coolwarm",
    annot=True,   # write each coefficient inside its cell
    fmt=".2f",
    square=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Matrix")
heatmap_fig.tight_layout()
heatmap_fig.savefig(f"{output_dir}/correlation_matrix.png")
plt.close(heatmap_fig)


📈 Generating correlation heatmap...


In [None]:
# === Pairplot of top 5 most skewed features ===
print("\n🔎 Plotting most skewed features...")
# Rank variables by the magnitude of their skew (sign ignored), largest first,
# and keep the names of the first five.
skew_magnitude = summary["skew"].abs()
ranked_by_skew = skew_magnitude.sort_values(ascending=False)
top_skewed = ranked_by_skew.index[:5].tolist()

# Only rows complete across all selected columns are plotted.
pairplot_rows = df[top_skewed].dropna()
sns.pairplot(pairplot_rows, diag_kind="kde")
# savefig/close act on pyplot's current figure.
plt.savefig(f"{output_dir}/pairplot_top_skewed.png")
plt.close()

In [None]:
# === Boxplots for outlier inspection ===
print("\n📦 Generating boxplots for outlier inspection...")
# Path separators and characters rejected in file names on common platforms.
# A column called e.g. "a/b" would otherwise make savefig target a
# sub-directory that does not exist.
unsafe_filename_chars = '/\\:*?"<>|'
# `top_skewed` is the list built by the pairplot cell; run that cell first.
for col in top_skewed:
    # Only the file name is sanitised; the plot title keeps the real name.
    safe_col = "".join("_" if ch in unsafe_filename_chars else ch for ch in str(col))
    fig, ax = plt.subplots()
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot: {col}")
    fig.tight_layout()
    fig.savefig(f"{output_dir}/box_{safe_col}.png")
    # Close explicitly so figures do not accumulate across loop iterations.
    plt.close(fig)

print("\n✅ EDA complete! Results saved to:", output_dir)

In [7]:
# === Multi-panel histogram grid for appendix ===
print("\n📊 Generating histogram grid...")

numeric_cols = df.select_dtypes(include=np.number).columns
n_cols = 4  # 4 plots per row
# Keep at least one row: ceil() yields 0 when there are no numeric columns,
# and plt.subplots rejects a zero-row grid.
n_rows = max(1, int(np.ceil(len(numeric_cols) / n_cols)))

# squeeze=False always returns a 2-D array of Axes, whatever the grid shape.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3), squeeze=False
)
axes = axes.flatten()

# One panel per column; zip stops after the last column, so spare panels in
# the final row are left for the clean-up loop below.
for ax, col in zip(axes, numeric_cols):
    sns.histplot(df[col].dropna(), kde=True, bins=30, ax=ax)
    ax.set_title(col)
    ax.set_xlabel("")
    ax.set_ylabel("")

# Turn off any unused subplots. Slicing by column count avoids depending on a
# loop index that would be undefined if the loop above never ran.
for ax in axes[len(numeric_cols):]:
    ax.axis("off")

fig.tight_layout()
fig.savefig(f"{output_dir}/histogram_grid_appendix.png", dpi=300)
plt.close(fig)
print(f"✅ Saved: {output_dir}/histogram_grid_appendix.png")


📊 Generating histogram grid...
✅ Saved: eda_output/histogram_grid_appendix.png


In [8]:
# === Multi-panel histogram grid for appendix (excluding 'year') ===
print("\n📊 Generating histogram grid (excluding 'year' columns)...")

# Filter out columns containing 'year' (case-insensitive substring match)
numeric_cols = [
    col for col in df.select_dtypes(include=np.number).columns
    if "year" not in col.lower()
]

n_cols = 4  # 4 plots per row
# Keep at least one row: ceil() yields 0 when no column survives the filter,
# and plt.subplots rejects a zero-row grid.
n_rows = max(1, int(np.ceil(len(numeric_cols) / n_cols)))

# squeeze=False always returns a 2-D array of Axes, whatever the grid shape.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(n_cols * 4, n_rows * 3), squeeze=False
)
axes = axes.flatten()

# One panel per column; zip stops after the last column, so spare panels in
# the final row are left for the clean-up loop below.
for ax, col in zip(axes, numeric_cols):
    sns.histplot(df[col].dropna(), kde=True, bins=30, ax=ax)
    ax.set_title(col)
    ax.set_xlabel("")
    ax.set_ylabel("")

# Turn off unused subplots. Slicing by column count avoids depending on a
# loop index that would be undefined if the loop above never ran.
for ax in axes[len(numeric_cols):]:
    ax.axis("off")

fig.tight_layout()
# NOTE(review): this is the same path the all-columns grid cell writes to, so
# running both leaves only this version on disk. Confirm that is intended.
fig.savefig(f"{output_dir}/histogram_grid_appendix.png", dpi=300)
plt.close(fig)
print(f"✅ Saved: {output_dir}/histogram_grid_appendix.png")



📊 Generating histogram grid (excluding 'year' columns)...
✅ Saved: eda_output/histogram_grid_appendix.png


In [9]:
# === Bar charts for skew and kurtosis (excluding 'year') ===
print("\n📊 Plotting skew and kurtosis bar charts...")

# Numeric columns whose name does not contain 'year' (case-insensitive).
numeric_cols = [
    name
    for name in df.select_dtypes(include=np.number).columns
    if "year" not in name.lower()
]

# Per-column shape statistics, indexed by column name.
selected_frame = df[numeric_cols]
skew_vals = selected_frame.skew(numeric_only=True)
kurtosis_vals = selected_frame.kurtosis(numeric_only=True)

# (values, y-axis label, title, output file) for each chart; both charts share
# the same layout, so they are drawn by one loop.
bar_chart_specs = [
    (
        skew_vals,
        "Skewness",
        "Skewness of Numeric Variables (excluding 'year')",
        f"{output_dir}/skewness_barplot.png",
    ),
    (
        kurtosis_vals,
        "Kurtosis",
        "Kurtosis of Numeric Variables (excluding 'year')",
        f"{output_dir}/kurtosis_barplot.png",
    ),
]

for stat_vals, y_label, chart_title, png_path in bar_chart_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=stat_vals.index, y=stat_vals.values, ax=ax)
    plt.xticks(rotation=90)
    ax.set_ylabel(y_label)
    ax.set_title(chart_title)
    # Dashed zero line separates positive from negative values.
    ax.axhline(0, color='gray', linestyle='--')
    fig.tight_layout()
    fig.savefig(png_path, dpi=300)
    plt.close(fig)

print(f"✅ Saved: {output_dir}/skewness_barplot.png and kurtosis_barplot.png")



📊 Plotting skew and kurtosis bar charts...
✅ Saved: eda_output/skewness_barplot.png and kurtosis_barplot.png


In [10]:
print("\n📊 Plotting skew and kurtosis (log y-axis, shortened labels)...")

# Numeric columns whose name does not contain 'year' (case-insensitive).
numeric_frame = df.select_dtypes(include=np.number)
numeric_cols = [name for name in numeric_frame.columns if "year" not in name.lower()]

# Per-column skew and kurtosis, indexed by column name.
selected_frame = df[numeric_cols]
skew_vals = selected_frame.skew(numeric_only=True)
kurt_vals = selected_frame.kurtosis(numeric_only=True)

# Shorten variable names for display
def shorten(name, max_len=12):
    """Return `name` cut down to at most `max_len` characters.

    Names that already fit are returned unchanged. Longer names keep their
    first `max_len - 3` characters followed by "...". When `max_len` is
    below 3 there is no room for the ellipsis, so the name is hard-truncated
    to `max_len` characters (empty string for zero or negative values).

    Args:
        name: The label to shorten.
        max_len: Maximum length of the returned string.

    Returns:
        A string no longer than `max(max_len, 0)` characters.
    """
    if len(name) <= max_len:
        return name
    if max_len < 3:
        # A negative slice bound here would keep almost the whole name and
        # then append "...", producing a result longer than the input.
        return name[:max(max_len, 0)]
    return name[:max_len - 3] + "..."

short_names = [shorten(name) for name in skew_vals.index]
# seaborn draws category k at x = k, so these are the bar positions.
tick_positions = np.arange(len(short_names))

# seaborn treats `x` as a categorical grouping key: if two columns shared the
# same shortened label and that label were used as `x`, they would be merged
# into a single aggregated bar. Each chart is therefore drawn against the full
# column names, and the short labels are applied to the ticks afterwards.

# --- Skewness Plot ---
plt.figure(figsize=(10, 6))
sns.barplot(x=skew_vals.index, y=np.abs(skew_vals.values))  # use absolute skew
plt.yscale('log')
plt.xticks(ticks=tick_positions, labels=short_names, rotation=90)
plt.ylabel("Absolute Skew (log scale)")
plt.title("Skewness of Numeric Variables")
plt.tight_layout()
plt.savefig(f"{output_dir}/skewness_barplot_log.png", dpi=300)
plt.close()

# --- Kurtosis Plot ---
# kurt_vals comes from the same column selection as skew_vals, so the same
# positions and short labels apply.
plt.figure(figsize=(10, 6))
sns.barplot(x=kurt_vals.index, y=np.abs(kurt_vals.values))  # use absolute kurtosis
plt.yscale('log')
plt.xticks(ticks=tick_positions, labels=short_names, rotation=90)
plt.ylabel("Absolute Kurtosis (log scale)")
plt.title("Kurtosis of Numeric Variables")
plt.tight_layout()
plt.savefig(f"{output_dir}/kurtosis_barplot_log.png", dpi=300)
plt.close()

print(f"✅ Saved: {output_dir}/skewness_barplot_log.png and kurtosis_barplot_log.png")


📊 Plotting skew and kurtosis (log y-axis, shortened labels)...
✅ Saved: eda_output/skewness_barplot_log.png and kurtosis_barplot_log.png


In [13]:
print("\n📊 Plotting skew and kurtosis (log y-axis, full variable names)...")

# Numeric columns whose name does not contain 'year' (case-insensitive).
numeric_cols = [
    name
    for name in df.select_dtypes(include=np.number).columns
    if "year" not in name.lower()
]

# Per-column skew and kurtosis, indexed by column name.
stats_frame = df[numeric_cols]
skew_vals = stats_frame.skew(numeric_only=True)
kurt_vals = stats_frame.kurtosis(numeric_only=True)

# Width scales with the number of variables (0.4 per column, never below 10).
fig_size = (max(10, len(numeric_cols) * 0.4), 6)

# (values, y-axis label, title, output file) for each chart.
log_chart_specs = [
    (
        skew_vals,
        "Absolute Skew (log scale)",
        "Skewness of Numeric Variables",
        f"{output_dir}/skewness_barplot_log_fullnames.png",
    ),
    (
        kurt_vals,
        "Absolute Kurtosis (log scale)",
        "Kurtosis of Numeric Variables",
        f"{output_dir}/kurtosis_barplot_log_fullnames.png",
    ),
]

for stat_vals, y_label, chart_title, png_path in log_chart_specs:
    fig, ax = plt.subplots(figsize=fig_size)
    # Magnitudes only: negative values cannot be drawn on a log axis.
    sns.barplot(x=stat_vals.index, y=np.abs(stat_vals.values), ax=ax)
    ax.set_yscale('log')
    plt.xticks(rotation=90)
    ax.set_ylabel(y_label)
    ax.set_title(chart_title)
    fig.tight_layout()
    fig.savefig(png_path, dpi=300)
    plt.close(fig)

print(f"✅ Saved: {output_dir}/skewness_barplot_log_fullnames.png and kurtosis_barplot_log_fullnames.png")


📊 Plotting skew and kurtosis (log y-axis, full variable names)...
✅ Saved: eda_output/skewness_barplot_log_fullnames.png and kurtosis_barplot_log_fullnames.png
