## 📦 Setup

In [None]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Read the feature table with polars, then convert it to pandas for plotting
features_path = "features_parquet/tickers_data/features_all_tickers_timeseries.parquet"
df = pl.read_parquet(features_path)
pdf = df.to_pandas()

# Sanity-check the structure: dimensions, column names, distinct tickers
print("✅ Shape:", pdf.shape)
column_names = pdf.columns.tolist()
print("🧾 Columns:", column_names)
ticker_names = sorted(pdf['ticker'].dropna().unique().tolist())
print("🆔 Unique tickers:", ticker_names)
pdf.head()


### 🧼 Missing Data Heatmap

In [None]:
# One heatmap cell per (row, column) of the table, coloured by whether the
# value is null.
null_mask = pdf.isnull()
plt.figure(figsize=(14, 6))
sns.heatmap(data=null_mask, cmap="Reds", cbar=False)
plt.title("🧯 Missing Values (Rows x Columns)")
plt.tight_layout()
plt.show()

### 🧮 Define Metric Groups

In [None]:
# Feature columns grouped by theme. Keys are used in chart titles; values are
# the column names plotted together.
metric_groups = {
    "Growth": [
        "eps_cagr_3y",
        "fcf_cagr_3y",
        "dividend_cagr_3y",
        "dividend_cagr_5y",
    ],
    "Returns & Volatility": [
        "6m_return",
        "12m_return",
        "sector_relative_6m",
        "volatility",
        "max_drawdown_1y",
    ],
    "Fundamentals": [
        "pe_ratio",
        "pfcf_ratio",
        "payout_ratio",
        "dividend_yield",
        "yield_vs_5y_median",
        "net_debt_to_ebitda",
        "ebit_interest_cover",
        "ebit_interest_cover_capped",
    ],
}

def clean(df, cols):
    """Return the complete, finite rows of ``df`` restricted to ``cols``.

    Infinite values are treated as missing, then every row with a missing
    value in any selected column is dropped. ``df`` itself is not modified.
    """
    selected = df[cols]
    without_inf = selected.replace(to_replace=[np.inf, -np.inf], value=np.nan)
    return without_inf.dropna(axis=0, how="any")

### 📊 Histograms

In [None]:
# One histogram figure per metric group, one panel per metric.
for name, cols in metric_groups.items():
    df_group = clean(pdf, cols)
    if df_group.empty:
        # No row is complete across the whole group (or the group lists no
        # columns), so there is nothing to bin.
        print(f"⚠️ No valid data for: {name}")
        continue

    # Drive both the panel grid and the figure size from the same numbers.
    # Without an explicit layout, hist() picks its own grid, which does not
    # match a figure sized for up to three panels per row.
    n_cols = min(3, len(cols))
    n_rows = (len(cols) - 1) // n_cols + 1  # ceiling division
    df_group.hist(
        bins=30,
        layout=(n_rows, n_cols),
        figsize=(5 * n_cols, 4 * n_rows),
        edgecolor='black',
        color='skyblue',
    )
    plt.suptitle(f"📊 {name} - Histogram", fontsize=14)
    plt.tight_layout()
    plt.show()

### 📈 Boxplots

In [None]:
# One boxplot figure per metric group, laid out up to three panels per row.
for name, cols in metric_groups.items():
    df_group = clean(pdf, cols)
    if df_group.empty:
        # No row is complete across the whole group (or the group lists no
        # columns), so there is nothing to draw.
        print(f"⚠️ No valid data for: {name}")
        continue

    # Compute the grid once so the figure height always matches the number of
    # subplot rows (the two previously disagreed when len(cols) was a
    # multiple of three).
    n_cols = min(3, len(cols))
    n_rows = (len(cols) - 1) // 3 + 1  # ceiling of len(cols) / 3
    plt.figure(figsize=(6 * n_cols, 4 * n_rows))
    for position, col in enumerate(cols, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(x=df_group[col], color="lightblue")
        plt.title(col)
    plt.suptitle(f"📈 {name} - Boxplots", fontsize=14, y=1.02)
    plt.tight_layout()
    plt.show()

### 📌 Correlation Heatmaps

In [None]:
def clean_numeric(df, cols):
    """Select ``cols`` from ``df`` and keep only rows that are fully finite.

    +/-inf are converted to NaN first; any row containing NaN in the selected
    columns is then removed. A new frame is returned.
    """
    return (
        df[cols]
        .replace(to_replace=[np.inf, -np.inf], value=np.nan)
        .dropna(axis=0, how="any")
    )

# One annotated correlation matrix per metric group.
for group_name, cols in metric_groups.items():
    sub_df = clean_numeric(pdf, cols)
    if sub_df.shape[1] <= 1:
        # A single column has nothing to correlate against.
        continue
    if len(sub_df) < 2:
        # With fewer than two complete rows every coefficient is NaN and the
        # heatmap would be blank.
        print(f"⚠️ Not enough complete rows for correlation: {group_name}")
        continue

    plt.figure(figsize=(6, 5))
    sns.heatmap(
        sub_df.corr(), annot=True, fmt=".2f", cmap="coolwarm",
        square=True, cbar=True, annot_kws={"size": 8}
    )
    plt.title(f"Correlation Heatmap: {group_name}")
    plt.tight_layout()
    plt.show()

### 🕸 Radar Chart (Min-Max Normalized Per Ticker Batch)

In [None]:
def plot_radar(df, tickers, features, title="Radar Chart"):
    """Draw a radar chart comparing ``tickers`` across ``features``.

    Each feature is min-max scaled to [0, 1] using only the rows being
    plotted, so shapes are comparable within one chart but not across charts.

    Args:
        df: DataFrame with a "ticker" column and one column per feature.
            It is not modified.
        tickers: Ticker labels to include.
        features: Column names used as the radar axes.
        title: Chart title; also used in the skip messages.

    Returns:
        None. Prints a warning and returns without plotting when no complete
        rows remain or when fewer than two features vary across the rows.
    """
    features = list(features)

    # Narrow to the requested tickers and feature columns before cleaning, so
    # the steps below never touch unrelated columns or rows.
    data = df.loc[df["ticker"].isin(tickers), ["ticker"] + features].copy()
    data = data.set_index("ticker")

    # Coerce to numeric first (non-numeric entries become NaN), then treat
    # infinities as missing as well.
    for f in features:
        data[f] = pd.to_numeric(data[f], errors="coerce")
    data = data.replace([np.inf, -np.inf], np.nan)

    # Drop incomplete rows
    data = data.dropna(subset=features)
    if data.empty:
        print(f"⚠️ No valid data for: {title}")
        return

    # A radar polygon needs exactly one row per ticker. When a ticker appears
    # several times, keep the last complete row in the frame's existing order.
    # NOTE(review): presumably rows are chronological so "last" is the most
    # recent observation — sort the frame before calling if that matters.
    data = data[~data.index.duplicated(keep="last")]

    # Remove constant-value features (useless in radar, and they would divide
    # by zero in the scaling below)
    variable_features = [f for f in features if data[f].nunique(dropna=True) > 1]
    if len(variable_features) < 2:
        print(f"⚠️ Not enough variable features for radar: {title}")
        return

    # Min-max normalize each remaining feature across the plotted rows
    lows = data[variable_features].min()
    spans = data[variable_features].max() - lows
    scaled = (data[variable_features] - lows) / spans

    angles = np.linspace(0, 2 * np.pi, len(variable_features), endpoint=False).tolist()
    angles += angles[:1]  # close loop

    fig, ax = plt.subplots(figsize=(7, 7), subplot_kw=dict(polar=True))

    # The index is unique here, so each row yields one value per axis.
    for ticker, row in scaled.iterrows():
        values = row.tolist()
        values += values[:1]  # close loop
        ax.plot(angles, values, label=ticker)
        ax.fill(angles, values, alpha=0.1)

    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)
    ax.set_thetagrids(np.degrees(angles[:-1]), variable_features)
    ax.set_ylim(0, 1)
    ax.set_title(title, size=14)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
    plt.tight_layout()
    plt.show()



# Draw one radar chart per metric group for every batch of tickers.
tickers = sorted(pdf['ticker'].dropna().unique().tolist())
batch_size = 5  # number of tickers drawn on each chart
for group_name, features in metric_groups.items():
    for start in range(0, len(tickers), batch_size):
        batch = tickers[start:start + batch_size]
        # plot_radar builds its own working frame and never writes to its
        # input, so pdf is passed directly rather than deep-copied per call.
        plot_radar(pdf, batch, features, title=f"{group_name}: {', '.join(batch)}")


### 📉 Raw Feature Values per Ticker

In [None]:
def plot_raw(df, tickers, features, title="Raw Feature Values"):
    """Bar-plot the unscaled values of ``features`` for the given ``tickers``.

    Rows containing NaN or +/-inf in any selected column are dropped first.

    Args:
        df: DataFrame with a "ticker" column and one column per feature.
            It is not modified.
        tickers: Ticker labels to include.
        features: Column names to plot as bars.
        title: Chart title; also used in the skip message.

    Returns:
        None. Prints a warning and returns without plotting when no rows
        survive the cleaning step.
    """
    columns = ["ticker"] + list(features)
    df_sub = (
        df.loc[df["ticker"].isin(tickers), columns]
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
        .set_index("ticker")
    )
    if df_sub.empty:
        # DataFrame.plot raises on an empty frame; skip with the same message
        # style plot_radar uses.
        print(f"⚠️ No valid data for: {title}")
        return

    df_sub.plot(kind="bar", figsize=(10, 6))
    plt.title(title)
    plt.ylabel("Raw Value")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Opt-in switch: the raw bar charts are skipped unless this is set to True.
ENABLE_RAW_PLOTS = False

if ENABLE_RAW_PLOTS:
    batch_size = 5  # number of tickers drawn on each chart
    for group_name, features in metric_groups.items():
        for start in range(0, len(tickers), batch_size):
            batch = tickers[start:start + batch_size]
            # plot_raw filters into a new frame and never writes to its input,
            # so pdf is passed directly rather than deep-copied per call.
            plot_raw(pdf, batch, features, title=f"{group_name}: {', '.join(batch)}")