## 📦 Setup

In [None]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Config ---
# Global plot styling shared by every chart in this notebook.
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# Directory that holds one parquet file per ticker, named "<TICKER>.parquet".
FEATURES_DIR = "../features_data/tickers_history"

# One ticker symbol per line; blank lines are ignored. The encoding is pinned
# so the read does not depend on the platform's default locale.
with open("../features_data/tickers/us_tickers_subset_limited.txt", "r", encoding="utf-8") as f:
    TICKERS_TO_LOAD = [line.strip() for line in f if line.strip()]


# --- Load only selected ticker files ---
dfs = []
loaded_tickers = []  # tickers whose parquet file was actually found and read
for ticker in TICKERS_TO_LOAD:
    path = os.path.join(FEATURES_DIR, f"{ticker}.parquet")
    if not os.path.exists(path):
        # Best effort: a missing file is reported and skipped, not fatal.
        print(f"⚠️ File not found for {ticker}: {path}")
        continue
    dfs.append(pl.read_parquet(path))
    loaded_tickers.append(ticker)

if not dfs:
    raise ValueError("❌ No valid ticker files loaded.")

# Stack the per-ticker frames, drop rows without a ticker, and keep both the
# polars frame (`df`) and a pandas copy (`pdf`) for the cells below.
df = pl.concat(dfs, how="vertical_relaxed")
df = df.filter(pl.col("ticker").is_not_null())
pdf = df.to_pandas()

# --- Preview ---
# Report the tickers that were really loaded, not merely requested, so skipped
# files are not silently listed as loaded.
print("✅ Loaded tickers:", loaded_tickers)
print("📦 Data shape:", pdf.shape)
print("🧾 Columns:", pdf.columns.tolist())
print("🆔 Tickers present:", pdf['ticker'].unique())

# Optional preview
display(pdf.head())


### 🧼 Missing Data Heatmap

In [None]:
# Draw the null mask of the whole pandas frame (rows x columns) as a heatmap
# without a colour bar.
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(pdf.isnull(), cmap="Reds", cbar=False, ax=ax)
ax.set_title("🧯 Missing Values (Rows x Columns)")
fig.tight_layout()
plt.show()

### 🧮 Define Metric Groups

In [None]:
# Feature columns grouped by theme. Keys are display names used in plot
# titles; values are the column names plotted together for that theme.
metric_groups = {
    "Growth": [
        "eps_cagr_3y",
        "fcf_cagr_3y",
        "dividend_cagr_3y",
        "dividend_cagr_5y",
    ],
    "Returns & Volatility": [
        "6m_return",
        "12m_return",
        "sector_relative_6m",
        "volatility",
        "max_drawdown_1y",
    ],
    "Fundamentals": [
        "pe_ratio",
        "pfcf_ratio",
        "payout_ratio",
        "dividend_yield",
        "yield_vs_5y_median",
        "net_debt_to_ebitda",
        "ebit_interest_cover",
        "ebit_interest_cover_capped",
    ],
}

def clean(df, cols):
    """Return the ``cols`` columns of ``df``, keeping only complete, finite rows.

    Positive and negative infinity are first converted to NaN; every row that
    then holds a NaN in any selected column is dropped.

    Args:
        df: pandas DataFrame to read from (it is not modified).
        cols: list of column labels to keep.

    Returns:
        A new DataFrame restricted to ``cols`` with incomplete rows removed.
    """
    selected = df[cols]
    without_inf = selected.replace([np.inf, -np.inf], np.nan)
    return without_inf.dropna()

### 📊 Histograms

In [None]:
# One histogram figure per metric group. The grid (at most three panels wide)
# is passed to ``DataFrame.hist`` explicitly so that the figure size
# (5 x 4 inches per panel) matches the layout that is actually drawn; left to
# itself, ``hist`` picks its own grid that the size would not correspond to.
for name, cols in metric_groups.items():
    df_group = clean(pdf, cols)
    n_cols = max(1, min(3, len(cols)))
    n_rows = max(1, (len(cols) + n_cols - 1) // n_cols)  # ceiling division
    df_group.hist(
        bins=30,
        layout=(n_rows, n_cols),
        figsize=(5 * n_cols, 4 * n_rows),
        edgecolor='black',
        color='skyblue',
    )
    plt.suptitle(f"📊 {name} - Histogram", fontsize=14)
    plt.tight_layout()
    plt.show()

### 📈 Boxplots

In [None]:
# One figure of boxplots per metric group, laid out on a grid that is at most
# three panels wide. The grid shape is computed once and used for both the
# figure size and the subplot positions, so the two can never disagree (the
# height previously gained a spare row whenever the column count was a
# multiple of three).
for name, cols in metric_groups.items():
    df_group = clean(pdf, cols)
    n_cols = max(1, min(3, len(cols)))
    n_rows = max(1, (len(cols) + n_cols - 1) // n_cols)  # ceiling division
    plt.figure(figsize=(6 * n_cols, 4 * n_rows))
    for position, col in enumerate(cols, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(x=df_group[col], color="lightblue")
        plt.title(col)
    # y > 1 lifts the overall title above the top row of panel titles.
    plt.suptitle(f"📈 {name} - Boxplots", fontsize=14, y=1.02)
    plt.tight_layout()
    plt.show()

### 📌 Correlation Heatmaps

In [None]:
def clean_numeric(df, cols):
    """Select ``cols`` from ``df`` and drop every row that is not fully finite.

    Infinite values (either sign) are treated as missing: they are replaced by
    NaN, after which rows containing any NaN in the selected columns are
    removed.

    Args:
        df: pandas DataFrame to read from (it is not modified).
        cols: list of column labels to keep.

    Returns:
        A new DataFrame with only ``cols`` and only complete rows.
    """
    non_finite = [np.inf, -np.inf]
    return df[cols].replace(non_finite, np.nan).dropna()

# One annotated correlation heatmap per metric group.
for group_name, cols in metric_groups.items():
    sub_df = clean_numeric(pdf, cols)
    # A correlation matrix needs at least two columns to be informative ...
    if sub_df.shape[1] < 2:
        continue
    # ... and at least two complete rows to be defined at all; otherwise
    # ``corr()`` yields only NaN and the heatmap would be blank.
    if sub_df.shape[0] < 2:
        print(f"⚠️ Not enough complete rows for correlation: {group_name}")
        continue
    plt.figure(figsize=(6, 5))
    sns.heatmap(
        sub_df.corr(), annot=True, fmt=".2f", cmap="coolwarm",
        square=True, cbar=True, annot_kws={"size": 8}
    )
    plt.title(f"Correlation Heatmap: {group_name}")
    plt.tight_layout()
    plt.show()

### 🕸 Radar Chart (Normalized Per Group)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import polars as pl

def plot_radar(df_polars, tickers, features, title="Radar Chart"):
    """Draw a radar chart comparing ``tickers`` across min-max scaled ``features``.

    The polars frame is filtered to the requested tickers, converted to pandas,
    and rows with a NaN or infinite value in any feature are discarded.
    Features with a single distinct value are left out, and the remaining ones
    are scaled to [0, 1] using the min and max over all remaining rows. One
    polygon is drawn per ticker; when a ticker has several rows, the first one
    is used.

    Nothing is drawn (a warning is printed instead) when no valid rows remain
    or when fewer than two features vary.

    Args:
        df_polars: polars DataFrame containing a "ticker" column and ``features``.
        tickers: ticker values to include.
        features: list of feature column names to use as radar axes.
        title: chart title, also used in the warning messages.

    Returns:
        None.
    """
    selected = (
        df_polars
        .filter(pl.col("ticker").is_in(tickers))
        .select(["ticker"] + features)
        .to_pandas()
    )
    selected = selected.replace([np.inf, -np.inf], np.nan).dropna(subset=features)
    if selected.empty:
        print(f"⚠️ No valid data for: {title}")
        return

    # A feature with one distinct value has max == min and cannot be scaled.
    variable_features = [
        name for name in features if selected[name].nunique(dropna=True) > 1
    ]
    if len(variable_features) < 2:
        print(f"⚠️ Not enough variable features for radar: {title}")
        return

    # Min-max scale every kept feature to [0, 1].
    by_ticker = selected.set_index("ticker")[variable_features]
    lows = by_ticker.min()
    highs = by_ticker.max()
    df_norm = (by_ticker - lows) / (highs - lows)

    # One spoke per feature; repeat the first angle so each polygon closes.
    num_vars = len(variable_features)
    spoke_angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles = spoke_angles + spoke_angles[:1]

    fig, ax = plt.subplots(figsize=(7, 7), subplot_kw=dict(polar=True))

    for ticker in df_norm.index.unique():
        row = df_norm.loc[ticker]
        if isinstance(row, pd.DataFrame):
            # Several rows share this ticker: use the first one.
            row = row.iloc[0]
        spoke_values = row.values.tolist()
        values = spoke_values + spoke_values[:1]  # close the polygon
        if len(values) != len(angles):
            print(f"⚠️ Skipping {ticker} due to mismatch (values={len(values)}, angles={len(angles)}).")
            continue
        ax.plot(angles, values, label=ticker)
        ax.fill(angles, values, alpha=0.1)

    # Put the first axis at the top and proceed clockwise.
    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)
    axis_labels = [name.replace("_", " ").title() for name in variable_features]
    ax.set_thetagrids(np.degrees(spoke_angles), axis_labels)
    ax.set_ylim(0, 1)
    ax.set_title(title, size=14)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))
    plt.tight_layout()
    plt.show()


def get_top_complete_tickers(pdf, features, top_n=6):
    """Return the tickers with the most rows that have no nulls in ``features``.

    Args:
        pdf: polars DataFrame (despite the name, the polars API is used here)
            containing a "ticker" column and every column in ``features``.
        features: list of column names that must all be non-null for a row to
            count.
        top_n: maximum number of tickers to return.

    Returns:
        A list of ticker values ordered by descending count of complete rows.
        Tickers with equal counts are ordered by ticker ascending, so the
        result is reproducible between runs.
    """
    # NOTE(review): drop_nulls removes nulls only; NaN/inf floats presumably
    # still count as "complete" here — confirm this is intended.
    complete_rows = pdf.select(["ticker"] + features).drop_nulls()
    counts = (
        complete_rows.group_by("ticker")
        .len()
        # group_by does not guarantee the order of its groups, so ties on the
        # count are broken explicitly on the ticker to keep the top-N stable.
        .sort(["len", "ticker"], descending=[True, False])
    )
    return counts.head(top_n)["ticker"].to_list()

# Metric groups used from this point on (customize as needed). Keys are the
# group names shown in chart titles; values are the feature columns per chart.
metric_groups = {
    "Dividend Strength": [
        "dividend_yield",
        "dividend_cagr_3y",
        "dividend_cagr_5y",
        "yield_vs_5y_median",
    ],
    "Valuation": [
        "pe_ratio",
        "pfcf_ratio",
        "payout_ratio",
    ],
    "Growth": [
        "eps_cagr_3y",
        "fcf_cagr_3y",
        "net_debt_to_ebitda",
        "ebit_interest_cover",
    ],
}

# For every metric group, pick up to six tickers with the most complete rows
# for that group's features and draw them on one radar chart.
for group_name, features in metric_groups.items():
    top_tickers = get_top_complete_tickers(df, features, top_n=6)
    chart_title = f"{group_name}: Top {len(top_tickers)} Tickers"
    plot_radar(df, top_tickers, features, title=chart_title)


### 📉 Raw Feature Values per Ticker

In [None]:
def plot_raw(df, tickers, features, title="Raw Feature Values"):
    """Draw a grouped bar chart of unscaled ``features`` for the given tickers.

    Rows are restricted to ``tickers``. When an "as_of" column is present,
    only the row with the greatest "as_of" is kept per ticker; otherwise a
    warning is printed and all rows are kept. Tickers whose selected row has a
    NaN or infinite value in any feature are dropped. If nothing is left, a
    message is printed and no chart is drawn.

    Args:
        df: pandas DataFrame with a "ticker" column and the ``features``
            columns (it is not modified).
        tickers: ticker values to include.
        features: list of feature column names, one bar series each.
        title: chart title.

    Returns:
        None.
    """
    wanted = df.loc[df["ticker"].isin(tickers)]

    if "as_of" not in wanted.columns:
        print("⚠️ 'as_of' column missing. Skipping latest filter.")
        per_ticker = wanted.set_index("ticker")
    else:
        # Newest first, so the first occurrence of each ticker is its latest row.
        newest_first = wanted.sort_values("as_of", ascending=False)
        latest = newest_first.drop_duplicates("ticker", keep="first")
        per_ticker = latest.set_index("ticker")

    values = per_ticker[features].replace([np.inf, -np.inf], np.nan).dropna()
    if values.empty:
        print(f"⚠️ Skipping empty data for: {tickers}")
        return

    values = values.sort_index()
    ax = values.plot(kind="bar", figsize=(12, 6), colormap="tab10")
    ax.set_title(title)
    ax.set_ylabel("Raw Value")
    ax.set_xlabel("Ticker")
    ax.set_xticklabels(values.index, rotation=0, ha="center")
    # Place the legend outside the axes so it does not cover the bars.
    ax.legend(title="Feature", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()




# Switch for the raw-value bar charts below (one chart per group and batch).
ENABLE_RAW_PLOTS = True

if ENABLE_RAW_PLOTS:
    for group_name, features in metric_groups.items():
        # Walk the requested tickers in batches of five, one chart per batch.
        for i in range(0, len(TICKERS_TO_LOAD), 5):
            batch = TICKERS_TO_LOAD[i:i + 5]
            # plot_raw only reads from the frame (its filtering and sorting
            # produce new objects), so the DataFrame is passed directly
            # instead of being deep-copied for every batch of every group.
            plot_raw(pdf, batch, features, title=f"{group_name}: Raw Feature Comparison")