# EDA of Master Features Table

This notebook performs an exploratory data analysis on the `master_features` table
generated by the feature engineering pipeline. The goal is to understand the
characteristics of the features, their distributions, relationships, and how they
relate to the identified market regimes.

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy
import yaml
import warnings

# The notebook is expected to run from a sub-directory of the project, so the
# project root is the parent of the current working directory.
PROJECT_ROOT_PATH = Path.cwd().parent

# Prepend the project root and then its `src/` directory to sys.path, each only
# when it is not already listed.
for _import_root in (PROJECT_ROOT_PATH, PROJECT_ROOT_PATH / "src"):
    _import_root_str = str(_import_root)
    if _import_root_str not in sys.path:
        sys.path.insert(0, _import_root_str)

from regime_predictor_lib.utils.database_manager import DatabaseManager

# Notebook-wide display configuration.

# Ignore the FutureWarning and UserWarning categories for the rest of the session.
for _ignored_category in (FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)

# Apply the seaborn theme first, then override individual matplotlib defaults.
sns.set_theme(style="whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'savefig.dpi': 300,
})

# Show up to 100 columns and use a 1000-character display width for DataFrames.
for _option_name, _option_value in (('display.max_columns', 100), ('display.width', 1000)):
    pd.set_option(_option_name, _option_value)

## 1. Setup & Data Loading

In [None]:
# Input locations: the database file and the optional column-analysis YAML.
DB_NAME = "quant.db"
DB_VOLUME_PATH = PROJECT_ROOT_PATH / "data" / "db" / "volume"
DB_PATH = DB_VOLUME_PATH / DB_NAME
MASTER_FEATURES_TABLE_NAME = "master_features"
COLUMN_ANALYSIS_YAML_PATH = (
    PROJECT_ROOT_PATH / "data" / "processed" / "master_features_column_analysis.yaml"
)

# Output locations: one root report directory with a sub-directory per analysis.
EDA_REPORTS_DIR = PROJECT_ROOT_PATH / "data" / "reports" / "eda" / "master_features"
COV_CORR_DIR = EDA_REPORTS_DIR / "covariance_correlation"
UNIVARIATE_DIR = EDA_REPORTS_DIR / "univariate_analysis"
VINTAGE_LAG_DIR = EDA_REPORTS_DIR / "vintage_lag_analysis"

# Create every report directory up front; existing directories are left alone.
for _report_dir in (EDA_REPORTS_DIR, COV_CORR_DIR, UNIVARIATE_DIR, VINTAGE_LAG_DIR):
    _report_dir.mkdir(parents=True, exist_ok=True)

# Read the whole master-features table, ordered by date, into `mf_df` and index
# it by the parsed `date` column. Any failure is reported and leaves `mf_df` as
# an empty DataFrame, which the later cells test via `mf_df.empty`.
db_manager = DatabaseManager(db_path=DB_PATH)

try:
    engine = db_manager.engine
    query = f"SELECT * FROM {MASTER_FEATURES_TABLE_NAME} ORDER BY date ASC"
    mf_df = pd.read_sql_query(
        sql=sqlalchemy.text(query),
        con=engine,
        parse_dates=['date'],
    ).set_index('date')
except Exception as e:
    print(f"Error loading '{MASTER_FEATURES_TABLE_NAME}' table: {e}")
    mf_df = pd.DataFrame()
else:
    print(f"Successfully loaded '{MASTER_FEATURES_TABLE_NAME}' table. Shape: {mf_df.shape}")

# Optional metadata: `column_analysis` stays None when the YAML file is missing
# or cannot be parsed; in both cases a message is printed instead of raising.
column_analysis = None
if not COLUMN_ANALYSIS_YAML_PATH.exists():
    print(f"Column analysis YAML not found at: {COLUMN_ANALYSIS_YAML_PATH}")
else:
    try:
        with COLUMN_ANALYSIS_YAML_PATH.open('r') as f:
            column_analysis = yaml.safe_load(f)
    except Exception as e:
        print(f"Error loading column analysis YAML: {e}")
    else:
        print(f"Successfully loaded column analysis from: {COLUMN_ANALYSIS_YAML_PATH.name}")

## 2. Initial Overview & Sanity Checks

In [None]:
# First look at the table: head, shape, condensed info, and a full describe()
# (transposed so that each feature is one row) written to CSV.
if mf_df.empty:
    print("Master features DataFrame is empty. Cannot perform EDA.")
else:
    print("--- Head (first 5 rows) ---")
    display(mf_df.head())
    print(f"\nDataFrame shape: {mf_df.shape}")

    print("\n--- Info (condensed) ---")
    mf_df.info(verbose=False)

    print("\n--- Saving full .describe() to CSV ---")
    describe_csv_path = EDA_REPORTS_DIR / "master_features_describe_full.csv"
    describe_df = mf_df.describe(include='all').transpose()
    describe_df.to_csv(describe_csv_path)
    print(f"Full describe saved to: {describe_csv_path}")

    print("Sample of describe:")
    display(describe_df.head())

In [None]:
# Per-column missing-value summary, plus the numeric / object column lists
# (`numeric_cols`, `object_cols`) that later cells rely on.
if not mf_df.empty:
    # Count NaNs once and derive the percentage from the same Series so both
    # columns share one index; then sort the assembled frame explicitly.
    # Sorting the two Series independently and letting the DataFrame constructor
    # align them can silently fall back to alphabetical order when tied columns
    # come out in a different order, which would make the "Top 20" view wrong.
    nan_counts = mf_df.isnull().sum()
    nan_df_summary = pd.DataFrame({
        'NaN Count': nan_counts,
        'NaN Percentage': nan_counts / len(mf_df) * 100,
    }).sort_values('NaN Count', ascending=False, kind='mergesort')

    # Kept under their original names (still sorted by descending count) in
    # case other cells read them.
    nan_summary_after_imputation = nan_df_summary['NaN Count']
    nan_percentage_after_imputation = nan_df_summary['NaN Percentage']

    nan_df_summary.to_csv(EDA_REPORTS_DIR / "master_features_nan_summary.csv")
    print(f"\nFull NaN summary saved to: {EDA_REPORTS_DIR / 'master_features_nan_summary.csv'}")

    print("\n--- NaN Summary (Top 20 with NaNs) ---")
    display(nan_df_summary[nan_df_summary['NaN Count'] > 0].head(20))

    numeric_cols = mf_df.select_dtypes(include=np.number).columns.tolist()
    object_cols = mf_df.select_dtypes(include='object').columns.tolist()
    print(f"\nIdentified {len(numeric_cols)} numeric columns.")
    print(f"Identified {len(object_cols)} object/categorical columns (sample): {object_cols[:10] if object_cols else 'None'}...")
else:
    print("Master features DataFrame is empty.")

## 3. Target Variable Analysis (`regime_t`, `regime_t_plus_6m`)

In [None]:
# Distribution of the current and future regime labels, followed by the
# empirical transition matrix P(regime_t_plus_6m | regime_t).
target_cols = ['regime_t', 'regime_t_plus_6m']
if not mf_df.empty and all(col in mf_df.columns for col in target_cols):
    panel_titles = {
        'regime_t': 'Distribution of Current Regime (regime_t)',
        'regime_t_plus_6m': 'Distribution of Future Regime (regime_t_plus_6m)',
    }

    # One count plot per target; rows where that target is NaN are excluded.
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    for ax, target_col in zip(axes, target_cols):
        sns.countplot(x=target_col, data=mf_df.dropna(subset=[target_col]), ax=ax, palette='viridis')
        ax.set_title(panel_titles[target_col])
        ax.set_xlabel('Regime State')
        ax.set_ylabel('Count')
    plt.tight_layout()
    plt.savefig(EDA_REPORTS_DIR / "target_variable_distributions.png")
    plt.show()

    # Keep only rows where both labels are known and cast them to int in one
    # chained step. Assigning the cast columns back onto the dropna() result
    # (as done previously) triggers pandas' SettingWithCopyWarning.
    df_for_transition = mf_df[target_cols].dropna().astype(int)
    if not df_for_transition.empty:
        transition_counts = pd.crosstab(df_for_transition['regime_t'], df_for_transition['regime_t_plus_6m'])
        # Row-normalise so that every current-regime row sums to 1.
        transition_probs = transition_counts.div(transition_counts.sum(axis=1), axis=0)

        transition_probs.to_csv(EDA_REPORTS_DIR / "regime_transition_probabilities.csv")
        print(f"Regime transition probabilities saved to: {EDA_REPORTS_DIR / 'regime_transition_probabilities.csv'}")

        plt.figure(figsize=(8, 6))
        sns.heatmap(transition_probs, annot=True, fmt=".2f", cmap="Blues")
        plt.title('Transition Probabilities: regime_t to regime_t_plus_6m')
        plt.xlabel('Regime (t + 6m)')
        plt.ylabel('Regime (t)')
        plt.savefig(EDA_REPORTS_DIR / "regime_transition_heatmap.png")
        plt.show()
    else:
        print("Not enough data to compute transition matrix.")
else:
    print(f"Target columns ({', '.join(target_cols)}) not found or DataFrame is empty.")

## 4. Global Feature Statistics

In [None]:
# Maps a column-name prefix to the display label of its thematic group.
# Labels start with a group number, so several prefixes can belong to the same
# numbered group while keeping distinct labels (e.g. the "7. Macro (...)" entries).
# NOTE: "em_" is itself a prefix of "em_tbill_spread_"; get_theme_for_column
# tries longer prefixes first, so the more specific entry wins.
PREFIX_THEME_MAP_FOR_MASTER = {
    "ti_gspc_": "1. Technicals (S&P500)",
    "vol_": "2. Volatility & Stress",
    "pcr_": "3. Market Internals (PCR)",
    "breadth_": "3. Market Internals (Breadth)",
    "stk_bond_diff_": "4. Intermarket (Stock/Bond Diff)",
    "spy_tlt_ratio_": "4. Intermarket (SPY/TLT Ratio)", 
    "gold_silver_ratio_": "4. Intermarket (Gold/Silver Ratio)",
    "copper_gold_ratio_": "4. Intermarket (Copper/Gold Ratio)",
    "junk_spread_": "5. Credit & Bonds (Junk Spread)",
    "corp_oas_": "5. Credit & Bonds (Corp OAS)",
    "tsy_spread_": "5. Credit & Bonds (Treasury Spread)",
    "em_tbill_spread_": "5. Credit & Bonds (EM/T-Bill Spread)",
    "fg_": "6. Sentiment (Fear/Greed)",
    "conf_": "6. Sentiment (Consumer Conf.)",
    "aaii_": "6. Sentiment (AAII)",
    "finra_": "6. Sentiment (FINRA Margin)",
    "sentconf_": "6. Sentiment (SMCI/DMCI)",
    "nfp_": "7. Macro (NFP)",
    "icj_": "7. Macro (Jobless Claims)",
    "cpi_": "7. Macro (CPI)",
    "retail_": "7. Macro (Retail Sales)",
    "m2_": "7. Macro (M2 Supply)",
    "houst_": "7. Macro (Housing Starts)",
    "hpi_": "7. Macro (Housing Prices)",
    "smi_": "8. Market Structure (SMI)",
    "djt_vs_gspc_": "9. Sector/Micro (DJT/GSPC)", 
    "rut_vs_gspc_": "9. Sector/Micro (RUT/GSPC)",
    "qqq_vs_dju_": "9. Sector/Micro (QQQ/DJU)",
    "xlv_vs_gspc_": "9. Sector/Micro (XLV/GSPC)",
    "dxy_": "10. Global & Currency (DXY)",
    "em_": "10. Global & Currency (EM Equity)",
    "oil_": "10. Global & Currency (Oil)",
    "bdi_": "10. Global & Currency (BDI)",
    "gex_": "11. Derivatives (GEX)",
    "sp500_": "0. S&P500 Base", 
}

def get_theme_for_column(col_name, prefix_map):
    """Return the theme label for a column based on its name prefix.

    Args:
        col_name: Column name to classify.
        prefix_map: Mapping of name prefix -> theme label.

    Returns:
        The label of the longest prefix in ``prefix_map`` that ``col_name``
        starts with. If none matches, ``"Market Regime Info"`` for names
        starting with ``"regime_"``, otherwise ``"Uncategorized"``.
    """
    matching_prefixes = [prefix for prefix in prefix_map if col_name.startswith(prefix)]
    if matching_prefixes:
        # The most specific (longest) prefix decides, e.g. "em_tbill_spread_" over "em_".
        return prefix_map[max(matching_prefixes, key=len)]
    return "Market Regime Info" if col_name.startswith("regime_") else "Uncategorized"

# Group the numeric columns by theme. `themes_for_intra_analysis` keeps only the
# themes with at least two features, the minimum needed for a pairwise matrix.
if mf_df.empty:
    print("mf_df is empty. Cannot categorize features by theme.")
    features_by_theme = {}
    themes_for_intra_analysis = {}
else:
    mf_df_numeric = mf_df[numeric_cols]

    features_by_theme = {}
    for col in mf_df_numeric.columns:
        theme = get_theme_for_column(col, PREFIX_THEME_MAP_FOR_MASTER)
        features_by_theme.setdefault(theme, []).append(col)

    print("Feature counts per identified theme:")
    for theme, cols in features_by_theme.items():
        print(f"- {theme}: {len(cols)} features")

    themes_for_intra_analysis = {}
    for theme, cols in features_by_theme.items():
        if len(cols) >= 2:
            themes_for_intra_analysis[theme] = cols
    print(f"\nThemes with >=2 features for intra-analysis: {len(themes_for_intra_analysis)}")

### 4.05 Intra-Category Covariance & Correlation Matrices
For each thematic group of features, we'll compute and save their internal covariance and correlation matrices.

In [None]:
# Per-theme covariance and correlation matrices. Each theme gets its own
# sub-directory under <COV_CORR_DIR>/by_theme containing two CSVs and two heatmaps.
INTRA_CATEGORY_DIR = COV_CORR_DIR / "by_theme"
INTRA_CATEGORY_DIR.mkdir(parents=True, exist_ok=True)

if not mf_df.empty and themes_for_intra_analysis:
    print(f"\n--- Generating Intra-Category Covariance & Correlation Matrices ---")
    for theme, theme_cols in themes_for_intra_analysis.items():
        print(f"  Processing theme: {theme} ({len(theme_cols)} features)")

        # Filesystem-friendly theme name. Parentheses and periods are stripped
        # too, matching the sanitisation used by the other per-theme cells in
        # this notebook, so a theme always maps to one directory name.
        safe_theme_name = (
            theme.replace("(", "").replace(")", "").replace("/", "_").replace(".", "")
            .replace("&", "and").replace(" ", "_").lower()
        )
        theme_output_dir = INTRA_CATEGORY_DIR / safe_theme_name
        theme_output_dir.mkdir(parents=True, exist_ok=True)

        temp_df_theme = mf_df_numeric[theme_cols].copy()

        # Drop all-NaN columns and (near-)constant columns: their correlation
        # is undefined and would only add NaN rows/columns to the matrices.
        temp_df_theme.dropna(axis=1, how='all', inplace=True)
        theme_variances = temp_df_theme.var(ddof=0)
        theme_cols_with_variance = theme_variances[theme_variances > 1e-9].index.tolist()

        if len(theme_cols_with_variance) < 2:
            print(f"    Skipping theme '{theme}' due to < 2 features with variance.")
            continue

        temp_df_theme_final = temp_df_theme[theme_cols_with_variance]

        cov_matrix_theme = temp_df_theme_final.cov()
        cov_matrix_theme.to_csv(theme_output_dir / f"{safe_theme_name}_covariance_matrix.csv")

        # Scale the figure with the number of features, with a minimum size.
        fig_width = max(10, len(theme_cols_with_variance) * 0.4)
        fig_height = max(8, len(theme_cols_with_variance) * 0.4)
        plt.figure(figsize=(fig_width, fig_height))
        sns.heatmap(cov_matrix_theme, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
        plt.title(f'Covariance Matrix: {theme}', fontsize=16)
        plt.xticks(fontsize=8, rotation=90)
        plt.yticks(fontsize=8, rotation=0)
        plt.tight_layout()
        plt.savefig(theme_output_dir / f"{safe_theme_name}_covariance_heatmap.png")
        plt.close()  # saved to disk only; avoid accumulating open figures

        corr_matrix_theme = temp_df_theme_final.corr()
        corr_matrix_theme.to_csv(theme_output_dir / f"{safe_theme_name}_correlation_matrix.csv")

        plt.figure(figsize=(fig_width, fig_height))
        # Fixed [-1, 1] colour scale centred on 0 so heatmaps are comparable across themes.
        sns.heatmap(corr_matrix_theme, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1, fmt=".2f", linewidths=.5)
        plt.title(f'Correlation Matrix: {theme}', fontsize=16)
        plt.xticks(fontsize=8, rotation=90)
        plt.yticks(fontsize=8, rotation=0)
        plt.tight_layout()
        plt.savefig(theme_output_dir / f"{safe_theme_name}_correlation_heatmap.png")
        plt.close()

    print("Intra-category matrices and heatmaps saved.")
else:
    print("DataFrame is empty or no themes identified for intra-category analysis.")

### 4.1 Covariance Matrix


In [None]:
# Covariance matrix over every numeric column that has data and non-negligible
# variance. `cols_with_variance` is also read by the correlation cells.
if mf_df.empty or not numeric_cols:
    print("No numeric columns available or DataFrame is empty for covariance matrix.")
else:
    print(f"Calculating Covariance Matrix for ALL {len(numeric_cols)} numeric features...")

    # Remove all-NaN columns first, then keep columns whose population
    # variance exceeds 1e-9.
    temp_df_for_cov = mf_df[numeric_cols].dropna(axis=1, how='all')
    variances = temp_df_for_cov.var(ddof=0)
    cols_with_variance = variances.loc[variances.gt(1e-9)].index.tolist()

    if cols_with_variance:
        temp_df_for_cov_final = temp_df_for_cov[cols_with_variance]
        print(f"Using {len(cols_with_variance)} columns for covariance matrix.")

        full_cov_csv_path = COV_CORR_DIR / "full_covariance_matrix.csv"
        covariance_matrix_full = temp_df_for_cov_final.cov()
        covariance_matrix_full.to_csv(full_cov_csv_path)
        print(f"Full covariance matrix saved to: {full_cov_csv_path}")

        # Figure grows with the number of columns, with a 20x18 minimum.
        scaled_side = len(cols_with_variance) * 0.25
        fig_width = max(20, scaled_side)
        fig_height = max(18, scaled_side)

        full_cov_png_path = COV_CORR_DIR / "covariance_matrix_heatmap_full.png"
        plt.figure(figsize=(fig_width, fig_height))
        sns.heatmap(covariance_matrix_full, annot=False, cmap='coolwarm', fmt=".1f")
        plt.title('Covariance Matrix of Numerical Features', fontsize=20)
        plt.xticks(fontsize=6, rotation=90)
        plt.yticks(fontsize=6, rotation=0)
        plt.tight_layout()
        plt.savefig(full_cov_png_path, dpi=300)
        plt.close()
        print(f"Covariance matrix heatmap saved to: {full_cov_png_path}")

        print("\n--- Top 50 Highest Absolute Covariance Pairs (excluding self-covariance) ---")
        cov_unstacked = covariance_matrix_full.unstack()
        # Keeping only pairs whose first name sorts before the second drops the
        # diagonal and one of the two mirrored entries of each pair.
        first_names = cov_unstacked.index.get_level_values(0)
        second_names = cov_unstacked.index.get_level_values(1)
        cov_unstacked_sorted_abs = (
            cov_unstacked[first_names < second_names].abs().sort_values(ascending=False)
        )
        # Rank by magnitude, but report the signed covariance values.
        top_pair_index = cov_unstacked_sorted_abs.head(50).index
        top_abs_cov_pairs_signed = cov_unstacked.loc[top_pair_index]
        top_abs_cov_pairs_signed.to_csv(COV_CORR_DIR / "top_absolute_covariance_pairs.csv", header=['covariance'])
        display(top_abs_cov_pairs_signed.head(20))
    else:
        print("No numeric columns with sufficient variance found for covariance matrix.")

### 4.2 Correlation Matrix

In [None]:
# Correlation matrix over the columns selected by the covariance cell
# (`cols_with_variance`), plus the most positively / negatively correlated pairs.
if mf_df.empty or not numeric_cols:
    print("No numeric columns available or DataFrame is empty for correlation matrix.")
else:
    print(f"\nCalculating Correlation Matrix for ALL {len(numeric_cols)} numeric features...")
    # `cols_with_variance` only exists if the covariance cell has been run.
    if not ('cols_with_variance' in locals() and cols_with_variance):
        print("No numeric columns with sufficient variance found for correlation matrix.")
    else:
        temp_df_for_corr = mf_df[cols_with_variance]
        full_corr_csv_path = COV_CORR_DIR / "full_correlation_matrix.csv"
        correlation_matrix_full = temp_df_for_corr.corr()
        correlation_matrix_full.to_csv(full_corr_csv_path)
        print(f"Full correlation matrix saved to: {full_corr_csv_path}")

        # Figure grows with the number of columns, with a 20x18 minimum.
        scaled_side = len(cols_with_variance) * 0.25
        fig_width = max(20, scaled_side)
        fig_height = max(18, scaled_side)

        full_corr_png_path = COV_CORR_DIR / "correlation_matrix_heatmap_full.png"
        plt.figure(figsize=(fig_width, fig_height))
        sns.heatmap(correlation_matrix_full, annot=False, cmap='coolwarm', center=0, vmin=-1, vmax=1, fmt=".1f")
        plt.title('Correlation Matrix of Numerical Features', fontsize=20)
        plt.xticks(fontsize=6, rotation=90)
        plt.yticks(fontsize=6, rotation=0)
        plt.tight_layout()
        plt.savefig(full_corr_png_path, dpi=300)
        plt.close()
        print(f"Correlation matrix heatmap saved to: {full_corr_png_path}")

        print("\n--- Top 50 Most Positively Correlated Pairs (excluding self-correlation) ---")
        corr_unstacked = correlation_matrix_full.unstack()
        # Keeping only pairs whose first name sorts before the second drops the
        # diagonal and one of the two mirrored entries of each pair.
        first_names = corr_unstacked.index.get_level_values(0)
        second_names = corr_unstacked.index.get_level_values(1)
        corr_unstacked_filtered = corr_unstacked[first_names < second_names]

        top_pos_corr = corr_unstacked_filtered.sort_values(ascending=False).head(50)
        top_pos_corr.to_csv(COV_CORR_DIR / "top_positive_correlation_pairs.csv", header=['correlation'])
        display(top_pos_corr.head(20))

        print("\n--- Top 50 Most Negatively Correlated Pairs ---")
        top_neg_corr = corr_unstacked_filtered.sort_values(ascending=True).head(50)
        top_neg_corr.to_csv(COV_CORR_DIR / "top_negative_correlation_pairs.csv", header=['correlation'])
        display(top_neg_corr.head(20))

### 4.3 Missing Data Patterns (Post-Imputation)

In [None]:
# Heatmap of the NaN mask over the whole table (rows in time order, one column
# per feature), written to disk only.
if mf_df.empty:
    print("DataFrame is empty, skipping missing data pattern visualization.")
else:
    n_rows, n_cols = mf_df.shape
    # Width grows with the column count (minimum 20); height grows with the row
    # count but is clamped to the range [10, 20].
    fig_width_missing = max(20, n_cols * 0.1)
    fig_height_missing = min(max(10, n_rows * 0.01), 20)

    missing_heatmap_path = EDA_REPORTS_DIR / "missing_data_heatmap_full.png"
    plt.figure(figsize=(fig_width_missing, fig_height_missing))
    sns.heatmap(mf_df.isnull(), cbar=False, cmap='viridis', yticklabels=False)
    plt.title('Missing Data Pattern in Master Features (Yellow=Missing)', fontsize=16)
    plt.ylabel('Date (Time Series Order)')
    plt.xlabel('Features')
    plt.xticks(fontsize=5, rotation=90)
    plt.tight_layout()
    plt.savefig(missing_heatmap_path, dpi=300)
    plt.close()
    print(f"Full missing data heatmap saved to: {missing_heatmap_path}")

### 4.4 Inter-Category Average Absolute Correlation Matrix
This matrix will show the average absolute correlation *between* features of different thematic groups.
Diagonal elements represent the average absolute *intra*-category correlation.

In [None]:
# Theme-by-theme matrix of the mean absolute pairwise correlation.
# Off-diagonal cells average |corr| over every feature pair drawn from the two
# themes; diagonal cells average |corr| over the distinct pairs within a theme.
INTER_CATEGORY_SUMMARY_DIR = COV_CORR_DIR / "inter_category_summary"
INTER_CATEGORY_SUMMARY_DIR.mkdir(parents=True, exist_ok=True)

if not mf_df.empty and themes_for_intra_analysis: 
    print(f"\n--- Generating Inter-Category Average Absolute Correlation Matrix ---")
    
    if 'correlation_matrix_full' not in locals() or correlation_matrix_full.empty:
        print("Full correlation matrix not available. Recomputing for inter-category analysis.")
        # The recompute path needs `cols_with_variance` from the covariance
        # cell. Check for it explicitly so that running this cell on its own
        # reports the problem instead of raising NameError.
        if 'cols_with_variance' in locals() and cols_with_variance:
            temp_df_for_corr = mf_df_numeric[cols_with_variance] 
            if not temp_df_for_corr.empty:
                correlation_matrix_full = temp_df_for_corr.corr()
            else:
                print("Cannot compute full correlation matrix. Aborting inter-category analysis.")
                correlation_matrix_full = pd.DataFrame()
        else:
            print("cols_with_variance not defined. Cannot compute full correlation matrix. Aborting inter-category analysis.")
            correlation_matrix_full = pd.DataFrame()


    if not correlation_matrix_full.empty:
        category_names = sorted(list(themes_for_intra_analysis.keys()))
        avg_abs_corr_matrix = pd.DataFrame(index=category_names, columns=category_names, dtype=float)

        for i, theme1_name in enumerate(category_names):
            for j, theme2_name in enumerate(category_names):
                if j < i: 
                    # The matrix is symmetric: reuse the value computed for (theme2, theme1).
                    avg_abs_corr_matrix.loc[theme1_name, theme2_name] = avg_abs_corr_matrix.loc[theme2_name, theme1_name]
                    continue

                # Only columns that made it into the full correlation matrix can be used.
                cols_theme1 = [col for col in themes_for_intra_analysis[theme1_name] if col in correlation_matrix_full.columns]
                cols_theme2 = [col for col in themes_for_intra_analysis[theme2_name] if col in correlation_matrix_full.columns]

                if not cols_theme1 or not cols_theme2:
                    avg_abs_corr_matrix.loc[theme1_name, theme2_name] = np.nan
                    continue
                
                if theme1_name == theme2_name: 
                    if len(cols_theme1) < 2:
                        avg_abs_corr_matrix.loc[theme1_name, theme1_name] = np.nan 
                        continue
                    sub_corr_matrix = correlation_matrix_full.loc[cols_theme1, cols_theme1]
                    # Strict upper triangle: each within-theme pair once, no self-correlations.
                    upper_triangle_mask = np.triu(np.ones(sub_corr_matrix.shape), k=1).astype(bool)
                    relevant_corrs = sub_corr_matrix.where(upper_triangle_mask).stack().abs()
                else: 
                    sub_corr_matrix = correlation_matrix_full.loc[cols_theme1, cols_theme2]
                    relevant_corrs = sub_corr_matrix.stack().abs()
                
                if not relevant_corrs.empty:
                    avg_abs_corr_matrix.loc[theme1_name, theme2_name] = relevant_corrs.mean()
                else:
                    avg_abs_corr_matrix.loc[theme1_name, theme2_name] = np.nan
        
        avg_abs_corr_matrix.to_csv(INTER_CATEGORY_SUMMARY_DIR / "inter_category_avg_abs_correlation.csv")
        print(f"Inter-category average absolute correlation matrix saved to CSV.")

        plt.figure(figsize=(max(12, len(category_names)*0.8), max(10, len(category_names)*0.7)))
        sns.heatmap(avg_abs_corr_matrix.astype(float), annot=True, cmap="viridis", fmt=".2f", linewidths=.5)
        plt.title('Inter-Category Average Absolute Correlation', fontsize=16)
        plt.xticks(fontsize=10, rotation=90)
        plt.yticks(fontsize=10, rotation=0)
        plt.tight_layout()
        plt.savefig(INTER_CATEGORY_SUMMARY_DIR / "inter_category_avg_abs_correlation_heatmap.png")
        plt.close()
        print(f"Inter-category average absolute correlation heatmap saved.")
        
        print("\n--- Inter-Category Average Absolute Correlation Matrix (Sample) ---")
        display(avg_abs_corr_matrix.head())

else:
    print("DataFrame is empty or no themes for analysis, skipping inter-category correlations.")

In [None]:
# Column-name prefix -> thematic group label, used to bucket features by theme.
# The leading number in each label identifies the broader group; prefixes in the
# same group keep their own parenthesised sub-label.
# NOTE: "em_" also matches names starting with "em_tbill_spread_"; the lookup in
# get_theme_for_column checks longer prefixes first so the specific entry wins.
PREFIX_THEME_MAP_FOR_MASTER = {
    "ti_gspc_": "1. Technicals (S&P500)",
    "vol_": "2. Volatility & Stress",
    "pcr_": "3. Market Internals (PCR)",
    "breadth_": "3. Market Internals (Breadth)",
    "stk_bond_diff_": "4. Intermarket (Stock/Bond Diff)",
    "spy_tlt_ratio_": "4. Intermarket (SPY/TLT Ratio)",
    "gold_silver_ratio_": "4. Intermarket (Gold/Silver Ratio)",
    "copper_gold_ratio_": "4. Intermarket (Copper/Gold Ratio)",
    "junk_spread_": "5. Credit & Bonds (Junk Spread)",
    "corp_oas_": "5. Credit & Bonds (Corp OAS)",
    "tsy_spread_": "5. Credit & Bonds (Treasury Spread)",
    "em_tbill_spread_": "5. Credit & Bonds (EM/T-Bill Spread)",
    "fg_": "6. Sentiment (Fear/Greed)",
    "conf_": "6. Sentiment (Consumer Conf.)",
    "aaii_": "6. Sentiment (AAII)",
    "finra_": "6. Sentiment (FINRA Margin)",
    "sentconf_": "6. Sentiment (SMCI/DMCI)",
    "nfp_": "7. Macro (NFP)",
    "icj_": "7. Macro (Jobless Claims)",
    "cpi_": "7. Macro (CPI)",
    "retail_": "7. Macro (Retail Sales)",
    "m2_": "7. Macro (M2 Supply)",
    "houst_": "7. Macro (Housing Starts)",
    "hpi_": "7. Macro (Housing Prices)",
    "smi_": "8. Market Structure (SMI)",
    "djt_vs_gspc_": "9. Sector/Micro (DJT/GSPC)",
    "rut_vs_gspc_": "9. Sector/Micro (RUT/GSPC)",
    "qqq_vs_dju_": "9. Sector/Micro (QQQ/DJU)",
    "xlv_vs_gspc_": "9. Sector/Micro (XLV/GSPC)",
    "dxy_": "10. Global & Currency (DXY)",
    "em_": "10. Global & Currency (EM Equity)",
    "oil_": "10. Global & Currency (Oil)",
    "bdi_": "10. Global & Currency (BDI)",
    "gex_": "11. Derivatives (GEX)", 
    "sp500_": "0. S&P500 Base",
}

def get_theme_for_column(col_name, prefix_map):
    """Classify a column into a theme by its name prefix.

    Prefixes are tried from longest to shortest, so a specific prefix takes
    precedence over a shorter one that also matches.

    Args:
        col_name: Column name to classify.
        prefix_map: Mapping of name prefix -> theme label.

    Returns:
        The mapped theme label for the first matching prefix; otherwise
        ``"Market Regime Info"`` when the name starts with ``"regime_"`` and
        ``"Uncategorized"`` in every other case.
    """
    prefixes_longest_first = sorted(prefix_map, key=len, reverse=True)
    best_prefix = next(
        (candidate for candidate in prefixes_longest_first if col_name.startswith(candidate)),
        None,
    )
    if best_prefix is not None:
        return prefix_map[best_prefix]
    if col_name.startswith("regime_"):
        return "Market Regime Info"
    return "Uncategorized"

# Bucket every numeric column under its theme label, report the bucket sizes,
# and keep the themes with two or more features for pairwise analysis.
if mf_df.empty:
    print("mf_df is empty. Cannot categorize features by theme.")
    features_by_theme = {}
    themes_for_intra_analysis = {}
else:
    mf_df_numeric = mf_df[numeric_cols]

    features_by_theme = {}
    for col in mf_df_numeric.columns:
        theme = get_theme_for_column(col, PREFIX_THEME_MAP_FOR_MASTER)
        features_by_theme[theme] = features_by_theme.get(theme, []) + [col]

    print("Feature counts per identified theme:")
    for theme, cols in features_by_theme.items():
        print(f"- {theme}: {len(cols)} features")

    themes_for_intra_analysis = dict(
        (theme_label, theme_features)
        for theme_label, theme_features in features_by_theme.items()
        if len(theme_features) >= 2
    )
    print(f"\nThemes with >=2 features for intra-analysis: {len(themes_for_intra_analysis)}")

In [None]:
# Per-theme covariance and correlation matrices: for every theme with at least
# two usable features, write both matrices as CSV and as annotated heatmaps into
# <COV_CORR_DIR>/by_theme/<sanitized theme name>/.
INTRA_CATEGORY_DIR = COV_CORR_DIR / "by_theme"
INTRA_CATEGORY_DIR.mkdir(parents=True, exist_ok=True)

# Character-level rewrite used to turn a theme label into a path component:
# "(", ")" and "." are removed, "/" and " " become "_", "&" becomes "and".
_THEME_PATH_TRANSLATION = str.maketrans({
    "(": None,
    ")": None,
    ".": None,
    "/": "_",
    " ": "_",
    "&": "and",
})

if not mf_df.empty and themes_for_intra_analysis:
    print(f"\n--- Generating Intra-Category Covariance & Correlation Matrices ---")
    for theme_name, theme_cols in themes_for_intra_analysis.items():
        print(f"  Processing theme: {theme_name} ({len(theme_cols)} features)")

        sanitized_theme_name_for_path = theme_name.translate(_THEME_PATH_TRANSLATION).lower()

        theme_output_dir = INTRA_CATEGORY_DIR / sanitized_theme_name_for_path
        theme_output_dir.mkdir(parents=True, exist_ok=True)

        # Work on a copy, drop all-NaN columns, then keep only the columns whose
        # population variance exceeds 1e-9.
        temp_df_theme = mf_df_numeric[theme_cols].copy()
        temp_df_theme.dropna(axis=1, how='all', inplace=True)
        theme_variances = temp_df_theme.var(ddof=0)
        theme_cols_with_variance = theme_variances.loc[theme_variances.gt(1e-9)].index.tolist()

        if len(theme_cols_with_variance) < 2:
            print(f"    Skipping theme '{theme_name}' due to < 2 features with variance.")
            continue

        temp_df_theme_final = temp_df_theme[theme_cols_with_variance]

        # Both heatmaps share one figure size that scales with the feature count.
        scaled_side = len(theme_cols_with_variance) * 0.4
        fig_width = max(10, scaled_side)
        fig_height = max(8, scaled_side)

        # --- covariance ---
        cov_matrix_theme = temp_df_theme_final.cov()
        cov_csv_path = theme_output_dir / f"{sanitized_theme_name_for_path}_covariance_matrix.csv"
        cov_png_path = theme_output_dir / f"{sanitized_theme_name_for_path}_covariance_heatmap.png"
        cov_matrix_theme.to_csv(cov_csv_path)

        plt.figure(figsize=(fig_width, fig_height))
        sns.heatmap(cov_matrix_theme, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
        plt.title(f'Covariance Matrix: {theme_name}', fontsize=16)
        plt.xticks(fontsize=8, rotation=90)
        plt.yticks(fontsize=8, rotation=0)
        plt.tight_layout()
        plt.savefig(cov_png_path)
        plt.close()

        # --- correlation ---
        corr_matrix_theme = temp_df_theme_final.corr()
        corr_csv_path = theme_output_dir / f"{sanitized_theme_name_for_path}_correlation_matrix.csv"
        corr_png_path = theme_output_dir / f"{sanitized_theme_name_for_path}_correlation_heatmap.png"
        corr_matrix_theme.to_csv(corr_csv_path)

        plt.figure(figsize=(fig_width, fig_height))
        sns.heatmap(corr_matrix_theme, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1, fmt=".2f", linewidths=.5)
        plt.title(f'Correlation Matrix: {theme_name}', fontsize=16)
        plt.xticks(fontsize=8, rotation=90)
        plt.yticks(fontsize=8, rotation=0)
        plt.tight_layout()
        plt.savefig(corr_png_path)
        plt.close()

    print("Intra-category matrices and heatmaps saved.")
else:
    print("DataFrame is empty or no themes identified for intra-category analysis.")

In [None]:
# Theme-by-theme summary of how strongly feature groups co-move: each cell is
# the mean absolute correlation over the feature pairs of the two themes
# (within-theme pairs, counted once each, on the diagonal).
INTER_CATEGORY_SUMMARY_DIR = COV_CORR_DIR / "inter_category_summary"
INTER_CATEGORY_SUMMARY_DIR.mkdir(parents=True, exist_ok=True)

if mf_df.empty or not themes_for_intra_analysis:
    print("DataFrame is empty or no themes for analysis, skipping inter-category correlations.")
else:
    print(f"\n--- Generating Inter-Category Average Absolute Correlation Matrix ---")

    # Reuse the full correlation matrix when an earlier cell produced it;
    # otherwise try to rebuild it from `cols_with_variance`.
    if 'correlation_matrix_full' not in locals() or correlation_matrix_full.empty:
        print("Full correlation matrix not available. Recomputing for inter-category analysis.")
        if not ('cols_with_variance' in locals() and cols_with_variance):
            print("cols_with_variance not defined. Cannot compute full correlation matrix. Aborting inter-category analysis.")
            correlation_matrix_full = pd.DataFrame()
        else:
            temp_df_for_corr = mf_df_numeric[cols_with_variance]
            if temp_df_for_corr.empty:
                print("Cannot compute full correlation matrix. Aborting inter-category analysis.")
                correlation_matrix_full = pd.DataFrame()
            else:
                correlation_matrix_full = temp_df_for_corr.corr()

    if not correlation_matrix_full.empty:
        category_names_for_matrix = sorted(list(themes_for_intra_analysis.keys()))
        avg_abs_corr_matrix = pd.DataFrame(index=category_names_for_matrix, columns=category_names_for_matrix, dtype=float)

        # Compute the upper triangle (diagonal included) and mirror each value
        # into the lower triangle, since the matrix is symmetric.
        for i, theme1_name in enumerate(category_names_for_matrix):
            cols_theme1 = [col for col in themes_for_intra_analysis[theme1_name] if col in correlation_matrix_full.columns]

            for theme2_name in category_names_for_matrix[i:]:
                cols_theme2 = [col for col in themes_for_intra_analysis[theme2_name] if col in correlation_matrix_full.columns]

                pair_mean = np.nan
                if cols_theme1 and cols_theme2:
                    relevant_corrs = None
                    if theme1_name != theme2_name:
                        sub_corr_matrix = correlation_matrix_full.loc[cols_theme1, cols_theme2]
                        relevant_corrs = sub_corr_matrix.stack().abs()
                    elif len(cols_theme1) >= 2:
                        sub_corr_matrix = correlation_matrix_full.loc[cols_theme1, cols_theme1]
                        # Strict upper triangle: every within-theme pair once, no diagonal.
                        upper_triangle_mask = np.triu(np.ones(sub_corr_matrix.shape), k=1).astype(bool)
                        relevant_corrs = sub_corr_matrix.where(upper_triangle_mask).stack().abs()

                    if relevant_corrs is not None and not relevant_corrs.empty:
                        pair_mean = relevant_corrs.mean()

                avg_abs_corr_matrix.loc[theme1_name, theme2_name] = pair_mean
                avg_abs_corr_matrix.loc[theme2_name, theme1_name] = pair_mean

        avg_abs_corr_matrix.to_csv(INTER_CATEGORY_SUMMARY_DIR / "inter_category_avg_abs_correlation.csv")
        print(f"Inter-category average absolute correlation matrix saved to CSV.")

        n_categories = len(category_names_for_matrix)
        plt.figure(figsize=(max(12, n_categories*0.8), max(10, n_categories*0.7)))
        sns.heatmap(avg_abs_corr_matrix.astype(float), annot=True, cmap="viridis", fmt=".2f", linewidths=.5)
        plt.title('Inter-Category Average Absolute Correlation', fontsize=16)
        plt.xticks(fontsize=10, rotation=90)
        plt.yticks(fontsize=10, rotation=0)
        plt.tight_layout()
        plt.savefig(INTER_CATEGORY_SUMMARY_DIR / "inter_category_avg_abs_correlation_heatmap.png")
        plt.close()
        print(f"Inter-category average absolute correlation heatmap saved.")

        print("\n--- Inter-Category Average Absolute Correlation Matrix (Sample) ---")
        display(avg_abs_corr_matrix.head())

In [None]:
# Univariate analysis: for every numeric column, save a histogram, a time-series
# plot (shaded by regime when `regime_t` has data) and a per-regime boxplot under
# UNIVARIATE_DIR/<sanitized theme>/, then write one CSV of describe() statistics.
if not mf_df.empty and numeric_cols:
    print(f"\n--- Generating Univariate Analysis for ALL {len(numeric_cols)} Numeric Columns ---")

    # Resolve each column's output directory once so the plotting loop does not
    # repeat the theme lookup and the path sanitisation.
    plot_dir_by_col = {}
    for col in numeric_cols:
        theme = get_theme_for_column(col, PREFIX_THEME_MAP_FOR_MASTER)
        sanitized_theme_name_for_path = theme.replace("(", "").replace(")", "").replace("/", "_").replace(".", "").replace("&", "and").replace(" ", "_").lower()
        plot_dir_by_col[col] = UNIVARIATE_DIR / sanitized_theme_name_for_path

    for theme_dir in set(plot_dir_by_col.values()):
        theme_dir.mkdir(parents=True, exist_ok=True)

    # Loop-invariant: whether any regime label exists at all.
    has_regime_data = 'regime_t' in mf_df.columns and not mf_df['regime_t'].dropna().empty

    all_feature_stats = []

    for i, col in enumerate(numeric_cols):
        print(f"Processing univariate plots for: {col} ({i+1}/{len(numeric_cols)})")

        col_plot_dir = plot_dir_by_col[col]

        feature_series = mf_df[col].dropna()
        if feature_series.empty:
            print(f"  Skipping {col} as it contains all NaNs.")
            # Empty named placeholder so the column still appears in the stats CSV.
            all_feature_stats.append(pd.Series(name=col, dtype=float))
            continue

        all_feature_stats.append(feature_series.describe())

        # --- Histogram with KDE overlay ---
        plt.figure(figsize=(8, 5))
        sns.histplot(feature_series, kde=True)
        plt.title(f'Distribution of {col}')
        plt.savefig(col_plot_dir / f"{col}_distribution.png")
        plt.close()

        # --- Time series, shaded by regime ---
        plt.figure(figsize=(15, 4))
        mf_df[col].plot()
        plt.title(f'Time Series of {col}')
        if has_regime_data:
            # Regime labels restricted to the dates where the feature has a value;
            # gaps are filled from neighbouring labels.
            regime_data_aligned = mf_df['regime_t'].reindex(feature_series.index).ffill().bfill()
            if not regime_data_aligned.dropna().empty:
                unique_regimes = sorted(regime_data_aligned.dropna().unique().astype(int))
                # The palette always holds at least len(unique_regimes) colours.
                palette = sns.color_palette("viridis", n_colors=max(3, len(unique_regimes)))
                y_low, y_high = feature_series.min(), feature_series.max()
                for regime_idx, regime_val in enumerate(unique_regimes):
                    in_regime = regime_data_aligned == regime_val
                    if in_regime.any():
                        # x must have the same length as the `where` mask, so use
                        # the full aligned index rather than the regime's subset.
                        plt.fill_between(regime_data_aligned.index, y_low, y_high,
                                         where=in_regime,
                                         color=palette[regime_idx], alpha=0.2, label=f'Regime {int(regime_val)}')
                if unique_regimes:
                    plt.legend(loc='upper left', bbox_to_anchor=(1,1))
        plt.tight_layout()
        plt.savefig(col_plot_dir / f"{col}_timeseries.png")
        plt.close()

        # --- Boxplot per regime (only when at least two regimes have data) ---
        if has_regime_data:
            df_for_boxplot = mf_df[[col, 'regime_t']].dropna()
            if not df_for_boxplot.empty and len(df_for_boxplot['regime_t'].unique()) > 1:
                plt.figure(figsize=(8, 5))
                sns.boxplot(x='regime_t', y=col, data=df_for_boxplot, palette='viridis')
                plt.title(f'{col} by Regime (regime_t)')
                plt.savefig(col_plot_dir / f"{col}_boxplot_regime.png")
                plt.close()

    if all_feature_stats:
        # One column per feature, one row per describe() statistic.
        all_feature_stats_df = pd.concat(all_feature_stats, axis=1)
        all_feature_stats_df.to_csv(UNIVARIATE_DIR / "all_numeric_features_descriptive_stats.csv")
        print(f"\nDescriptive stats for all numeric features saved to: {UNIVARIATE_DIR / 'all_numeric_features_descriptive_stats.csv'}")
else:
    print("DataFrame is empty or no numeric columns, skipping univariate analysis.")

In [None]:
# Theme -> column-name prefixes, listed in display order. Flattened below into
# the prefix -> theme lookup that get_theme_for_column consumes.
_THEME_PREFIX_GROUPS = (
    ("0. S&P500 Base Features", ("sp500_",)),
    ("1. Technical Trend & Momentum", ("ti_gspc_",)),
    ("2. Volatility & Market Stress", ("vol_",)),
    ("3. Market Internals", ("pcr_", "breadth_")),
    (
        "4. Intermarket Relationships",
        (
            "stk_bond_diff_",
            "spy_tlt_ratio_",
            "gold_silver_ratio_",
            "copper_gold_ratio_",
            "djt_vs_gspc_",
            "rut_vs_gspc_",
        ),
    ),
    (
        "5. Credit & Bond Markets",
        ("junk_spread_", "corp_oas_", "tsy_spread_", "em_tbill_spread_"),
    ),
    (
        "6. Sentiment & Behavior",
        ("fg_", "conf_", "aaii_", "finra_", "sentconf_"),
    ),
    (
        "7. Macro Economic Data",
        ("nfp_", "icj_", "cpi_", "retail_", "m2_", "houst_", "hpi_"),
    ),
    ("8. Market Structure & Flows", ("smi_",)),
    ("9. Sector & Micro Tells", ("qqq_vs_dju_", "xlv_vs_gspc_")),
    ("10. Global & Currency", ("dxy_", "em_", "oil_", "bdi_")),
    ("11. Derivatives Metrics", ("gex_",)),
)

# Prefix -> theme label. Note that "em_" is itself a prefix of
# "em_tbill_spread_", so lookups must prefer the longest matching prefix.
PREFIX_THEME_MAP_FOR_MASTER = {
    prefix: theme_label
    for theme_label, prefixes in _THEME_PREFIX_GROUPS
    for prefix in prefixes
}

def get_theme_for_column(col_name, prefix_map):
    """Return the theme label for a column, based on its name prefix.

    Args:
        col_name: Column name to classify.
        prefix_map: Mapping of name prefix -> theme label.

    Returns:
        The label of the longest prefix in ``prefix_map`` that ``col_name``
        starts with. If none matches, ``"Market Regime Info"`` for names
        starting with ``"regime_"``, otherwise ``"Uncategorized"``.
    """
    matching_prefixes = [p for p in prefix_map if col_name.startswith(p)]
    if matching_prefixes:
        # Longest match wins so a specific prefix beats a shorter one it extends.
        return prefix_map[max(matching_prefixes, key=len)]
    return "Market Regime Info" if col_name.startswith("regime_") else "Uncategorized"

# Group the numeric columns of mf_df by theme and pick the themes that have
# enough members (>= 2, excluding "Uncategorized") for intra-theme analysis.
if mf_df.empty:
    print("mf_df is empty. Cannot categorize features by theme.")
    features_by_theme = {}
    themes_for_intra_analysis = {}
else:
    # Fall back to dtype-based detection if an earlier cell did not define it.
    if 'numeric_cols' not in locals():
        numeric_cols = mf_df.select_dtypes(include=np.number).columns.tolist()

    mf_df_numeric = mf_df[numeric_cols]

    features_by_theme = {}
    for col in mf_df_numeric.columns:
        col_theme = get_theme_for_column(col, PREFIX_THEME_MAP_FOR_MASTER)
        features_by_theme.setdefault(col_theme, []).append(col)

    print("Feature counts per identified OVERARCHING theme:")
    for theme, cols in sorted(features_by_theme.items()):
        print(f"- {theme}: {len(cols)} features")

    themes_for_intra_analysis = {
        theme: cols
        for theme, cols in features_by_theme.items()
        if theme != "Uncategorized" and len(cols) >= 2
    }
    print(f"\nOverarching themes with >=2 features for intra-analysis: {len(themes_for_intra_analysis)}")

    # Surface columns that matched no prefix so the map can be extended.
    uncategorized_cols = features_by_theme.get("Uncategorized", [])
    if uncategorized_cols:
        print(f"Note: {len(uncategorized_cols)} features were 'Uncategorized'. Review PREFIX_THEME_MAP_FOR_MASTER if this is not expected.")
        print(f"  Sample uncategorized: {uncategorized_cols[:5]}")

In [None]:
# Per-theme covariance and correlation: one sub-directory per theme under
# COV_CORR_DIR/by_theme, each receiving two CSV matrices and two heatmaps.
INTRA_CATEGORY_DIR = COV_CORR_DIR / "by_theme"
INTRA_CATEGORY_DIR.mkdir(parents=True, exist_ok=True)

if mf_df.empty or not themes_for_intra_analysis:
    print("DataFrame is empty or no themes identified for intra-category analysis.")
else:
    print("\n--- Generating Intra-Category Covariance & Correlation Matrices (Based on Overarching Themes) ---")
    for theme_name, theme_cols in themes_for_intra_analysis.items():
        print(f"  Processing theme: {theme_name} ({len(theme_cols)} features)")

        # Turn the theme label into a filesystem-friendly directory / file stem.
        theme_slug = theme_name
        for old, new in (("(", ""), (")", ""), ("/", "_"), (".", ""), ("&", "and"), (" ", "_")):
            theme_slug = theme_slug.replace(old, new)
        theme_slug = theme_slug.lower()

        theme_output_dir = INTRA_CATEGORY_DIR / theme_slug
        theme_output_dir.mkdir(parents=True, exist_ok=True)

        # Drop all-NaN columns, then keep only columns whose population
        # variance exceeds 1e-9.
        theme_frame = mf_df_numeric[theme_cols].dropna(axis=1, how='all')
        column_variances = theme_frame.var(ddof=0)
        varying_cols = column_variances[column_variances > 1e-9].index.tolist()

        if len(varying_cols) < 2:
            print(f"    Skipping theme '{theme_name}' due to < 2 features with variance.")
            continue

        varying_frame = theme_frame[varying_cols]

        # Figure and tick sizes scale with the number of features plotted.
        n_features = len(varying_cols)
        heatmap_figsize = (max(10, n_features * 0.4), max(8, n_features * 0.4))
        tick_fontsize = max(6, 10 - n_features // 5)

        # Covariance: CSV + heatmap.
        theme_cov = varying_frame.cov()
        theme_cov.to_csv(theme_output_dir / f"{theme_slug}_covariance_matrix.csv")

        plt.figure(figsize=heatmap_figsize)
        sns.heatmap(theme_cov, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
        plt.title(f'Covariance Matrix: {theme_name}', fontsize=16)
        plt.xticks(fontsize=tick_fontsize, rotation=90)
        plt.yticks(fontsize=tick_fontsize, rotation=0)
        plt.tight_layout()
        plt.savefig(theme_output_dir / f"{theme_slug}_covariance_heatmap.png")
        plt.close()

        # Correlation: CSV + heatmap on a fixed [-1, 1] colour scale centred at 0.
        theme_corr = varying_frame.corr()
        theme_corr.to_csv(theme_output_dir / f"{theme_slug}_correlation_matrix.csv")

        plt.figure(figsize=heatmap_figsize)
        sns.heatmap(theme_corr, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1, fmt=".2f", linewidths=.5)
        plt.title(f'Correlation Matrix: {theme_name}', fontsize=16)
        plt.xticks(fontsize=tick_fontsize, rotation=90)
        plt.yticks(fontsize=tick_fontsize, rotation=0)
        plt.tight_layout()
        plt.savefig(theme_output_dir / f"{theme_slug}_correlation_heatmap.png")
        plt.close()

    print("Intra-category matrices and heatmaps saved.")

In [None]:
# Univariate analysis grouped by overarching theme: for every numeric column,
# save a histogram, a time-series plot (shaded by regime when `regime_t` has
# data) and a per-regime boxplot under UNIVARIATE_DIR/<sanitized theme>/, then
# write one CSV of describe() statistics. Plot titles carry the theme label.
if not mf_df.empty and numeric_cols:
    print(f"\n--- Generating Univariate Analysis for ALL {len(numeric_cols)} Numeric Columns (Grouped by Overarching Theme) ---")

    # Resolve each column's theme and output directory once so the plotting
    # loop does not repeat the theme lookup and the path sanitisation.
    theme_by_col = {}
    plot_dir_by_col = {}
    for col in numeric_cols:
        theme = get_theme_for_column(col, PREFIX_THEME_MAP_FOR_MASTER)
        sanitized_theme_name_for_path = theme.replace("(", "").replace(")", "").replace("/", "_").replace(".", "").replace("&", "and").replace(" ", "_").lower()
        theme_by_col[col] = theme
        plot_dir_by_col[col] = UNIVARIATE_DIR / sanitized_theme_name_for_path

    for theme_dir in set(plot_dir_by_col.values()):
        theme_dir.mkdir(parents=True, exist_ok=True)

    # Loop-invariant: whether any regime label exists at all.
    has_regime_data = 'regime_t' in mf_df.columns and not mf_df['regime_t'].dropna().empty

    all_feature_stats = []

    for i, col in enumerate(numeric_cols):
        print(f"Processing univariate plots for: {col} ({i+1}/{len(numeric_cols)})")

        theme = theme_by_col[col]
        col_plot_dir = plot_dir_by_col[col]

        feature_series = mf_df[col].dropna()
        if feature_series.empty:
            print(f"  Skipping {col} as it contains all NaNs.")
            # Empty named placeholder so the column still appears in the stats CSV.
            all_feature_stats.append(pd.Series(name=col, dtype=float))
            continue

        all_feature_stats.append(feature_series.describe())

        # --- Histogram with KDE overlay ---
        plt.figure(figsize=(8, 5))
        sns.histplot(feature_series, kde=True)
        plt.title(f'Distribution of {col}\n(Theme: {theme})')
        plt.tight_layout()
        plt.savefig(col_plot_dir / f"{col}_distribution.png")
        plt.close()

        # --- Time series, shaded by regime ---
        plt.figure(figsize=(15, 4))
        mf_df[col].plot()
        plt.title(f'Time Series of {col}\n(Theme: {theme})')
        if has_regime_data:
            # Regime labels restricted to the dates where the feature has a value;
            # gaps are filled from neighbouring labels.
            regime_data_aligned = mf_df['regime_t'].reindex(feature_series.index).ffill().bfill()
            if not regime_data_aligned.dropna().empty:
                unique_regimes = sorted(regime_data_aligned.dropna().unique().astype(int))
                # The palette always holds at least len(unique_regimes) colours.
                palette = sns.color_palette("viridis", n_colors=max(3, len(unique_regimes)))
                y_low, y_high = feature_series.min(), feature_series.max()
                for regime_idx, regime_val in enumerate(unique_regimes):
                    in_regime = regime_data_aligned == regime_val
                    if in_regime.any():
                        # x must have the same length as the `where` mask, so use
                        # the full aligned index rather than the regime's subset.
                        plt.fill_between(regime_data_aligned.index, y_low, y_high,
                                         where=in_regime,
                                         color=palette[regime_idx], alpha=0.2, label=f'Regime {int(regime_val)}')
                if unique_regimes:
                    plt.legend(loc='upper left', bbox_to_anchor=(1,1))
        plt.tight_layout()
        plt.savefig(col_plot_dir / f"{col}_timeseries.png")
        plt.close()

        # --- Boxplot per regime (only when at least two regimes have data) ---
        if has_regime_data:
            df_for_boxplot = mf_df[[col, 'regime_t']].dropna()
            if not df_for_boxplot.empty and len(df_for_boxplot['regime_t'].unique()) > 1:
                plt.figure(figsize=(8, 5))
                sns.boxplot(x='regime_t', y=col, data=df_for_boxplot, palette='viridis')
                plt.title(f'{col} by Regime (regime_t)\n(Theme: {theme})')
                plt.tight_layout()
                plt.savefig(col_plot_dir / f"{col}_boxplot_regime.png")
                plt.close()

    if all_feature_stats:
        # One column per feature, one row per describe() statistic.
        all_feature_stats_df = pd.concat(all_feature_stats, axis=1)
        all_feature_stats_df.to_csv(UNIVARIATE_DIR / "all_numeric_features_descriptive_stats.csv")
        print(f"\nDescriptive stats for all numeric features saved to: {UNIVARIATE_DIR / 'all_numeric_features_descriptive_stats.csv'}")
else:
    print("DataFrame is empty or no numeric columns, skipping univariate analysis.")

## 7. Analysis of Vintage Data Lags
For each feature that has a `_ref_date` column, compute its data lag in days (row date minus reference date) and plot the distribution of that lag.
Summaries and plots will be saved to `data/reports/eda/master_features/vintage_lag_analysis/`.

In [None]:
# Vintage lag analysis: for each `*_ref_date` column, compute the lag in days
# between the row's index value and the reference date, save a histogram of the
# lags to VINTAGE_LAG_DIR and collect describe() statistics into one CSV.
if not mf_df.empty:
    vintage_ref_date_cols = [col for col in mf_df.columns if col.endswith('_ref_date')]
    print(f"\n--- Analysis of Vintage Data Lags for: {vintage_ref_date_cols} ---")

    all_lag_stats = {}

    for ref_col in vintage_ref_date_cols:
        if not mf_df[ref_col].notna().any():
            print(f"  Column {ref_col} not found or all NaNs, skipping lag analysis.")
            continue

        # Best-effort per column: a failure on one column is reported and must
        # not stop the remaining columns from being analysed.
        try:
            # Unparseable values become NaT and are masked out below.
            feature_ref_dates = pd.to_datetime(mf_df[ref_col], errors='coerce')

            valid_ref_dates_mask = feature_ref_dates.notna()
            # NOTE(review): assumes mf_df.index is datetime-like — confirm against the loading cell.
            lags = (mf_df.index[valid_ref_dates_mask].to_series() - feature_ref_dates[valid_ref_dates_mask]).dt.days
            lags = lags.dropna()

            if lags.empty:
                print(f"  No valid lag data to plot for {ref_col} (all NaNs or empty after processing).")
                continue

            print(f"  Processing lags for {ref_col}...")
            plt.figure(figsize=(10, 4))
            try:
                # At most 50 bins, and never more bins than distinct lag values.
                sns.histplot(lags, kde=False, bins=min(50, lags.nunique()))
                plt.title(f'Distribution of Data Lag for {ref_col} (in days)')
                plt.xlabel('Lag (Days)')
                plt.ylabel('Frequency')
                plt.tight_layout()
                plt.savefig(VINTAGE_LAG_DIR / f"lag_distribution_{ref_col}.png")
            finally:
                # Close the figure even if plotting or saving raised, so a
                # failing column does not leave an open figure behind.
                plt.close()

            lag_desc = lags.describe()
            all_lag_stats[ref_col] = lag_desc
            print(f"    Summary statistics for lag of '{ref_col}':")
            display(lag_desc)
        except Exception as e:
            print(f"  Error processing lags for {ref_col}: {e}")

    if all_lag_stats:
        # One column per reference-date column, one row per describe() statistic.
        all_lag_stats_df = pd.DataFrame(all_lag_stats)
        all_lag_stats_df.to_csv(VINTAGE_LAG_DIR / "all_vintage_lag_summary_stats.csv")
        print(f"\nVintage lag summary stats saved to: {VINTAGE_LAG_DIR / 'all_vintage_lag_summary_stats.csv'}")
else:
    print("DataFrame is empty, skipping vintage data lag analysis.")