# 📈 Integrated Dividend Feature Analysis

This notebook combines visual correlation analysis, regression diagnostics, and model-based feature importance to analyze the impact of various features on dividend-related metrics.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Notebook-wide plotting defaults. The seaborn theme is applied first so the
# explicit figure size set afterwards is the value that remains in effect.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})


In [None]:
# --- Configuration ---
# Parquet inputs: per-ticker feature time series and macro indicators.
FEATURES_PATH = 'features_parquet/tickers_data/features_all_tickers_timeseries.parquet'
MACRO_PATH = 'features_parquet/macro/united_states.parquet'

# Dividend metrics; each one is used as an analysis target further down.
DIVIDEND_FEATURES = ['dividend_yield', 'dividend_cagr_3y', 'dividend_cagr_5y', 'yield_vs_5y_median']

# Columns handled as binary/categorical predictors instead of numeric ones.
BINARY_FEATURES = [
    # sector indicator columns
    'sector_technology',
    'sector_healthcare',
    'sector_utilities',
    'sector_basic_materials',
    'sector_financial_services',
    'sector_consumer_cyclical',
    'sector_real_estate',
    'sector_communication_services',
    'sector_industrials',
    'sector_energy',
    'sector_materials',
    'sector_consumer_defensive',
    # capped / availability flags
    'ebit_interest_cover_capped',
    'has_eps_cagr_3y',
    'has_fcf_cagr_3y',
    'has_dividend_yield',
    'has_dividend_cagr_3y',
    'has_dividend_cagr_5y',
    'has_ebit_interest_cover',
]

# Identifier/key columns plus the targets themselves are never predictors.
EXCLUDE_FEATURES = ['ticker', 'as_of', 'as_of_year', 'country', *DIVIDEND_FEATURES]

print("Loading data...")
df_features = pd.read_parquet(FEATURES_PATH)
df_macro = pd.read_parquet(MACRO_PATH)

# Parse the snapshot date, then derive the calendar year used as a join key.
df_features = df_features.assign(
    as_of=lambda frame: pd.to_datetime(frame['as_of']),
    as_of_year=lambda frame: frame['as_of'].dt.year,
)

# Left join keeps every feature row even when no macro record matches.
# NOTE(review): if the macro table ever holds more than one row per
# (country, as_of_year), this join multiplies feature rows -- confirm upstream.
df_merged = df_features.merge(df_macro, how='left', on=['country', 'as_of_year'])
df_merged = df_merged.drop(columns=['backfilled_year'], errors='ignore')
print("Data loaded and merged successfully.")


In [None]:
# Every merged column that is neither a key/identifier nor a dividend target
# is a candidate predictor; candidates are then split into binary vs. the rest.
all_columns = list(df_merged.columns)
independent_features = [name for name in all_columns if not (name in EXCLUDE_FEATURES)]
categorical_independent_features = [
    name for name in independent_features if name in BINARY_FEATURES
]
numerical_independent_features = [
    name for name in independent_features
    if name not in categorical_independent_features
]

In [None]:
# Predictor groups used for the per-group correlation charts and the model.
# Insertion order is kept: financial metrics, macro indicators, sector flags.
predictor_categories = {}

# Financial metrics are picked out of the numeric predictors by name fragment.
predictor_categories['Financial_Metrics'] = [
    name for name in numerical_independent_features
    if any(fragment in name for fragment in ('return', 'ratio', 'cagr', 'cover'))
]

# Macro indicators are listed explicitly.
predictor_categories['Macro_Economic'] = [
    'gdp_yoy_backfilled',
    'inflation_latest',
    'unemployment_latest',
    'consumption_backfilled',
]

# Sector flags are every merged column carrying the 'sector_' prefix.
predictor_categories['Sector_Features'] = [
    name for name in df_merged.columns if name.startswith('sector_')
]

In [None]:
def _finite_rows(frame):
    """Return *frame* with +/-inf treated as missing and incomplete rows dropped."""
    return frame.replace([np.inf, -np.inf], np.nan).dropna()


def _plot_numeric_relationships(df_analysis, target_feature):
    """Show a scatter plot with a fitted regression line per numeric predictor."""
    for feature in numerical_independent_features:
        if feature not in df_analysis.columns:
            continue
        # A regression line can only be fitted on numeric data.
        if not pd.api.types.is_numeric_dtype(df_analysis[feature]):
            continue
        # Infinite or missing points cannot be drawn or fitted; drop them first
        # so the minimum-points check counts only usable observations.
        plot_df = _finite_rows(df_analysis[[feature, target_feature]])
        if len(plot_df) < 3:
            continue

        plt.figure(figsize=(7, 4))
        sns.scatterplot(x=feature, y=target_feature, data=plot_df, alpha=0.6)
        sns.regplot(
            x=feature, y=target_feature, data=plot_df,
            scatter=False, color='red', line_kws={'alpha': 0.6},
        )
        plt.title(f'{target_feature} vs. {feature}')
        plt.tight_layout()
        plt.show()


def _plot_binary_distributions(df_analysis, target_feature):
    """Show a box plot of the target for each binary/categorical predictor."""
    for feature in categorical_independent_features:
        if feature not in df_analysis.columns:
            continue
        # Cast on a throwaway copy: converting the shared frame in place would
        # change the dtype seen by the correlation and modelling steps.
        plot_df = df_analysis[[feature, target_feature]].copy()
        plot_df[feature] = plot_df[feature].astype('category')

        plt.figure(figsize=(7, 4))
        sns.boxplot(x=feature, y=target_feature, data=plot_df)
        plt.title(f'{target_feature} distribution by {feature}')
        plt.tight_layout()
        plt.show()


def _plot_grouped_correlations(df_analysis, target_feature):
    """Show one horizontal bar chart of target correlations per predictor group."""
    if not predictor_categories:
        return  # plt.subplots cannot create zero columns

    n_groups = len(predictor_categories)
    # squeeze=False always returns a 2-D axes array, even for a single group.
    _, axes = plt.subplots(1, n_groups, figsize=(5 * n_groups, 4), squeeze=False)

    for ax, (group_name, group_features) in zip(axes[0], predictor_categories.items()):
        # dict.fromkeys removes duplicate names while keeping their order.
        valid_feats = [
            f for f in dict.fromkeys(group_features)
            if f != target_feature
            and f in df_analysis.columns
            and pd.api.types.is_numeric_dtype(df_analysis[f])
            and df_analysis[f].notna().sum() > 2
        ]
        corr_vals = pd.Series(dtype=float)
        if valid_feats:
            corr_vals = (
                df_analysis[valid_feats + [target_feature]]
                .replace([np.inf, -np.inf], np.nan)
                .corr()[target_feature]
                .drop(target_feature)
                .dropna()
                .sort_values()
            )
        if corr_vals.empty:
            ax.set_visible(False)
            continue

        colors = ['red' if v < 0 else 'blue' for v in corr_vals]
        ax.barh(corr_vals.index.str.replace('_', ' ').str.title(), corr_vals.values, color=colors)
        ax.set_title(f"{group_name} Correlation")
        ax.axvline(0, color='black', lw=0.8)
        ax.grid(True, linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.show()


def _fit_and_plot_importance(df_analysis, target_feature):
    """Fit a Random Forest on the grouped predictors and plot feature importances."""
    rf_features = [
        f for f in dict.fromkeys(
            name for group in predictor_categories.values() for name in group
        )
        if f != target_feature  # never let the target predict itself
        and f in df_analysis.columns
        and pd.api.types.is_numeric_dtype(df_analysis[f])
    ]
    # Remove infinite and missing values BEFORE the size check so the check
    # reflects the rows that actually reach the model.
    model_data = _finite_rows(df_analysis[rf_features + [target_feature]])
    if not rf_features or len(model_data) < 10:
        print("Not enough data for modeling.")
        return

    X = model_data[rf_features]
    y = model_data[target_feature]

    # oob_score=True provides an out-of-bag R² without a separate hold-out set.
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, oob_score=True)
    rf_model.fit(X, y)
    importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_model.feature_importances_,
    }).sort_values('Importance')

    # Keep a minimum height so a handful of features still yields a usable figure.
    _, ax = plt.subplots(figsize=(8, max(3, 0.3 * len(importance_df))))

    colors = plt.cm.viridis(np.linspace(0, 1, len(importance_df)))
    bars = ax.barh(range(len(importance_df)), importance_df['Importance'], color=colors)

    ax.set_yticks(range(len(importance_df)))
    ax.set_yticklabels([f.replace('_', ' ').title() for f in importance_df['Feature']])
    ax.set_xlabel('Feature Importance (Random Forest)')
    ax.set_title(f'Feature Importance for Predicting {target_feature.replace("_", " ").title()}')
    ax.grid(True, alpha=0.3)

    # Print each importance value just to the right of its bar.
    for bar, importance in zip(bars, importance_df['Importance']):
        ax.text(importance + 0.001, bar.get_y() + bar.get_height() / 2,
                f'{importance:.3f}', ha='left', va='center', fontsize=8)

    plt.tight_layout()
    plt.show()

    # The in-sample score is computed on the training rows and is optimistic;
    # the out-of-bag score is the more honest estimate of predictive power.
    print(f"Model R² Score (in-sample): {r2_score(y, rf_model.predict(X)):.3f}")
    print(f"Model R² Score (out-of-bag): {rf_model.oob_score_:.3f}")


def analyze_feature_impact(target_feature):
    """Run the full visual and model-based analysis for one target column.

    Reads the module-level ``df_merged``, ``numerical_independent_features``,
    ``categorical_independent_features`` and ``predictor_categories``.
    Produces, in order: scatter/regression plots for numeric predictors, box
    plots for binary predictors, per-group correlation bar charts, and a
    Random Forest feature-importance chart with R² scores.

    Args:
        target_feature: Name of the column in ``df_merged`` to analyse.

    Returns:
        None. All output is plotted or printed. The analysis is skipped with a
        printed message when the column is missing or has no usable values.
    """
    if target_feature not in df_merged.columns:
        print(f"Column '{target_feature}' not found. Skipping.")
        return

    # Rows whose target is missing or infinite cannot be plotted or modelled.
    target_values = df_merged[target_feature]
    usable = target_values.notna() & ~target_values.isin([np.inf, -np.inf])
    df_analysis = df_merged[usable]
    if df_analysis.empty:
        print(f"No data available for '{target_feature}'. Skipping.")
        return

    print(f"\n🔍 Analyzing: {target_feature}")
    _plot_numeric_relationships(df_analysis, target_feature)
    _plot_binary_distributions(df_analysis, target_feature)
    _plot_grouped_correlations(df_analysis, target_feature)
    _fit_and_plot_importance(df_analysis, target_feature)

In [None]:
# Run the analysis once per dividend target. A target column absent from the
# merged data is reported and skipped so it cannot abort the remaining targets.
for target in DIVIDEND_FEATURES:
    if target not in df_merged.columns:
        print(f"Column '{target}' not found in merged data. Skipping.")
        continue
    analyze_feature_impact(target)

In [None]:
# 🔥 Pairwise correlation heatmap (DataFrame.corr) using valid numeric features only
correlation_features = DIVIDEND_FEATURES + numerical_independent_features
# Accept any numeric dtype (float32, int32, nullable ints, ...), not only
# float64/int64; boolean columns stay excluded as before.
valid_corr_features = [
    col for col in correlation_features
    if col in df_merged.columns
    and pd.api.types.is_numeric_dtype(df_merged[col])
    and not pd.api.types.is_bool_dtype(df_merged[col])
]

# Treat +/-inf as missing: an infinite value would otherwise count as a valid
# observation and turn every correlation involving that column into NaN.
corr_data = df_merged[valid_corr_features].replace([np.inf, -np.inf], np.nan)

# Keep only columns with ≥5 non-NaN values
valid_counts = corr_data.notna().sum()
selected_features = valid_counts[valid_counts >= 5].index.tolist()

if len(selected_features) >= 2:
    corr_matrix = corr_data[selected_features].corr()
    # Grow the figure with the number of features so annotations stay legible.
    n_selected = len(selected_features)
    plt.figure(figsize=(max(12, 0.6 * n_selected), max(10, 0.5 * n_selected)))
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", center=0, linewidths=0.5)
    plt.title("Correlation Heatmap: Dividend Features vs Numerical Predictors (≥5 valid values)")
    plt.tight_layout()
    plt.show()
else:
    print("⚠️ Too few features with at least 5 non-NaN values for a correlation heatmap.")
