# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os

# ==========================================
# CONFIGURATION
# ==========================================
# UPDATE!!!: Local Directory Paths (edit these to match your machine)
DATA_DIR = r'data/raw'
OUTPUT_DIR = r'data/processed'
IMG_DIR = r'data/visuals'  # Folder that receives every saved figure

# Inputs (From Phase 2) -- both are read from OUTPUT_DIR
FILE_INDEX = os.path.join(OUTPUT_DIR, 'ERI_Index_Full.csv')
FILE_CONTEXT = os.path.join(OUTPUT_DIR, 'ERI_Phase2_Track2_Context.csv')

# Outputs -- written back into OUTPUT_DIR
FILE_ANALYTICAL_BASE = os.path.join(OUTPUT_DIR, 'ERI_Phase3_Analytical_Base.csv')
FILE_ARCHETYPES = os.path.join(OUTPUT_DIR, 'ERI_Phase3_Sector_Archetypes.csv')

# Ensure directories exist.
# exist_ok=True makes creation idempotent and removes the window between an
# "exists?" check and the create call. If one of these paths already exists
# as a regular file, this now fails here instead of later at write time.
for directory in [OUTPUT_DIR, IMG_DIR]:
    os.makedirs(directory, exist_ok=True)

# Sector Map (Standardizing Names)
# Keys are raw sector labels; values are the standardized names used for
# grouping. Several raw labels intentionally collapse into one standardized
# sector (e.g. 'Retail Trade' and 'Wholesale Trade').
SECTOR_MAP = {
    'Construction': 'Construction', 'Manufacturing': 'Manufacturing',
    'Information & Communications': 'Information & Communications',
    'Food & Beverage Services': 'Accommodation & Food Services',
    'Accommodation': 'Accommodation & Food Services',
    'Retail Trade': 'Wholesale & Retail Trade', 'Wholesale Trade': 'Wholesale & Retail Trade',
    'Financial & Insurance Services': 'Finance & Insurance', 'Real Estate Services': 'Real Estate',
    'Transportation & Storage': 'Transportation & Storage', 'Professional Services': 'Professional Services',
    'Administrative & Support Services': 'Admin & Support Services', 'Education': 'Education & Health',
    'Health & Social Services': 'Education & Health', 'Arts, Entertainment & Recreation': 'Arts & Recreation'
}

# ==========================================
# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":

    # ==========================================
    # 1. DATA INTEGRATION (SINGLE SOURCE OF TRUTH)
    # ==========================================
    print(">> Phase 3: Building Master Dataset...")

    # Both Phase 2 artefacts must be on disk before anything is loaded.
    if not all(os.path.exists(path) for path in (FILE_INDEX, FILE_CONTEXT)):
        raise FileNotFoundError("Input files not found. Please run Phase 2 first.")

    # Targets: parse 'date' and derive the calendar 'year' used as a join key.
    df_index = pd.read_csv(FILE_INDEX)
    parsed_dates = pd.to_datetime(df_index['date'])
    df_index = df_index.assign(date=parsed_dates, year=parsed_dates.dt.year)

    # Predictors: only a 'year' key is added; the raw 'date' column is left as read.
    df_context = pd.read_csv(FILE_CONTEXT)
    df_context = df_context.assign(year=pd.to_datetime(df_context['date']).dt.year)

    # Digital signal: keep only the manually entered IMDA usage rows.
    is_imda = df_context['source_file'] == 'Manual_Input_IMDA_Usage'
    df_digital = df_context.loc[is_imda].copy()

    # Standardize sector names; labels missing from SECTOR_MAP pass through unchanged.
    raw_sector = df_digital['clean_sector']
    df_digital['mapped_sector'] = raw_sector.map(SECTOR_MAP).fillna(raw_sector)

    # One row per (year, sector): the mean of 'value', exposed as Digital_Adoption.
    digital_annual = (
        df_digital.groupby(['year', 'mapped_sector'])['value']
        .mean()
        .reset_index()
        .rename(columns={'value': 'Digital_Adoption'})
    )

    # Left join keeps every index row; Digital_Adoption is NaN where no match exists.
    df_master = df_index.merge(digital_annual, on=['year', 'mapped_sector'], how='left')
    df_master.to_csv(FILE_ANALYTICAL_BASE, index=False)

    print(f"   Master Data Ready: {len(df_master)} rows")
    print(f"   Saved to: {FILE_ANALYTICAL_BASE}")

    # ==========================================
    # 2. SECTOR ARCHETYPES (THE MISSING LINK)
    # ==========================================
    print("\n>> Generating Sector Archetypes for Notebook 04...")

    # Per-sector mean and standard deviation of the V2 score.
    sector_stats = (
        df_master.groupby('mapped_sector')['ERI_Score_V2']
        .agg(['mean', 'std'])
        .reset_index()
    )
    sector_stats.columns = ['mapped_sector', 'eri_mean', 'eri_volatility']

    # Thresholds: the cross-sector medians split sectors into high/low halves.
    mean_threshold = sector_stats['eri_mean'].median()
    vol_threshold = sector_stats['eri_volatility'].median()

    def get_archetype(row):
        """Return the archetype label for one row of ``sector_stats``.

        The row is placed in one of four quadrants by comparing ``eri_mean``
        and ``eri_volatility`` against the cross-sector medians (``>=`` counts
        as "high").

        Rows whose mean or volatility is NaN (for example a sector with a
        single observation, where the standard deviation is undefined) are
        labelled 'Unclassified (Insufficient Data)'. Without this guard every
        NaN comparison is False and such rows would silently be reported as
        'Stagnant (Low Res, Low Vol)' regardless of their mean.
        """
        if pd.isna(row['eri_mean']) or pd.isna(row['eri_volatility']):
            return 'Unclassified (Insufficient Data)'

        high_res = row['eri_mean'] >= mean_threshold
        high_vol = row['eri_volatility'] >= vol_threshold

        if high_res and not high_vol:
            return 'Stalwart (High Res, Low Vol)'
        if high_res and high_vol:
            return 'Cyclical Grower (High Res, High Vol)'
        if high_vol:
            return 'Distressed (Low Res, High Vol)'
        return 'Stagnant (Low Res, Low Vol)'

    sector_stats['archetype'] = sector_stats.apply(get_archetype, axis=1)

    # Save to the specific filename Notebook 04 needs
    sector_stats.to_csv(FILE_ARCHETYPES, index=False)
    print(f"   Saved Archetypes to: {FILE_ARCHETYPES}")

    # ==========================================
    # 3. THE 4x2 VISUALIZATION GRID
    # ==========================================
    print("\n>> Generating Side-by-Side Comparison (4 Rows x 2 Columns)...")

    # One figure, four rows of paired panels: left column = V1, right column = V2.
    fig, axes = plt.subplots(4, 2, figsize=(18, 24))
    plt.subplots_adjust(hspace=0.4, wspace=0.3)
    title_style = {'fontsize': 12, 'fontweight': 'bold'}

    # --- ROW 1: MEDIAN ERI SCORES (Bar Chart) ---
    by_sector = df_master.groupby('mapped_sector')
    rank_v1 = by_sector['ERI_Score_V1'].median().sort_values(ascending=False)
    rank_v2 = by_sector['ERI_Score_V2'].median().sort_values(ascending=False)

    for panel, ranking, shades, heading in (
        (axes[0, 0], rank_v1, "Reds_r", 'Median Score: V1 (Standard)'),
        (axes[0, 1], rank_v2, "Greens_r", 'Median Score: V2 (Robust)'),
    ):
        # hue mirrors y with the legend off, which silences seaborn's palette warning.
        sns.barplot(x=ranking.values, y=ranking.index, palette=shades,
                    hue=ranking.index, legend=False, ax=panel)
        panel.set_title(heading, **title_style)
        panel.set_xlabel('Score')
        # Dashed reference line at a score of 0.5.
        panel.axvline(0.5, color='black', linestyle='--', alpha=0.3)

    # --- ROW 2: CORRELATION MATRIX (Heatmap) ---
    cols_v1 = ['ERI_Score_V1', 'Score_Res_V1', 'Score_Abs_V1', 'Score_Rec_V1']
    cols_v2 = ['ERI_Score_V2', 'Score_Res_V2', 'Score_Abs_V2', 'Score_Rec_V2']
    # Safety check: keep only the columns actually present in df_master.
    available = set(df_master.columns)
    v1_curr = [c for c in cols_v1 if c in available]
    v2_curr = [c for c in cols_v2 if c in available]

    for panel, present, heading in (
        (axes[1, 0], v1_curr, 'Structure: V1 Correlations'),
        (axes[1, 1], v2_curr, 'Structure: V2 Correlations'),
    ):
        sns.heatmap(df_master[present].corr(), annot=True, cmap='RdBu',
                    vmin=-1, vmax=1, fmt=".2f", ax=panel)
        panel.set_title(heading, **title_style)

    # --- ROW 3: DIGITAL ADOPTION vs RESILIENCE (Scatter) ---
    valid_dig = df_master.dropna(subset=['Digital_Adoption'])

    def draw_adoption_panel(panel, score_col, fit_colour):
        """Plot score_col against Digital_Adoption with a regression overlay.

        Returns the correlation between the two columns of ``valid_dig``.
        """
        sns.scatterplot(data=valid_dig, x='Digital_Adoption', y=score_col,
                        hue='mapped_sector', palette='tab20', legend=False, s=80, ax=panel)
        sns.regplot(data=valid_dig, x='Digital_Adoption', y=score_col,
                    scatter=False, color=fit_colour, ax=panel)
        return valid_dig[score_col].corr(valid_dig['Digital_Adoption'])

    # With no digital-adoption data the two row-3 panels are left empty.
    if not valid_dig.empty:
        r_v1 = draw_adoption_panel(axes[2, 0], 'ERI_Score_V1', 'red')
        axes[2, 0].set_title(f"Hypothesis Check: V1 (r={r_v1:.2f})", **title_style)

        r_v2 = draw_adoption_panel(axes[2, 1], 'ERI_Score_V2', 'green')
        axes[2, 1].set_title(f"Hypothesis Check: V2 (r={r_v2:.2f})", **title_style)

    # --- ROW 4: SECTOR PROFILES (Heatmap) ---
    # Median of each available score column per sector.
    dna_v1 = df_master[['mapped_sector', *v1_curr]].groupby('mapped_sector').median()
    dna_v2 = df_master[['mapped_sector', *v2_curr]].groupby('mapped_sector').median()

    for panel, profile, heading in (
        (axes[3, 0], dna_v1, 'Sector DNA: V1 Profile'),
        (axes[3, 1], dna_v2, 'Sector DNA: V2 Profile'),
    ):
        sns.heatmap(profile, annot=True, cmap='YlGn', fmt=".2f", ax=panel, cbar=False)
        panel.set_title(heading, **title_style)
        panel.set_ylabel('')

    plt.tight_layout()
    # Save BEFORE showing
    grid_path = os.path.join(IMG_DIR, '06_Model_Comparison_Grid.png')
    plt.savefig(grid_path, dpi=300)
    plt.show()

    # ==========================================
    # 4. COMPARATIVE AUDIT (The "Duel")
    # ==========================================
    print("\n>> Generating Methodology Divergence & Tech Test...")

    # --- CHART A: METHODOLOGY DIVERGENCE ---
    # Each point is one df_master row: V1 score on x, V2 score on y, coloured by sector.
    fig_div, ax_div = plt.subplots(figsize=(10, 8))
    sns.scatterplot(
        data=df_master, x='ERI_Score_V1', y='ERI_Score_V2',
        hue='mapped_sector', palette='tab20',
        s=100, alpha=0.7, edgecolor='black',
        ax=ax_div,
    )

    # Identity diagonal over the unit square; the region above it is shaded
    # green (V2 > V1) and the region below it red (V2 < V1).
    unit = [0, 1]
    ax_div.plot(unit, unit, 'k--', linewidth=1.5, label='Identity (No Change)')
    ax_div.fill_between(unit, unit, 1, color='green', alpha=0.05, label='V2 Upgrade')
    ax_div.fill_between(unit, 0, unit, color='red', alpha=0.05, label='V2 Penalty')

    ax_div.set_title('Methodology Divergence: Who got an Upgrade?', fontsize=14)
    ax_div.set_xlabel('V1 Score (Standard Deviation)', fontsize=12)
    ax_div.set_ylabel('V2 Score (Semi-Deviation)', fontsize=12)
    # Legend is anchored outside the axes, to the right of the plot area.
    ax_div.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Sector', borderaxespad=0.)
    ax_div.grid(True, linestyle=':', alpha=0.6)

    plt.tight_layout()
    divergence_path = os.path.join(IMG_DIR, '07_Methodology_Divergence.png')
    plt.savefig(divergence_path, dpi=300)
    plt.show()

    # --- CHART B: THE TECH TEST ---
    # V1 vs V2 over time for a single sector, restricted to 2020-01-01..2025-12-31 inclusive.
    target_sector = 'Information & Communications'
    in_sector = df_master['mapped_sector'] == target_sector
    in_window = df_master['date'].between('2020-01-01', '2025-12-31')
    mask_tech = in_sector & in_window
    tech_data = df_master[mask_tech].sort_values('date')

    if tech_data.empty:
        print(f"⚠️ No data found for {target_sector} between 2020-2025.")
    else:
        fig_tech, ax_tech = plt.subplots(figsize=(12, 6))
        ax_tech.plot(tech_data['date'], tech_data['ERI_Score_V1'],
                     color='red', linestyle='--', marker='o', label='V1: Standard')
        ax_tech.plot(tech_data['date'], tech_data['ERI_Score_V2'],
                     color='green', linewidth=2.5, marker='s', label='V2: Robust')

        ax_tech.set_title(f'The Tech Test: {target_sector} (2020-2025)', fontsize=14)
        ax_tech.set_ylabel('Resilience Score', fontsize=12)
        ax_tech.legend(loc='upper left')
        ax_tech.grid(True, alpha=0.3)

        # Major ticks every six months, labelled as year-month.
        ax_tech.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
        ax_tech.xaxis.set_major_locator(mdates.MonthLocator(interval=6))
        plt.xticks(rotation=45)

        plt.tight_layout()
        tech_path = os.path.join(IMG_DIR, '09_Tech_Test_Trend.png')
        plt.savefig(tech_path, dpi=300)
        plt.show()
    
    # --- CHART C: NATIONAL AGGREGATE (Restored) ---
    print(">> Generating National Aggregate Comparison...")

    # Mean of both scores across all df_master rows sharing a date.
    score_cols = ['ERI_Score_V1', 'ERI_Score_V2']
    national_trends = df_master.groupby('date')[score_cols].mean().reset_index()

    fig_nat, ax_nat = plt.subplots(figsize=(12, 6))
    line_specs = (
        ('ERI_Score_V1', {'color': 'red', 'linestyle': '--', 'label': 'V1: Standard (Volatile)'}),
        ('ERI_Score_V2', {'color': 'green', 'linewidth': 2.5, 'label': 'V2: Robust (Stable)'}),
    )
    for column, line_kwargs in line_specs:
        ax_nat.plot(national_trends['date'], national_trends[column], **line_kwargs)

    # NOTE(review): the year range in this title is fixed text, not derived from the data.
    ax_nat.set_title('Singapore National Economic Resilience: V1 vs V2 (2014-2024)', fontsize=14)
    ax_nat.legend()
    ax_nat.grid(True, linestyle='--', alpha=0.5)

    plt.tight_layout()
    national_path = os.path.join(IMG_DIR, '08_National_Aggregate_Comparison.png')
    plt.savefig(national_path, dpi=300)
    plt.show()

    print(">> Notebook 03 Complete. Ready for Modeling.")