# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import os

# ==========================================
# CONFIGURATION
# ==========================================
# Local directory layout -- adjust these paths to match your machine.
DATA_DIR = 'data/raw'
OUTPUT_DIR = 'data/processed'
IMG_DIR = 'data/visuals'
# Master table that this script loads; it is read from OUTPUT_DIR.
INPUT_FILE = os.path.join(
    OUTPUT_DIR,
    '01_Data_Prep_Master.csv',
)

# ==========================================
# 1. DIAGNOSTICS & CUTOFF
# ==========================================
# Load the master table from INPUT_FILE.
df = pd.read_csv(INPUT_FILE)

# (N, T) pivot for the heatmap: one row per 'clean_sector', one column per
# 'year'; each cell counts the non-null 'value' entries, with missing
# sector/year combinations filled as 0.
presence = df.pivot_table(index='clean_sector', columns='year', values='value', aggfunc='count').fillna(0)
# Collapse the counts to a 0/1 "any data present" flag.
binary_presence = (presence > 0).astype(int)

# Plot availability
plt.figure(figsize=(16, 6))
sns.heatmap(binary_presence, cmap="viridis", cbar=False)
plt.title('Data Density Diagnostic', fontsize=14)
# The cutoff marker is drawn at the positional index of the 2010 column.
# Index.get_loc raises KeyError when that year is absent, which would abort
# the whole run for a purely diagnostic plot, so the marker (and its legend)
# is only drawn when the year is actually present.
if 2010 in binary_presence.columns:
    plt.axvline(x=binary_presence.columns.get_loc(2010), color='red', linestyle='--', label='Cutoff (2010)')
    plt.legend()
else:
    print("Warning: year 2010 not found in the data; cutoff marker not drawn.")
plt.show()

# ==========================================
# 2. ARCHITECTURE SPLIT (RESTORED ORIGINAL LOGIC)
# ==========================================
# Date creation: format each row as 'YYYY-M-01' using the last month of its
# quarter (quarter * 3), parse it, then roll forward to the month end.
year_text = df['year'].astype(str)
quarter_end_month_text = (df['quarter'] * 3).astype(str)
df['date'] = pd.to_datetime(year_text + '-' + quarter_end_month_text + '-01') + pd.offsets.MonthEnd(0)

# Analysis window: rows with year >= 2010, as an independent copy.
in_window = df['year'] >= 2010
df_modern = df.loc[in_window].copy()

# --- TRACK 1: THE INDEX ENGINE ---
# Sum 'value' over rows that share every key below.
track1_keys = ['date', 'year', 'quarter', 'clean_sector', 'raw_variable', 'source_file']
df_model = df_modern.groupby(track1_keys)['value'].sum().reset_index()

# --- TRACK 2: THE CONTEXT VAULT ---
# Unaggregated copy of the windowed rows.
df_context = df_modern.copy()

# Persist both tracks right away, before any further processing.
for track_frame, track_filename in (
    (df_model, 'ERI_Phase2_Track1_Model.csv'),
    (df_context, 'ERI_Phase2_Track2_Context.csv'),
):
    track_frame.to_csv(os.path.join(OUTPUT_DIR, track_filename), index=False)

# ==========================================
# 3. FEATURE ENGINEERING & MAPPING
# ==========================================
def identify_metric(row):
    """Classify a row by the name of the file it came from.

    Args:
        row: Mapping-like object exposing a 'source_file' entry (for example
            a DataFrame row or a dict).

    Returns:
        'GDP_CONTRIB' when the lower-cased file name contains
        'contribution_to_growth', 'EMP_CHANGE' when it contains
        'employment_change', otherwise 'OTHER'.
    """
    source_name = str(row['source_file']).lower()
    # Fragments are tried in order; the first one found decides the label.
    for fragment, label in (
        ('contribution_to_growth', 'GDP_CONTRIB'),
        ('employment_change', 'EMP_CHANGE'),
    ):
        if fragment in source_name:
            return label
    return 'OTHER'

# Tag every Track 1 row with its metric type. identify_metric reads only the
# 'source_file' field, so it is mapped over that single column instead of a
# row-wise DataFrame.apply(axis=1); this avoids materialising a Series per row
# and returns a Series even when df_model has no rows.
df_model['metric_type'] = df_model['source_file'].map(
    lambda source_file: identify_metric({'source_file': source_file})
)
# Keep only rows whose metric was recognised; copy so later column
# assignments do not write into a slice of df_model.
df_clean = df_model[df_model['metric_type'] != 'OTHER'].copy()

# Sector harmonization: each harmonized sector name is paired with the raw
# sector labels that are folded into it.
_SECTOR_GROUPS = (
    ('Accommodation & Food Services', ('Food & Beverage Services', 'Accommodation')),
    ('Admin & Support Services', ('Administrative & Support Services',)),
    (
        'Professional Services',
        ('Real Estate, Professional Services And Administrative & Support Services',),
    ),
    ('Wholesale & Retail Trade', ('Wholesale Trade', 'Retail Trade')),
    ('Finance & Insurance', ('Financial & Insurance Services',)),
    ('Other Services', ('Other Services Industries',)),
    ('Arts & Recreation', ('Arts, Entertainment & Recreation',)),
)
# Flatten to a raw-label -> harmonized-name lookup.
SECTOR_MAP = {
    raw_label: harmonized_name
    for harmonized_name, raw_labels in _SECTOR_GROUPS
    for raw_label in raw_labels
}
# Apply the harmonization lookup to the raw sector labels.
df_clean['mapped_sector'] = df_clean['clean_sector'].replace(SECTOR_MAP)

# Pivot to wide: one row per (sector, date, year, quarter) and one column per
# metric type, summing values that land in the same cell.
pivot_keys = ['mapped_sector', 'date', 'year', 'quarter']
wide = df_clean.pivot_table(
    index=pivot_keys,
    columns='metric_type',
    values='value',
    aggfunc='sum',
)
# Order rows chronologically within each sector.
df_pivot = wide.reset_index().sort_values(by=['mapped_sector', 'year', 'quarter'])


def _fill_within_group(series):
    """Forward-fill, then back-fill, the gaps inside one group."""
    return series.ffill().bfill()


# Fill GDP gaps, but only within the same sector and calendar year.
gdp_by_sector_year = df_pivot.groupby(['mapped_sector', 'year'])['GDP_CONTRIB']
df_pivot['GDP_FILLED'] = gdp_by_sector_year.transform(_fill_within_group)

# ==========================================
# 4. ENGINE CORE (V1 & V2)
# ==========================================
# Number of consecutive rows in each rolling window.
WINDOW = 12


def rolling_semidev(x):
    """Return the downside semi-deviation of one window.

    Only observations strictly below the window mean contribute; the result
    is the root of the mean squared distance of those observations from the
    mean.

    Args:
        x: Numeric pandas Series or NumPy array holding the window values.

    Returns:
        The semi-deviation as a float, or 0.0 when no observation lies below
        the mean.
    """
    centre = x.mean()
    below = x[x < centre]
    if len(below) == 0:
        return 0.0
    shortfall = below - centre
    return np.sqrt(np.mean(shortfall ** 2))

print("Calculating Indicators (V1 Standard & V2 Robust)...")

# Every rolling statistic is computed separately inside each sector.
g = df_pivot.groupby('mapped_sector')

# (output column, input column, reducer applied to the rolling window).
# V1 uses the standard deviation and mean; V2 uses the semi-deviation.
indicator_specs = (
    ('V1_Vol_GDP', 'GDP_FILLED', lambda window: window.std()),
    ('V1_Vol_EMP', 'EMP_CHANGE', lambda window: window.std()),
    ('V1_Growth', 'GDP_FILLED', lambda window: window.mean()),
    ('V2_Vol_GDP', 'GDP_FILLED', lambda window: window.apply(rolling_semidev, raw=False)),
    ('V2_Vol_EMP', 'EMP_CHANGE', lambda window: window.apply(rolling_semidev, raw=False)),
)
for out_col, in_col, reducer in indicator_specs:
    # Bind the reducer as a default argument so each lambda keeps its own.
    df_pivot[out_col] = g[in_col].transform(
        lambda series, reducer=reducer: reducer(series.rolling(WINDOW))
    )

# Drop burn-in: discard rows where any V1 indicator is still NaN.
v1_indicator_cols = ['V1_Vol_GDP', 'V1_Vol_EMP', 'V1_Growth']
df_final = df_pivot.dropna(subset=v1_indicator_cols).copy()

# ==========================================
# 5. SCORING
# ==========================================
scaler = MinMaxScaler()

# Weights applied to the Res / Abs / Rec component scores in both composites.
weight_res, weight_abs, weight_rec = 0.4, 0.3, 0.3

# --- V1 SCORING ---
# Volatility is negated so that lower volatility yields a higher raw value;
# growth is used as-is.
df_final['S1_Raw'] = -df_final['V1_Vol_GDP']
df_final['L1_Raw'] = -df_final['V1_Vol_EMP']
df_final['G1_Raw'] = df_final['V1_Growth']

# Scale the three raw columns together, fitted over every row of df_final.
v1_norm = scaler.fit_transform(df_final[['S1_Raw', 'L1_Raw', 'G1_Raw']])
for position, score_col in enumerate(('Score_Res_V1', 'Score_Abs_V1', 'Score_Rec_V1')):
    df_final[score_col] = v1_norm[:, position]

# V1 composite: weighted sum of the component scores.
df_final['ERI_Score_V1'] = (
    weight_res * df_final['Score_Res_V1']
    + weight_abs * df_final['Score_Abs_V1']
    + weight_rec * df_final['Score_Rec_V1']
)

# --- V2 SCORING ---
df_final['S2_Raw'] = -df_final['V2_Vol_GDP']
df_final['L2_Raw'] = -df_final['V2_Vol_EMP']

# The growth input (G1_Raw) is shared with V1; only the volatility inputs differ.
v2_norm = scaler.fit_transform(df_final[['S2_Raw', 'L2_Raw', 'G1_Raw']])
# Constant added to each scaled component -- presumably so that a component
# at the scaler's minimum cannot zero out the product below.
score_offset = 0.01
for position, score_col in enumerate(('Score_Res_V2', 'Score_Abs_V2', 'Score_Rec_V2')):
    df_final[score_col] = v2_norm[:, position] + score_offset

# V2 composite: weighted product of the component scores, then rescaled.
df_final['ERI_Score_V2'] = (
    (df_final['Score_Res_V2'] ** weight_res)
    * (df_final['Score_Abs_V2'] ** weight_abs)
    * (df_final['Score_Rec_V2'] ** weight_rec)
)
df_final['ERI_Score_V2'] = scaler.fit_transform(df_final[['ERI_Score_V2']]).ravel()

# ==========================================
# 6. SAVE ARTIFACTS
# ==========================================
# Column groups written to the full index file.
cols_meta = ['date', 'year', 'quarter', 'mapped_sector']
cols_v1 = ['ERI_Score_V1', 'Score_Res_V1', 'Score_Abs_V1', 'Score_Rec_V1']
cols_v2 = ['ERI_Score_V2', 'Score_Res_V2', 'Score_Abs_V2', 'Score_Rec_V2']

# Full history: metadata columns followed by the V1 and V2 scores.
full_index_path = os.path.join(OUTPUT_DIR, 'ERI_Index_Full.csv')
df_final[[*cols_meta, *cols_v1, *cols_v2]].to_csv(full_index_path, index=False)

# Leaderboards: rows at the most recent date, ranked best-first per score.
latest_date = df_final['date'].max()
latest_rows = df_final[df_final['date'] == latest_date]
leaderboard_v1 = latest_rows.sort_values(by='ERI_Score_V1', ascending=False)
leaderboard_v2 = latest_rows.sort_values(by='ERI_Score_V2', ascending=False)

# The saved leaderboard follows the V1 ranking and carries both scores.
leaderboard_path = os.path.join(OUTPUT_DIR, 'ERI_Leaderboard_Latest.csv')
leaderboard_cols = ['mapped_sector', 'ERI_Score_V1', 'ERI_Score_V2']
leaderboard_v1[leaderboard_cols].to_csv(leaderboard_path, index=False)

# ==========================================
# 7. FINAL PRINTOUT
# ==========================================
# Run summary. Messages without placeholders are plain strings; only the
# lines that interpolate a value are f-strings.
print("--- PHASE 2 COMPLETE ---")
print(f"Track 2 Context Rows: {len(df_context)}")
print(f"Index Engine Rows:    {len(df_final)}")
print(f"Latest Date:          {latest_date}")
print("Saved: ERI_Index_Full.csv & ERI_Leaderboard_Latest.csv")

# Five highest-ranked sectors at the latest date under each scoring variant.
print("\n--- TOP 5 SECTORS: V1 (Standard) ---")
print(leaderboard_v1[['mapped_sector', 'ERI_Score_V1']].head(5))

print("\n--- TOP 5 SECTORS: V2 (Robust/Semi-Dev) ---")
print(leaderboard_v2[['mapped_sector', 'ERI_Score_V2']].head(5))