# Task 3 — Event Impact Modeling (Ethiopia Financial Inclusion)

This notebook is configured for your project paths:

## Inputs (priority)
- `data/processed/eda_enriched/events.csv`
- `data/processed/eda_enriched/ethiopia_fi_unified_data__impact_links.csv`
- `data/processed/eda_enriched/temporal_range__observations.csv`

## Fallback input
- `data/raw/ethiopia_fi_unified_data.csv` (used only if processed files are missing)

## Outputs
- `outputs/task_3/impact_links_summary.csv`
- `outputs/task_3/event_effects_tidy.csv`
- `outputs/task_3/event_indicator_association_matrix.csv`
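- `outputs/task_3/event_indicator_association_heatmap.png`
- `outputs/task_3/telebirr_validation_table.csv` (optional; written only when observations are available)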

## Modules
- `fi.data_io`
- `fi.impact_links`
- `fi.event_effects`
- `fi.association_matrix`
- `fi.task3_validation` (optional validation cell)

In [None]:
from __future__ import annotations
import numpy as np

import sys
from pathlib import Path

# Repository root is the parent of the current working directory; the
# package sources live in its `src` sub-folder.
ROOT = Path("..").resolve()
SRC_DIR = ROOT / "src"

# Prepend both locations to sys.path unless they are already listed.
# ROOT makes `import src...` work; SRC_DIR additionally allows `import fi...`.
# ROOT is handled first, so SRC_DIR ends up ahead of it when both are new.
for _location in (str(ROOT), str(SRC_DIR)):
    if _location not in sys.path:
        sys.path.insert(0, _location)

print("ROOT:", ROOT)
print("sys.path[0:3]:", sys.path[:3])



import pandas as pd



# --- imports from src/fi (package)
from src.fi.data_io import load_csv, coerce_datetime
from src.fi.impact_links import join_links_events, build_impact_links_summary
from src.fi.event_effects import FINDEX_YEAR_GRID, effects_tidy
from src.fi.association_matrix import build_association_matrix, plot_heatmap
from src.fi.task3_validation import validate_telebirr_mm

# Repository root, resolved relative to the current working directory.
ROOT = Path('..').resolve()

# Processed inputs (preferred source for events, impact links, observations).
PROC_DIR = ROOT.joinpath('data', 'processed', 'eda_enriched')
EVENTS_PATH = PROC_DIR.joinpath('events.csv')
LINKS_PATH = PROC_DIR.joinpath('ethiopia_fi_unified_data__impact_links.csv')
OBS_PATH = PROC_DIR.joinpath('temporal_range__observations.csv')

# Raw unified file, used as a fallback source.
UNIFIED_RAW_PATH = ROOT.joinpath('data', 'raw', 'ethiopia_fi_unified_data.csv')

# Output folder for this task; created (with parents) if it does not exist.
OUT_DIR = ROOT.joinpath('outputs', 'task_3')
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Indicator codes passed to the effect-series and association-matrix steps.
KEY_INDICATORS = ["ACC_OWNERSHIP", "ACC_MM_ACCOUNT", "USG_DIGITAL_PAYMENT"]

# Widen pandas' console rendering so previews are not truncated as early.
for _option, _setting in (('display.width', 140), ('display.max_columns', 80)):
    pd.set_option(_option, _setting)

# Echo the resolved configuration, flagging which input files are present.
print('ROOT:', ROOT)
for _label, _path in (
    ('EVENTS_PATH:', EVENTS_PATH),
    ('LINKS_PATH :', LINKS_PATH),
    ('OBS_PATH   :', OBS_PATH),
    ('UNIFIED_RAW_PATH:', UNIFIED_RAW_PATH),
):
    print(_label, _path, 'exists=', _path.exists())
print('OUT_DIR:', OUT_DIR)

## 1) Load inputs (processed preferred; raw unified fallback)

If any processed input is missing, we attempt to derive it from the raw unified file (requires `record_type` + relevant columns).

In [None]:
def _derive_from_unified(unified: pd.DataFrame):
    """Derive events, links, observations from unified if it contains record_type."""
    if 'record_type' not in unified.columns:
        raise ValueError('Raw unified file is missing column: record_type')

    events_u = unified.loc[unified['record_type'] == 'event'].copy() if (unified['record_type'] == 'event').any() else pd.DataFrame()
    links_u  = unified.loc[unified['record_type'] == 'impact_link'].copy() if (unified['record_type'] == 'impact_link').any() else pd.DataFrame()
    obs_u    = unified.loc[unified['record_type'] == 'observation'].copy() if (unified['record_type'] == 'observation').any() else pd.DataFrame()
    return events_u, links_u, obs_u


# Load processed inputs when present; a missing file yields an empty frame.
events = load_csv(str(EVENTS_PATH)) if EVENTS_PATH.exists() else pd.DataFrame()
links  = load_csv(str(LINKS_PATH))  if LINKS_PATH.exists()  else pd.DataFrame()
obs    = load_csv(str(OBS_PATH))    if OBS_PATH.exists()    else pd.DataFrame()

# Heuristic schema check for the observations file: it must expose at least
# one indicator column, one value column, and either `year` or a date column.
obs_required_any = {"indicator_code", "related_indicator"}
value_cols_any = {"value_numeric", "value", "observed_value"}
date_cols_any = {"observation_date", "date", "period_end", "period_start", "event_date"}

obs_columns = set(obs.columns)
obs_is_observations = (
    (not obs.empty)
    and bool(obs_required_any & obs_columns)
    and bool(value_cols_any & obs_columns)
    and (("year" in obs_columns) or bool(date_cols_any & obs_columns))
)

if not obs_is_observations:
    # Discard whatever was loaded so the raw fallback below can supply
    # observations instead. Validation is only skipped if that also fails.
    print(
        "[task3-notebook] OBS_PATH is missing or is not an observations table; "
        "Telebirr validation will be skipped unless the raw unified fallback provides observations."
    )
    obs = pd.DataFrame()

missing = {
    'events': events.empty,
    'links': links.empty,
    'obs': obs.empty,
}
print('Missing (processed):', missing)

# Events and links are required by the later cells; observations only feed
# the optional Telebirr validation, so their absence alone must not abort.
required_missing = missing['events'] or missing['links']

# Fallback: derive missing parts from raw unified
if any(missing.values()):
    if UNIFIED_RAW_PATH.exists():
        unified_raw = pd.read_csv(UNIFIED_RAW_PATH)
        print('Loaded raw unified:', unified_raw.shape)
        if 'record_type' in unified_raw.columns:
            display(unified_raw['record_type'].value_counts(dropna=False))

        events_u, links_u, obs_u = _derive_from_unified(unified_raw)

        # Only fill in the pieces that the processed files did not provide.
        if events.empty:
            events = events_u
        if links.empty:
            links = links_u
        if obs.empty:
            obs = obs_u
    elif required_missing:
        raise FileNotFoundError(
            'Some processed inputs are missing, and raw unified fallback was not found at: ' + str(UNIFIED_RAW_PATH)
        )
    else:
        print(
            '[task3-notebook] Raw unified fallback not found at: ' + str(UNIFIED_RAW_PATH) +
            '; continuing without observations.'
        )

# Coerce the `observation_date` column on non-empty frames.
# NOTE(review): assumes coerce_datetime tolerates a frame without that column — confirm.
events = coerce_datetime(events, 'observation_date') if not events.empty else events
obs    = coerce_datetime(obs, 'observation_date') if not obs.empty else obs

print('events:', events.shape)
print('links :', links.shape)
print('obs   :', obs.shape)

# Preview each input, or show a one-row placeholder when it is empty.
display(events.head(3) if not events.empty else pd.DataFrame({'note':['events is empty']}))
display(links.head(3) if not links.empty else pd.DataFrame({'note':['links is empty']}))
display(obs.head(3) if not obs.empty else pd.DataFrame({'note':['obs is empty']}))

## 2) Join impact links ↔ events and export `impact_links_summary.csv`

In [None]:
# Both inputs are required for the join; fail early (links first, then
# events) with a message pointing at where rows were expected.
for _frame, _label, _path, _record_kind in (
    (links, 'Impact links', LINKS_PATH, 'impact_link'),
    (events, 'Events', EVENTS_PATH, 'event'),
):
    if _frame.empty:
        raise ValueError(
            f'{_label} input is empty. Expected rows in: {_path}'
            f' (or {_record_kind} rows inside raw unified fallback).'
        )

# Join links to events, then build the summary table from the joined result.
joined = join_links_events(links, events)
summary = build_impact_links_summary(joined)

# Export the summary without the index column.
summary_out = OUT_DIR.joinpath('impact_links_summary.csv')
summary.to_csv(summary_out, index=False)

# `joined` is reported via getattr, so an object without `.shape` prints None.
print('joined:', getattr(joined, 'shape', None))
print('summary:', summary.shape)
print('wrote:', summary_out)
display(summary.head(15))

## 3) Build event effect series on the Findex year grid and export `event_effects_tidy.csv`

In [None]:
# Destination of the tidy effects table.
effects_out = OUT_DIR.joinpath('event_effects_tidy.csv')

# Build the effects table from the links summary for the key indicators on
# the Findex year grid, with 'ramp' / 3.0 years as the defaults.
effects = effects_tidy(
    df_summary=summary, indicators=KEY_INDICATORS, years=FINDEX_YEAR_GRID,
    default_shape='ramp', default_ramp_years=3.0,
)

# Export without the index column, then report shape and location.
effects.to_csv(effects_out, index=False)

print('effects:', effects.shape)
print('wrote:', effects_out)
display(effects.head(30))

## 4) Build association matrix + heatmap

Exports:
- `event_indicator_association_matrix.csv`
- `event_indicator_association_heatmap.png`

In [None]:
# Output locations for the matrix (CSV) and its heatmap (PNG).
assoc_out = OUT_DIR.joinpath('event_indicator_association_matrix.csv')
heatmap_out = OUT_DIR.joinpath('event_indicator_association_heatmap.png')

# Build the association matrix from the links summary for the key indicators.
assoc = build_association_matrix(summary, KEY_INDICATORS)

# Export the matrix without the index column; the heatmap helper receives the
# target path as a plain string (presumably where it saves the figure).
assoc.to_csv(assoc_out, index=False)
plot_heatmap(assoc, KEY_INDICATORS, str(heatmap_out))

print('assoc:', assoc.shape)
for _written in (assoc_out, heatmap_out):
    print('wrote:', _written)
display(assoc)

In [None]:
import pandas as pd

# Re-read the exported summary from disk (this replaces the in-memory
# `summary` with the CSV round-tripped version).
summary = pd.read_csv("../outputs/task_3/impact_links_summary.csv")

# Columns shown in both previews below.
_preview_cols = ["event_name", "indicator_code", "impact_magnitude_pp"]

# Links whose event name mentions "telebirr" (case-insensitive; missing names never match).
_is_telebirr = summary["event_name"].astype(str).str.contains("telebirr", case=False, na=False)
tele = summary[_is_telebirr]
print("telebirr links:", tele.shape)
display(tele[_preview_cols])

# Of those, the links that target the mobile-money account indicator.
_targets_mm = tele["indicator_code"].astype(str).str.upper().eq("ACC_MM_ACCOUNT")
tele_mm = tele[_targets_mm]
print("telebirr -> ACC_MM_ACCOUNT:", tele_mm.shape)
display(tele_mm[_preview_cols])

## 5) (Optional) Telebirr validation table

If observations are present, we compute the Telebirr residual table. If not, we skip.

In [None]:
# Validate Telebirr against every indicator it targets in the links summary.
# Skipped entirely when no observations were loaded.
if obs.empty:
    print("Observations are empty; skipping Telebirr validation.")
else:
    # Links whose event name mentions "telebirr" (case-insensitive).
    tele_mask = summary["event_name"].astype(str).str.contains("telebirr", case=False, na=False)

    # Drop missing codes BEFORE casting to str: astype(str) turns NaN into the
    # literal "nan", which would survive a later dropna() and be validated as
    # if it were a real indicator code.
    tele_codes = (
        summary.loc[tele_mask, "indicator_code"]
        .dropna()
        .astype(str)
        .str.strip()
    )
    # Blank codes carry no target either; keep the rest, de-duplicated and sorted.
    tele_targets = sorted(tele_codes[tele_codes != ""].unique().tolist())

    print("Telebirr target indicators found in links:", tele_targets)

    # One validation frame per targeted indicator, comparing 2021 with 2024.
    frames = [
        validate_telebirr_mm(
            obs,
            summary,
            year_a=2021,
            year_b=2024,
            target_indicator=ind,
            event_regex="telebirr",
        )
        for ind in tele_targets
    ]

    # pd.concat rejects an empty list, so fall back to an empty frame.
    tele_val = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    tele_out = OUT_DIR / "telebirr_validation_table.csv"
    tele_val.to_csv(tele_out, index=False)
    print("wrote:", tele_out)
    display(tele_val)