# Task 3: Event Impact Modeling

## Objective
Model how events (policies, product launches, infrastructure investments) affect financial inclusion indicators.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot styling shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

# Project directories, relative to the notebook's working directory
DATA_DIR = Path('../data/raw')
PROCESSED_DIR = Path('../data/processed')
REPORTS_DIR = Path('../reports/figures')

# Later cells write a CSV and a JSON file into PROCESSED_DIR and a figure into
# REPORTS_DIR, so both output directories must exist before those cells run.
# DATA_DIR is input only and is deliberately not created here.
for output_dir in (PROCESSED_DIR, REPORTS_DIR):
    output_dir.mkdir(exist_ok=True, parents=True)

print("Libraries loaded successfully")

## 1. Load and Understand Impact Data

In [None]:
# Prefer the enriched dataset produced by the enrichment step; when that file
# is absent, fall back to the raw unified dataset.
enriched_csv = PROCESSED_DIR / 'ethiopia_fi_unified_data_enriched.csv'
if enriched_csv.exists():
    df = pd.read_csv(enriched_csv)
    print("Loaded enriched dataset")
else:
    df = pd.read_csv(DATA_DIR / 'ethiopia_fi_unified_data.csv')
    print("Loaded original dataset")

# The unified table stores several record kinds; split it by record_type into
# independent copies so later cells can modify one without touching df.
record_types = df['record_type']
impact_links = df[record_types == 'impact_link'].copy()
events = df[record_types == 'event'].copy()
observations = df[record_types == 'observation'].copy()

print(f"\nImpact links: {len(impact_links)}")
print(f"Events: {len(events)}")
print(f"Observations: {len(observations)}")

In [None]:
# Join impact links with their parent events using parent_id.
#
# impact_links and events are both slices of the same unified table, so they
# share every column name. Suffixing both sides of the merge would rename every
# column and hide names such as 'related_indicator' and 'impact_magnitude' that
# later cells look up directly. The link columns therefore keep their names and
# only the clashing event columns receive the '_event' suffix.
if 'parent_id' in impact_links.columns:
    # Show both inputs so the join key can be checked by eye
    print("Impact links columns:", impact_links.columns.tolist())
    print("\nSample impact links:")
    print(impact_links.head())

    print("\nEvents columns:", events.columns.tolist())
    print("\nSample events:")
    print(events.head())

    # Pick the column that identifies an event; if there is none, derive an
    # identifier from the row index (stored as strings)
    if 'id' in events.columns:
        event_id_col = 'id'
    elif 'event_id' in events.columns:
        event_id_col = 'event_id'
    else:
        events['event_id'] = events.index.astype(str)
        event_id_col = 'event_id'

    # Left join: every impact link is kept, matched or not
    impact_with_events = impact_links.merge(
        events,
        left_on='parent_id',
        right_on=event_id_col,
        how='left',
        suffixes=('', '_event')
    )

    # The event key is suffixed in the result only when the links also carry a
    # column of that name
    merged_key = f'{event_id_col}_event' if event_id_col in impact_links.columns else event_id_col
    matched_links = int(impact_with_events[merged_key].notna().sum())

    # Event descriptors may be blank on the link rows; where they are, take the
    # value from the matched event so the unsuffixed column is usable downstream
    for col in ('event_name', 'category', 'event_date'):
        event_side = f'{col}_event'
        if event_side in impact_with_events.columns:
            impact_with_events[col] = impact_with_events[col].combine_first(
                impact_with_events[event_side]
            )

    print(f"\nJoined impact links with events: {len(impact_with_events)} records")
    print(f"Links matched to an event: {matched_links} of {len(impact_with_events)}")
    print("\nImpact links with event details:")
    display_cols = ['parent_id', 'event_name', 'category', 'event_date',
                    'pillar', 'related_indicator', 'impact_direction',
                    'impact_magnitude', 'lag_months']
    available_cols = [col for col in display_cols if col in impact_with_events.columns]
    print(impact_with_events[available_cols].head(10))
else:
    print("No parent_id column found in impact_links")
    impact_with_events = impact_links.copy()

## 2. Create Event-Indicator Association Matrix

In [None]:
# Build the association matrix
# Rows: events, columns: indicators, values: mean impact magnitude

matrix_columns = impact_with_events.columns

# Label rows by event name when at least one link carries a name; an entirely
# missing name column would drop every row from the pivot, so fall back to the
# parent_id in that case.
if 'event_name' in matrix_columns and impact_with_events['event_name'].notna().any():
    event_col = 'event_name'
else:
    event_col = 'parent_id'

# Every column the pivot touches must exist, not just related_indicator
required_cols = [event_col, 'related_indicator', 'impact_magnitude']

if len(impact_with_events) > 0 and all(col in matrix_columns for col in required_cols):
    # Averaging needs numbers: coerce on a temporary frame so that
    # impact_with_events itself is left unchanged. Non-numeric magnitudes
    # become NaN and are ignored by the mean.
    matrix_source = impact_with_events.assign(
        impact_magnitude=pd.to_numeric(impact_with_events['impact_magnitude'], errors='coerce')
    )

    association_matrix = matrix_source.pivot_table(
        index=event_col,
        columns='related_indicator',
        values='impact_magnitude',
        aggfunc='mean'  # Average if multiple links for same event-indicator pair
    )

    print("Event-Indicator Association Matrix:")
    print(association_matrix)

    if association_matrix.empty:
        # A heatmap of an empty frame raises, so report and skip the outputs
        print("\nAssociation matrix is empty - no numeric impact magnitudes to plot")
    else:
        # Scale the figure with the matrix so labels stay readable
        plt.figure(figsize=(max(14, len(association_matrix.columns) * 1.5),
                            max(8, len(association_matrix) * 0.8)))
        sns.heatmap(association_matrix, annot=True, fmt='.2f', cmap='RdYlGn',
                    center=0, square=False, linewidths=1,
                    cbar_kws={'label': 'Impact Magnitude (pp or multiplier)'})
        plt.title('Event-Indicator Impact Association Matrix', fontsize=16, fontweight='bold')
        plt.xlabel('Indicator', fontsize=12)
        plt.ylabel('Event', fontsize=12)
        plt.tight_layout()
        plt.savefig(REPORTS_DIR / 'event_indicator_association_matrix.png', dpi=300, bbox_inches='tight')
        plt.show()

        # Save matrix to CSV (make sure the target directory exists first)
        PROCESSED_DIR.mkdir(exist_ok=True, parents=True)
        association_matrix.to_csv(PROCESSED_DIR / 'event_indicator_association_matrix.csv')
        print(f"\nMatrix saved to {PROCESSED_DIR / 'event_indicator_association_matrix.csv'}")
else:
    print("Cannot create association matrix - missing required columns")
    association_matrix = pd.DataFrame()

## 3. Model Event Effects Over Time

In [None]:
class EventImpactModel:
    """
    Model to represent how events affect indicators over time.

    Assumptions:
    - Effects can be immediate or gradual (with lag)
    - Effects can build over time (ramp-up period)
    - Multiple events can have cumulative effects

    Missing values are handled explicitly so that one incomplete impact link
    cannot turn the cumulative impact into NaN: a missing lag counts as 0
    months, a missing ramp as 12 months, and links without a usable magnitude,
    parent event or event date contribute nothing.
    """

    # Columns tried, in order, when resolving a link's parent_id to an event
    _EVENT_ID_COLUMNS = ('id', 'event_id')

    def __init__(self):
        self.impact_links = None
        self.events = None

    def load_data(self, impact_links_df, events_df):
        """Load impact links and events data (both are copied)"""
        self.impact_links = impact_links_df.copy()
        self.events = events_df.copy()

    @staticmethod
    def _as_number(value, default):
        """Return value as a float, or default when it is missing or non-numeric."""
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return default
        number = pd.to_numeric(value, errors='coerce')
        return default if pd.isna(number) else float(number)

    @classmethod
    def _find_event(cls, events_df, parent_id):
        """Return the first event row identified by parent_id, or None."""
        for column in cls._EVENT_ID_COLUMNS:
            if column in events_df.columns:
                matches = events_df[events_df[column] == parent_id]
                if len(matches) > 0:
                    return matches.iloc[0]

        # No identifier column matched: try the row index, first as-is and
        # then compared as strings
        matches = events_df[events_df.index == parent_id]
        if len(matches) == 0:
            matches = events_df[events_df.index.astype(str) == str(parent_id)]
        return matches.iloc[0] if len(matches) > 0 else None

    @staticmethod
    def _event_date(event):
        """Return the event's date, preferring event_date over observation_date."""
        for column in ('event_date', 'observation_date'):
            moment = pd.to_datetime(event.get(column), errors='coerce')
            if not pd.isna(moment):
                return moment
        return pd.NaT

    def calculate_event_effect(self, event_date, indicator, current_date,
                              impact_magnitude, lag_months=0, ramp_months=12):
        """
        Calculate the effect of an event on an indicator at a given date.

        Parameters:
        -----------
        event_date : datetime or str
            When the event occurred
        indicator : str
            Indicator code (not used in the calculation itself)
        current_date : datetime or str
            Date to calculate effect for
        impact_magnitude : float
            Total impact magnitude (percentage points or multiplier)
        lag_months : int
            Months before effect starts (missing values count as 0)
        ramp_months : int
            Months over which effect builds (0 = immediate; missing values
            count as 12)

        Returns:
        --------
        float : Effect magnitude at current_date; 0.0 before the lag has
            elapsed or when either date is missing
        """
        event_date = pd.to_datetime(event_date)
        current_date = pd.to_datetime(current_date)
        if pd.isna(event_date) or pd.isna(current_date):
            # Without both dates the elapsed time is undefined
            return 0.0

        lag_months = self._as_number(lag_months, 0)
        ramp_months = self._as_number(ramp_months, 12)

        # Whole calendar months between the two dates; the day of month is ignored
        months_since = (current_date.year - event_date.year) * 12 + \
                       (current_date.month - event_date.month)

        # Check if effect has started (after lag)
        if months_since < lag_months:
            return 0.0

        # No ramp period: the full effect applies as soon as the lag is over
        if ramp_months <= 0:
            return impact_magnitude

        effect_months = months_since - lag_months
        if effect_months >= ramp_months:
            # Full effect
            return impact_magnitude

        # Partial effect (linear ramp)
        return impact_magnitude * (effect_months / ramp_months)

    def get_indicator_impact(self, indicator, date, events_df=None, impact_links_df=None):
        """
        Get total impact on an indicator from all events up to a given date.

        Parameters:
        -----------
        indicator : str
            Indicator code
        date : datetime or str
            Date to calculate impact for
        events_df : DataFrame
            Events dataframe (optional, uses self.events if not provided)
        impact_links_df : DataFrame
            Impact links dataframe (optional, uses self.impact_links if not provided)

        Returns:
        --------
        float : Total impact on indicator (sum of the individual event effects)
        """
        if events_df is None:
            events_df = self.events
        if impact_links_df is None:
            impact_links_df = self.impact_links

        # Get all impact links for this indicator
        indicator_links = impact_links_df[
            impact_links_df['related_indicator'] == indicator
        ]

        total_impact = 0.0

        for _, link in indicator_links.iterrows():
            parent_id = link.get('parent_id')
            if parent_id is None or pd.isna(parent_id):
                continue

            event = self._find_event(events_df, parent_id)
            if event is None:
                continue

            event_date = self._event_date(event)
            if pd.isna(event_date):
                continue

            # A link without a numeric magnitude cannot be quantified
            impact_magnitude = self._as_number(link.get('impact_magnitude'), None)
            if impact_magnitude is None:
                continue

            effect = self.calculate_event_effect(
                event_date, indicator, date,
                impact_magnitude,
                self._as_number(link.get('lag_months'), 0),
                self._as_number(link.get('ramp_months'), 12),
            )

            # A 'negative' direction flips the sign of the effect; the label is
            # compared without regard to case or surrounding whitespace
            direction = str(link.get('impact_direction', 'positive')).strip().lower()
            if direction == 'negative':
                effect = -effect

            total_impact += effect

        return total_impact

# Create the model instance used by the validation cell below and hand it
# the impact links and events extracted earlier (load_data stores copies).
impact_model = EventImpactModel()
impact_model.load_data(impact_links_df=impact_links, events_df=events)
print("Event Impact Model initialized")

## 4. Test Model Against Historical Data

In [None]:
# Test: Telebirr launch impact
# Telebirr launched May 2021
# Mobile money accounts: 4.7% (2021) → 9.45% (2024)
# Observed change: +4.75 percentage points


def _text_match(frame, column, pattern):
    """Case-insensitive match of pattern in frame[column].

    Returns an all-False mask when the column is absent. Values are cast to
    str first so that non-string columns do not break the .str accessor.
    """
    if column not in frame.columns:
        return pd.Series(False, index=frame.index)
    return frame[column].astype(str).str.contains(pattern, case=False, na=False)


print("=" * 80)
print("VALIDATION: Telebirr Launch Impact")
print("=" * 80)

# Get mobile money account observations
mm_obs = observations[
    _text_match(observations, 'indicator_code', 'MM_ACCOUNT')
    | _text_match(observations, 'indicator', 'mobile money')
].copy()

if len(mm_obs) > 0 and 'observation_date' in mm_obs.columns:
    mm_obs['observation_date'] = pd.to_datetime(mm_obs['observation_date'])
    mm_obs = mm_obs.sort_values('observation_date')

    # Get 2021 and 2024 values (earliest observation within each year)
    mm_2021 = mm_obs[mm_obs['observation_date'].dt.year == 2021]
    mm_2024 = mm_obs[mm_obs['observation_date'].dt.year == 2024]

    if len(mm_2021) > 0 and len(mm_2024) > 0:
        observed_2021 = mm_2021['value_numeric'].iloc[0]
        observed_2024 = mm_2024['value_numeric'].iloc[0]
        observed_change = observed_2024 - observed_2021

        print(f"\nObserved:")
        print(f"  2021: {observed_2021:.2f}%")
        print(f"  2024: {observed_2024:.2f}%")
        print(f"  Change: {observed_change:+.2f} percentage points")

        # Find Telebirr event
        telebirr_event = events[
            _text_match(events, 'event_name', 'Telebirr')
            | _text_match(events, 'description', 'Telebirr')
        ]

        if len(telebirr_event) > 0:
            # Use the May 2021 launch date when the event row carries no usable date
            telebirr_date = pd.to_datetime(telebirr_event.iloc[0].get('event_date'), errors='coerce')
            if pd.isna(telebirr_date):
                telebirr_date = pd.Timestamp('2021-05-01')
            print(f"\nTelebirr launch date: {telebirr_date.strftime('%Y-%m-%d')}")

            # Get impact links for mobile money accounts
            mm_links = impact_links[_text_match(impact_links, 'related_indicator', 'MM')]

            if len(mm_links) > 0:
                # Default indicator code; if no link uses it but the links do
                # use the code of the observed series, model that code instead
                mm_indicator = 'ACC_MM_ACCOUNT'
                linked_codes = set(mm_links['related_indicator'])
                if mm_indicator not in linked_codes and 'indicator_code' in mm_2021.columns:
                    observed_code = mm_2021['indicator_code'].iloc[0]
                    if observed_code in linked_codes:
                        mm_indicator = observed_code

                # The observed figure is a change between two dates, so the
                # prediction is the change in modelled cumulative impact over
                # the same two observation dates.
                start_date = mm_2021['observation_date'].iloc[0]
                end_date = mm_2024['observation_date'].iloc[0]
                predicted_impact = (
                    impact_model.get_indicator_impact(mm_indicator, end_date)
                    - impact_model.get_indicator_impact(mm_indicator, start_date)
                )
                gap = abs(observed_change - predicted_impact)

                print(f"\nPredicted impact (from model): {predicted_impact:+.2f} percentage points")
                print(f"\nComparison:")
                print(f"  Observed: {observed_change:+.2f}pp")
                print(f"  Predicted: {predicted_impact:+.2f}pp")
                print(f"  Difference: {gap:.2f}pp")

                # Validation assessment
                if gap < 2.0:
                    print("\n✓ Model validation: GOOD - Predicted close to observed")
                elif gap < 5.0:
                    print("\n⚠ Model validation: MODERATE - Some difference, may need refinement")
                else:
                    print("\n✗ Model validation: NEEDS REFINEMENT - Large difference")
            else:
                print("\nNo impact links found for mobile money accounts")
        else:
            print("\nTelebirr event not found in events data")
    else:
        print("Missing 2021 or 2024 mobile money observations")
else:
    print("Mobile money observations not found or missing dates")

## 5. Refine Impact Estimates

In [None]:
# Refinement scaffold: prints the intended refinement procedure and tags every
# impact link with a confidence level. No magnitudes are adjusted here.

refined_impacts = []

banner = "=" * 80
print(banner)
print("REFINED IMPACT ESTIMATES")
print(banner)

# Worked example of the idea: had the model predicted 3pp for Telebirr while
# 4.75pp was observed, the estimate would be scaled up accordingly.
print("\nRefinement Logic:")
for step in (
    "1. Compare predicted vs. observed for events with historical data",
    "2. Calculate adjustment factors",
    "3. Apply to similar events without historical data",
    "4. Document confidence levels",
):
    print(step)

# Work on a copy so the original impact links stay untouched
refined_impact_links = impact_links.copy()

# Links get a 'medium' confidence only when the column does not exist at all
if 'confidence' not in refined_impact_links.columns:
    refined_impact_links['confidence'] = 'medium'

# Raising validated links to 'high' would follow from the validation results;
# that step is not implemented in this cell.
print("\nConfidence levels:")
for level in (
    "- High: Validated against historical data",
    "- Medium: Based on comparable country evidence",
    "- Low: Expert estimate without direct evidence",
):
    print(level)

# The column is guaranteed to exist at this point
print("\nCurrent confidence distribution:")
print(refined_impact_links['confidence'].value_counts())

## 6. Methodology Documentation

In [None]:
# Record the modelling approach, print it section by section, and save it as
# JSON alongside the other processed outputs.
import json

functional_form = {
    'immediate_effects': 'Step function (full impact after lag period)',
    'gradual_effects': 'Linear ramp-up over specified months',
    'cumulative_effects': 'Additive combination of multiple event impacts',
}

assumptions = [
    'Event impacts are independent (no interaction effects)',
    'Impacts are additive across events',
    'Lag periods represent time before effect starts',
    'Ramp periods represent gradual build-up of effects',
    'Impact magnitudes are in percentage points or multipliers',
]

limitations = [
    'Limited historical data for validation',
    'Many impacts based on comparable country evidence',
    'No interaction effects between events modeled',
    'Linear ramp-up may not capture all effect patterns',
    'Recent events (M-Pesa) have insufficient post-event data',
]

sources = {
    'historical_data': 'Global Findex Database, Operator Reports',
    'comparable_countries': 'Kenya, Tanzania, Uganda mobile money experiences',
    'expert_estimates': 'GSMA reports, academic studies',
}

# Insertion order determines both the printed order and the JSON key order
methodology = {
    'approach': 'Event-Impact Modeling with Time-Varying Effects',
    'functional_form': functional_form,
    'assumptions': assumptions,
    'limitations': limitations,
    'sources': sources,
}

banner = "=" * 80
print(banner)
print("METHODOLOGY DOCUMENTATION")
print(banner)

for section, content in methodology.items():
    heading = section.replace('_', ' ').upper()
    print(f"\n{heading}:")

    # Lists are numbered from 1, mappings print as "name: text", and anything
    # else is printed as a single indented line
    if isinstance(content, dict):
        lines = [f"  {name}: {text}" for name, text in content.items()]
    elif isinstance(content, list):
        lines = [f"  {number}. {entry}" for number, entry in enumerate(content, 1)]
    else:
        lines = [f"  {content}"]

    for line in lines:
        print(line)

# Save methodology
methodology_path = PROCESSED_DIR / 'impact_modeling_methodology.json'
methodology_path.write_text(json.dumps(methodology, indent=2))
print(f"\n\nMethodology saved to {methodology_path}")

## 7. Summary and Next Steps

This notebook has:
1. ✅ Loaded and joined impact links with events
2. ✅ Created event-indicator association matrix
3. ✅ Built event impact model with time-varying effects
4. ✅ Validated against historical data (Telebirr example)
5. ✅ Assigned confidence levels to impact links as a starting point for refinement
6. ✅ Documented methodology and assumptions

**Next Steps for Task 4 (Forecasting):**
- Use this impact model to augment trend forecasts
- Apply event effects to baseline projections
- Generate scenario forecasts (optimistic, base, pessimistic)