# Task 4: Forecasting Access and Usage

## Objective
Forecast Account Ownership (Access) and Digital Payment Usage for 2025-2027.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from pathlib import Path
import warnings
import sys
# NOTE: silences every warning category for the remainder of the session
warnings.filterwarnings('ignore')

# Import event impact model from Task 3
if '../src' not in sys.path:
    # Avoid growing sys.path each time the cell is re-run
    sys.path.append('../src')
try:
    from task3_event_impact_modeling import EventImpactModel
except Exception as exc:
    # Best-effort import: a simplified EventImpactModel is defined later in
    # this notebook, so report the failure instead of aborting. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    print(f"Task 3 EventImpactModel not imported ({type(exc).__name__}: {exc}); "
          "using the simplified model defined in this notebook")

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

# Paths
DATA_DIR = Path('../data/raw')
PROCESSED_DIR = Path('../data/processed')
REPORTS_DIR = Path('../reports/figures')
REPORTS_DIR.mkdir(exist_ok=True, parents=True)

print("Libraries loaded successfully")

## 1. Load Data and Prepare Time Series

In [None]:
# Load data: prefer the enriched dataset, fall back to the raw one
enriched_path = PROCESSED_DIR / 'ethiopia_fi_unified_data_enriched.csv'
if enriched_path.exists():
    df = pd.read_csv(enriched_path)
else:
    df = pd.read_csv(DATA_DIR / 'ethiopia_fi_unified_data.csv')

# Split the unified table by record type
observations = df[df['record_type'] == 'observation'].copy()
events = df[df['record_type'] == 'event'].copy()
impact_links = df[df['record_type'] == 'impact_link'].copy()


def _add_time_columns(obs):
    """Return ``obs`` sorted by date with ``year`` and ``year_numeric`` added.

    ``year`` is the calendar year of ``observation_date``; ``year_numeric`` is
    the time elapsed since the earliest observation, in years of 365.25 days.
    The frame is returned unchanged when it has no ``observation_date`` column.
    """
    if 'observation_date' not in obs.columns:
        return obs
    obs = obs.copy()
    obs['observation_date'] = pd.to_datetime(obs['observation_date'])
    obs = obs.sort_values('observation_date')
    obs['year'] = obs['observation_date'].dt.year
    elapsed = obs['observation_date'] - obs['observation_date'].min()
    obs['year_numeric'] = elapsed.dt.days / 365.25
    return obs


# Prepare account ownership time series
acc_obs = _add_time_columns(observations[
    (observations['indicator_code'].str.contains('ACC_OWNERSHIP', case=False, na=False)) |
    (observations['indicator'] == 'Account Ownership Rate')
])

print("Account Ownership Data:")
if 'year' in acc_obs.columns:
    print(acc_obs[['year', 'value_numeric']].to_string(index=False))
else:
    # Previously this path raised KeyError because 'year' was never created
    print("No observation_date column found; account ownership time series unavailable")

# Prepare digital payment usage time series
# NOTE(review): the pillar == 'usage' clause selects every usage-pillar
# observation regardless of indicator -- confirm they all share one unit
# before they are fitted as a single series.
usage_obs = _add_time_columns(observations[
    (observations['indicator_code'].str.contains('USG_DIGITAL', case=False, na=False)) |
    (observations['indicator_code'].str.contains('DIGITAL_PAYMENT', case=False, na=False)) |
    (observations['pillar'] == 'usage')
])

if 'year' in usage_obs.columns and len(usage_obs) > 0:
    print("\nDigital Payment Usage Data:")
    print(usage_obs[['year', 'value_numeric']].to_string(index=False))
else:
    print("\nLimited digital payment usage data available")

## 2. Baseline Trend Forecast (Linear Regression)

In [None]:
def fit_trend_model(data, year_col='year', value_col='value_numeric',
                    forecast_years=(2025, 2026, 2027)):
    """
    Fit a linear trend to a time series and extrapolate it.

    Parameters:
    -----------
    data : DataFrame holding the observations
    year_col : name of the column used as the regressor (time axis)
    value_col : name of the column holding the observed values
    forecast_years : years to extrapolate to, appended after the historical
        years in the returned arrays

    Returns:
    --------
    None when a trend cannot be estimated (fewer than two usable rows, or
    every usable row falls in the same year). Otherwise a dict with:
        years       : historical years followed by ``forecast_years``
        predictions : fitted / forecast value for each entry of ``years``
        lower_ci    : lower bound of the 95% prediction interval
        upper_ci    : upper bound of the 95% prediction interval
        model       : the fitted LinearRegression
        r_squared   : in-sample coefficient of determination

    The interval bounds are NaN when only two observations are available,
    because the residual variance then has zero degrees of freedom.
    """
    if len(data) < 2:
        return None

    # Rows missing either coordinate cannot take part in the regression
    clean = data.dropna(subset=[year_col, value_col])
    if len(clean) < 2:
        return None

    X = clean[[year_col]].values
    y = clean[value_col].values

    x_mean = np.mean(X)
    sxx = np.sum((X - x_mean) ** 2)
    if sxx == 0:
        # All observations share one year: the slope is undefined
        return None

    # Fit linear regression
    model = LinearRegression()
    model.fit(X, y)

    # Predictions for historical and future years
    historical_years = clean[year_col].values
    all_years = np.concatenate([historical_years, np.asarray(forecast_years)])
    predictions = model.predict(all_years.reshape(-1, 1))

    # 95% prediction interval: t * s * sqrt(1 + 1/n + (x - mean)^2 / Sxx),
    # where s is the residual standard error with n - 2 degrees of freedom
    # (two parameters, slope and intercept, are estimated).
    residuals = y - model.predict(X)
    n = len(clean)
    dof = n - 2
    if dof > 0:
        std_error = np.sqrt(np.sum(residuals ** 2) / dof)
        t_value = stats.t.ppf(0.975, dof)
        ci_width = t_value * std_error * np.sqrt(
            1 + 1 / n + (all_years - x_mean) ** 2 / sxx
        )
    else:
        ci_width = np.full(all_years.shape, np.nan)

    return {
        'years': all_years,
        'predictions': predictions,
        'lower_ci': predictions - ci_width,
        'upper_ci': predictions + ci_width,
        'model': model,
        'r_squared': model.score(X, y)
    }

# Baseline (trend-only) projection of account ownership
banner = "=" * 80
print(banner)
print("ACCOUNT OWNERSHIP FORECAST (Baseline Trend)")
print(banner)

if len(acc_obs) < 2:
    print("Insufficient data for trend forecast")
    acc_forecast = None
else:
    acc_forecast = fit_trend_model(acc_obs, 'year', 'value_numeric')

    if acc_forecast:
        print(f"\nModel R-squared: {acc_forecast['r_squared']:.3f}")
        print("\nForecast (Baseline Trend):")
        # Walk the forecast-horizon entries of the parallel result arrays
        horizon = acc_forecast['years'] >= 2025
        horizon_rows = zip(
            acc_forecast['years'][horizon],
            acc_forecast['predictions'][horizon],
            acc_forecast['lower_ci'][horizon],
            acc_forecast['upper_ci'][horizon],
        )
        for yr, point, low, high in horizon_rows:
            print(f"  {int(yr)}: {point:.1f}% (95% CI: {low:.1f}% - {high:.1f}%)")

In [None]:
# Baseline (trend-only) projection of digital payment usage
banner = "=" * 80
print("\n" + banner)
print("DIGITAL PAYMENT USAGE FORECAST (Baseline Trend)")
print(banner)

if len(usage_obs) < 2:
    print("Insufficient data for trend forecast")
    usage_forecast = None
else:
    usage_forecast = fit_trend_model(usage_obs, 'year', 'value_numeric')

    if usage_forecast:
        print(f"\nModel R-squared: {usage_forecast['r_squared']:.3f}")
        print("\nForecast (Baseline Trend):")
        # Walk the forecast-horizon entries of the parallel result arrays
        horizon = usage_forecast['years'] >= 2025
        horizon_rows = zip(
            usage_forecast['years'][horizon],
            usage_forecast['predictions'][horizon],
            usage_forecast['lower_ci'][horizon],
            usage_forecast['upper_ci'][horizon],
        )
        for yr, point, low, high in horizon_rows:
            print(f"  {int(yr)}: {point:.1f}% (95% CI: {low:.1f}% - {high:.1f}%)")

## 3. Event-Augmented Forecast

In [None]:
# Load event impact model (simplified version if import failed)
class EventImpactModel:
    """Simplified additive event-impact model.

    For a given indicator and date, sums the signed ``impact_magnitude`` of
    every impact link whose ``related_indicator`` contains the indicator name
    and whose parent event's lag has elapsed by that date.

    NOTE(review): a link's ``parent_id`` is matched against the *index labels*
    of the events frame -- confirm that this is how links reference events in
    the dataset.
    """

    # Date assumed for events without a usable ``event_date``; it is far
    # enough in the past that such events count as already in effect.
    _DEFAULT_EVENT_DATE = '2000-01-01'

    def __init__(self):
        self.impact_links = None
        self.events = None

    def load_data(self, impact_links_df, events_df):
        """Store private copies of the impact-link and event tables."""
        self.impact_links = impact_links_df.copy()
        self.events = events_df.copy()

    @staticmethod
    def _id_key(value):
        """Return a canonical string for an identifier, or None when missing.

        Integer-valued floats (``3.0``) become ``'3'`` so that an id column
        which turned float because it also holds NaN still matches integer
        index labels.
        """
        if value is None:
            return None
        try:
            if pd.isna(value):
                return None
        except (TypeError, ValueError):
            pass
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    def get_indicator_impact(self, indicator, date):
        """Get total impact on indicator from all events up to date.

        Parameters
        ----------
        indicator : pattern matched case-insensitively (``str.contains``)
            against the links' ``related_indicator`` column
        date : anything ``pd.to_datetime`` accepts

        Returns
        -------
        float
            Sum of link magnitudes, negated where ``impact_direction`` is
            'negative'. Returns 0.0 when no data is loaded or nothing applies.
            Links with a missing or non-numeric magnitude are skipped, and a
            missing ``lag_months`` is treated as 0.
        """
        if self.impact_links is None or self.events is None:
            return 0.0
        if 'related_indicator' not in self.impact_links.columns:
            return 0.0

        date = pd.to_datetime(date)

        # Get impact links for this indicator. astype(str) keeps the .str
        # accessor usable when the column is entirely NaN (float dtype); the
        # notna() mask stops NaN rows from matching as the text 'nan'.
        related = self.impact_links['related_indicator']
        matches = related.notna() & related.astype(str).str.contains(
            indicator, case=False, na=False
        )
        indicator_links = self.impact_links[matches]

        # Map each event id to the position of its first row, built once
        # rather than re-scanning the index for every link.
        first_row_by_id = {}
        for position, label in enumerate(self.events.index):
            first_row_by_id.setdefault(self._id_key(label), position)

        total_impact = 0.0
        for _, link in indicator_links.iterrows():
            parent_key = self._id_key(link.get('parent_id'))
            if parent_key is None or parent_key not in first_row_by_id:
                continue
            event = self.events.iloc[first_row_by_id[parent_key]]

            event_date = pd.to_datetime(
                event.get('event_date', self._DEFAULT_EVENT_DATE), errors='coerce'
            )
            if pd.isna(event_date):
                event_date = pd.to_datetime(self._DEFAULT_EVENT_DATE)
            months_since = (
                (date.year - event_date.year) * 12 + (date.month - event_date.month)
            )

            lag_months = pd.to_numeric(link.get('lag_months', 0), errors='coerce')
            if pd.isna(lag_months):
                lag_months = 0
            if months_since < lag_months:
                continue

            impact_magnitude = pd.to_numeric(
                link.get('impact_magnitude', 0), errors='coerce'
            )
            if pd.isna(impact_magnitude):
                # A NaN here would otherwise turn the whole total into NaN
                continue

            direction = str(link.get('impact_direction', 'positive')).strip().lower()
            if direction == 'negative':
                impact_magnitude = -impact_magnitude

            total_impact += float(impact_magnitude)

        return total_impact

# Wire the impact model to this notebook's impact-link and event tables
impact_model = EventImpactModel()
impact_model.load_data(impact_links, events)

# Event-augmented forecast = trend baseline + summed event impact at year end
rule = "=" * 80
print(rule)
print("EVENT-AUGMENTED FORECASTS")
print(rule)

forecast_years = [2025, 2026, 2027]
acc_event_forecast = []
usage_event_forecast = []

# (trend result, indicator pattern, list that collects the yearly rows)
augmentation_targets = (
    (acc_forecast, 'ACC_OWNERSHIP', acc_event_forecast),
    (usage_forecast, 'USG_DIGITAL', usage_event_forecast),
)

for trend_result, indicator_code, collected in augmentation_targets:
    if not trend_result:
        continue
    for year in forecast_years:
        year_end = pd.to_datetime(f'{year}-12-31')
        trend_value = trend_result['model'].predict([[year]])[0]
        shock = impact_model.get_indicator_impact(indicator_code, year_end)
        collected.append({
            'year': year,
            'baseline': trend_value,
            'event_impact': shock,
            'forecast': trend_value + shock
        })

report_sections = (
    ("\nAccount Ownership (Event-Augmented):", acc_event_forecast),
    ("\nDigital Payment Usage (Event-Augmented):", usage_event_forecast),
)

for heading, rows in report_sections:
    if not rows:
        continue
    print(heading)
    for f in rows:
        print(f"  {f['year']}: {f['forecast']:.1f}% (baseline: {f['baseline']:.1f}%, event impact: {f['event_impact']:+.1f}pp)")

## 4. Scenario Analysis

In [None]:
# Define scenario multipliers
scenarios = {
    'optimistic': {
        'trend_multiplier': 1.2,  # 20% faster growth
        'event_impact_multiplier': 1.3,  # 30% stronger event impacts
        'description': 'Optimistic: Faster growth, stronger event impacts'
    },
    'base': {
        'trend_multiplier': 1.0,
        'event_impact_multiplier': 1.0,
        'description': 'Base: Current trend and event impacts'
    },
    'pessimistic': {
        'trend_multiplier': 0.8,  # 20% slower growth
        'event_impact_multiplier': 0.7,  # 30% weaker event impacts
        'description': 'Pessimistic: Slower growth, weaker event impacts'
    }
}


def _scenario_series(trend_result, history, indicator_code, params):
    """Project one indicator under one scenario for every forecast year.

    The trend multiplier scales the *growth* that the fitted trend projects
    after the last observed year; it is not applied to the forecast level.
    Multiplying the level by 1.2 would lift a 50% baseline to 60% no matter
    how steep the trend is, which is not "20% faster growth". With a
    multiplier of 1.0 the result equals baseline + event impact.

    Returns a list of ``{'year', 'forecast'}`` dicts, empty when no trend
    model is available.
    """
    if not trend_result:
        return []

    model = trend_result['model']
    # Trend value at the last observed year: the point growth is measured from
    anchor = model.predict([[history['year'].max()]])[0]

    series = []
    for year in forecast_years:
        baseline = model.predict([[year]])[0]
        growth = baseline - anchor
        multiplier = params['trend_multiplier']
        if growth < 0:
            # For a declining trend, mirror the multiplier so the optimistic
            # case declines more slowly and the pessimistic one faster.
            multiplier = 2 - multiplier
        trend_adjusted = anchor + growth * multiplier

        event_impact = impact_model.get_indicator_impact(
            indicator_code, pd.to_datetime(f'{year}-12-31')
        )
        event_adjusted = event_impact * params['event_impact_multiplier']

        series.append({
            'year': year,
            'forecast': trend_adjusted + event_adjusted
        })
    return series


# Generate scenario forecasts
scenario_forecasts = {}

for scenario_name, params in scenarios.items():
    scenario_forecasts[scenario_name] = {
        'account_ownership': _scenario_series(
            acc_forecast, acc_obs, 'ACC_OWNERSHIP', params
        ),
        'digital_payment_usage': _scenario_series(
            usage_forecast, usage_obs, 'USG_DIGITAL', params
        )
    }

# Display scenario forecasts
print("=" * 80)
print("SCENARIO FORECASTS")
print("=" * 80)

for scenario_name in ['optimistic', 'base', 'pessimistic']:
    print(f"\n{scenario_name.upper()} Scenario:")
    print(f"  {scenarios[scenario_name]['description']}")

    if scenario_forecasts[scenario_name]['account_ownership']:
        print("\n  Account Ownership:")
        for f in scenario_forecasts[scenario_name]['account_ownership']:
            print(f"    {f['year']}: {f['forecast']:.1f}%")

    if scenario_forecasts[scenario_name]['digital_payment_usage']:
        print("\n  Digital Payment Usage:")
        for f in scenario_forecasts[scenario_name]['digital_payment_usage']:
            print(f"    {f['year']}: {f['forecast']:.1f}%")

## 5. Visualize Forecasts

In [None]:
# Create forecast visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Defined before either panel is drawn: it used to live inside the account
# ownership branch, so the usage panel raised NameError when that branch
# was skipped.
colors = {'optimistic': 'green', 'base': 'orange', 'pessimistic': 'red'}


def _plot_forecast_panel(ax, history, trend_result, scenario_key, label, history_color):
    """Draw history, baseline forecast with its 95% band and scenario lines."""
    # Historical data
    ax.plot(history['year'], history['value_numeric'], 'o-',
            linewidth=2, markersize=8, label='Historical', color=history_color)

    # Forecast-horizon slice of the trend result
    horizon = trend_result['years'] >= 2025
    horizon_years = trend_result['years'][horizon]

    # Baseline forecast with confidence band
    ax.plot(horizon_years, trend_result['predictions'][horizon], '--',
            linewidth=2, label='Baseline Forecast', color='coral')
    ax.fill_between(horizon_years,
                    trend_result['lower_ci'][horizon],
                    trend_result['upper_ci'][horizon],
                    alpha=0.3, color='coral', label='95% CI')

    # Scenario forecasts
    for scenario_name in ['optimistic', 'base', 'pessimistic']:
        points = scenario_forecasts[scenario_name][scenario_key]
        if points:
            ax.plot([p['year'] for p in points], [p['forecast'] for p in points], 's-',
                    linewidth=1.5, markersize=6, label=f'{scenario_name.capitalize()}',
                    color=colors[scenario_name], alpha=0.7)

    ax.set_title(f'{label} Forecast (2025-2027)', fontsize=14, fontweight='bold')
    ax.set_xlabel('Year', fontsize=12)
    ax.set_ylabel(f'{label} Rate (%)', fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)


def _plot_placeholder(ax, label, short_name):
    """Mark a panel as unavailable instead of leaving it blank."""
    ax.text(0.5, 0.5, f'Insufficient data\nfor {short_name} forecast',
            ha='center', va='center', fontsize=12, transform=ax.transAxes)
    ax.set_title(f'{label} Forecast', fontsize=14, fontweight='bold')


# (history, trend result, key in scenario_forecasts, label, short name, colour)
panels = [
    (acc_obs, acc_forecast, 'account_ownership',
     'Account Ownership', 'access', 'steelblue'),
    (usage_obs, usage_forecast, 'digital_payment_usage',
     'Digital Payment Usage', 'usage', 'mediumseagreen'),
]

for ax, (history, trend_result, scenario_key, label, short_name, history_color) in zip(axes, panels):
    if trend_result and len(history) > 0:
        _plot_forecast_panel(ax, history, trend_result, scenario_key, label, history_color)
    else:
        # Covers both "no observations" and "too few observations to fit"
        _plot_placeholder(ax, label, short_name)

plt.tight_layout()
plt.savefig(REPORTS_DIR / 'forecasts_2025_2027.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Forecast Summary Table

In [None]:
# Create forecast summary table
forecast_summary = []

# (column prefix, event-augmented base forecasts, key in scenario_forecasts)
summary_specs = [
    ('Account_Ownership', acc_event_forecast, 'account_ownership'),
    ('Digital_Payment', usage_event_forecast, 'digital_payment_usage'),
]


def _forecast_for_year(entries, year, default=None):
    """Return the 'forecast' of the first entry for ``year``, else ``default``."""
    return next((e['forecast'] for e in entries if e['year'] == year), default)


for year in forecast_years:
    row = {'Year': year}

    for prefix, base_entries, scenario_key in summary_specs:
        base_value = _forecast_for_year(base_entries, year)
        if base_value is None:
            # No event-augmented forecast for this indicator/year: omit columns
            continue
        row[f'{prefix}_Base'] = f"{base_value:.1f}%"
        for scenario_name in ('optimistic', 'pessimistic'):
            scenario_value = _forecast_for_year(
                scenario_forecasts[scenario_name][scenario_key], year, 0
            )
            row[f'{prefix}_{scenario_name.capitalize()}'] = f"{scenario_value:.1f}%"

    forecast_summary.append(row)

forecast_df = pd.DataFrame(forecast_summary)
print("=" * 80)
print("FORECAST SUMMARY TABLE (2025-2027)")
print("=" * 80)
print(forecast_df.to_string(index=False))

# Save to CSV. The processed directory is not created anywhere earlier in the
# notebook, so create it here; otherwise to_csv fails when it is absent.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
summary_path = PROCESSED_DIR / 'forecasts_2025_2027.csv'
forecast_df.to_csv(summary_path, index=False)
print(f"\nForecast table saved to {summary_path}")

## 7. Interpretation and Key Findings

### Model Predictions

**Account Ownership (Access):**
- Baseline trend suggests continued growth
- Event impacts (especially M-Pesa) expected to accelerate growth
- Scenario range reflects uncertainty in event impacts and trend continuation

**Digital Payment Usage:**
- Strong growth trajectory expected
- Event impacts may be more pronounced for usage than access
- Higher uncertainty due to limited historical data

### Events with Largest Potential Impact

1. **M-Pesa Entry (Aug 2023)**: Competition effect expected to boost both access and usage
2. **Telebirr Expansion**: Continued growth in mobile money adoption
3. **Infrastructure Investments**: Network and agent expansion supporting usage

### Key Uncertainties

1. **Data Limitations**: Sparse time series (only 5 Findex points) increases forecast uncertainty
2. **Event Impact Estimates**: Many based on comparable country evidence, not Ethiopian data
3. **Recent Events**: M-Pesa too recent to fully assess impact
4. **External Factors**: Economic conditions, policy changes, infrastructure investments not fully captured
5. **Registered vs. Active Gap**: Large gap between registered accounts and survey-reported ownership

### Recommendations

1. **Monitor Key Indicators**: Track mobile money active accounts, transaction volumes
2. **Validate Forecasts**: Compare 2025 forecasts with actual data when available
3. **Refine Model**: Incorporate additional data as it becomes available
4. **Scenario Planning**: Use scenario range for planning, not point estimates