# Task 1: Data Exploration and Enrichment

## Objective
Understand the starter dataset and enrich it with additional data useful for forecasting financial inclusion in Ethiopia.

## 1. Load and Explore the Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Default plot styling
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Input (raw) and output (processed) data locations
DATA_DIR = Path('../data/raw')
PROCESSED_DIR = Path('../data/processed')
# parents=True: also create any missing intermediate directories, so a
# checkout without ../data does not fail with FileNotFoundError.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the unified dataset CSV from the raw-data directory
unified_csv_path = DATA_DIR / 'ethiopia_fi_unified_data.csv'
df = pd.read_csv(unified_csv_path)

# Report dimensions and column names, then preview the first rows
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few rows:")
df.head()

In [None]:
# Read the reference-code lookup table from the raw-data directory
reference_csv_path = DATA_DIR / 'reference_codes.csv'
ref_codes = pd.read_csv(reference_csv_path)
print(f"Reference codes shape: {ref_codes.shape}")
# Preview the first 20 rows
ref_codes.head(n=20)

## 2. Understand the Schema

In [None]:
# Tally how many rows belong to each record_type
record_counts = df['record_type'].value_counts()

print("=== Record Type Distribution ===")
print(record_counts)
# Overall row count, including rows whose record_type is missing
print(f"\nTotal records: {df.shape[0]}")

In [None]:
# Explore structure by record type: print the row count and one sample row
# for every record_type value present in the dataset.
# dropna(): a missing record_type is a float NaN, which has no .upper() and
# never matches the equality filter below, so it is skipped instead of
# crashing the cell.
for record_type in df['record_type'].dropna().unique():
    # str() keeps the header working even if a type value is not a string
    print(f"\n=== {str(record_type).upper()} Records ===")
    subset = df[df['record_type'] == record_type]
    print(f"Count: {len(subset)}")
    # Every value returned by unique() occurs at least once, so iloc[0] is safe
    print("\nSample record:")
    print(subset.iloc[0].to_dict())

In [None]:
# Value counts for each categorical column, printed only when the column
# exists in the dataset. Each entry pairs a column with its printed heading.
distribution_headings = (
    ('pillar', "=== Pillar Distribution ==="),
    ('source_type', "\n=== Source Type Distribution ==="),
    ('confidence', "\n=== Confidence Distribution ==="),
)

for column_name, heading in distribution_headings:
    if column_name not in df.columns:
        continue
    print(heading)
    print(df[column_name].value_counts())

## 3. Temporal Analysis

In [None]:
# Identify temporal range: every column whose name contains "date" is parsed
# to datetime in place, then its min/max and non-null count are reported.
date_cols = [col for col in df.columns if 'date' in col.lower()]
print(f"Date columns found: {date_cols}")

for col in date_cols:
    # Count populated cells before parsing so coercion losses can be reported
    present_before = df[col].notna().sum()
    # errors='coerce' replaces unparseable values with NaT instead of raising
    df[col] = pd.to_datetime(df[col], errors='coerce')
    parsed_count = df[col].notna().sum()
    print(f"\n{col}:")
    print(f"  Range: {df[col].min()} to {df[col].max()}")
    print(f"  Non-null count: {parsed_count}")
    # Surface values that were silently turned into NaT by the coercion
    unparseable = present_before - parsed_count
    if unparseable:
        print(f"  WARNING: {unparseable} value(s) could not be parsed and are now NaT")

## 4. Indicator Coverage

In [None]:
# List every indicator code used by observation records together with the
# number of observations recorded for it.
if 'indicator_code' in df.columns:
    observations = df[df['record_type'] == 'observation']
    # value_counts() counts all codes in a single pass and drops missing codes,
    # which would otherwise break sorting (NaN cannot be ordered against str).
    indicator_counts = observations['indicator_code'].value_counts().sort_index()
    indicators = indicator_counts.index.to_numpy()
    print(f"Unique indicators: {len(indicators)}")
    print("\nIndicators:")
    for ind, count in indicator_counts.items():
        print(f"  {ind}: {count} observations")

## 5. Event Analysis

In [None]:
# Explore events: category breakdown and a chronological listing.
events = df[df['record_type'] == 'event'].copy()
print(f"Total events: {len(events)}")

has_category = 'category' in events.columns
if has_category:
    print("\nEvent categories:")
    print(events['category'].value_counts())

# Prefer the dedicated event_date column; fall back to observation_date.
date_col = next(
    (name for name in ('event_date', 'observation_date') if name in events.columns),
    None,
)
if date_col is not None:
    events[date_col] = pd.to_datetime(events[date_col], errors='coerce')
    # Only show 'category' when it exists; selecting a missing column
    # would raise KeyError.
    timeline_cols = [date_col, 'category'] if has_category else [date_col]
    print("\nEvents timeline:")
    print(events[timeline_cols].sort_values(date_col))

## 6. Impact Links Analysis

In [None]:
# Isolate the impact_link rows on an independent copy
is_impact_link = df['record_type'] == 'impact_link'
impact_links = df[is_impact_link].copy()
print(f"Total impact links: {len(impact_links)}")

# Breakdowns to print, each shown only when its column exists
impact_breakdowns = (
    ('parent_id', "\nImpact links by event:"),
    ('impact_direction', "\nImpact directions:"),
    ('pillar', "\nImpact links by pillar:"),
)

for column_name, heading in impact_breakdowns:
    if column_name not in impact_links.columns:
        continue
    print(heading)
    print(impact_links[column_name].value_counts())

## 7. Data Quality Assessment

In [None]:
# Per-column missing-value counts and percentages
print("=== Missing Values by Column ===")
missing = df.isna().sum()
missing_pct = (missing / len(df) * 100).round(2)
missing_df = pd.DataFrame({'Missing Count': missing, 'Missing %': missing_pct})

# Show only columns that have gaps, worst first
has_missing = missing_df['Missing Count'] > 0
columns_with_gaps = missing_df.loc[has_missing]
print(columns_with_gaps.sort_values('Missing Count', ascending=False))

## 8. Data Enrichment

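In this section, we'll add new observations, events, and impact_links that will be useful for forecasting. Each helper function below only builds and returns a record as a dictionary; append the returned record to `new_records` so that it is included when the enriched dataset is saved in Section 9.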

In [None]:
# Work on an independent copy of the loaded data (deep=True is the pandas default)
df_enriched = df.copy(deep=True)

# Accumulates the record dicts that are appended to df_enriched before saving
new_records = []

### 8.1 Add Additional Observations

Based on the Additional Data Points Guide, we'll add:
- Infrastructure data (4G coverage, mobile penetration, ATM density)
- Active account metrics
- Agent network data
- Transaction volume data

In [None]:
# Example: Add infrastructure observations
# Note: Replace with actual data from sources like ITU, GSMA, NBE reports

def add_observation(record_type, pillar, indicator, indicator_code, value_numeric, 
                   observation_date, source_name, source_url, confidence='medium',
                   original_text='', notes='', collected_by='Data Team', collection_date='2026-02-01'):
    """Build an observation record as a dict keyed by schema column name.

    The record is only returned; this function does not append it to any
    collection, so the caller decides where to store it.
    """
    return dict(
        record_type=record_type,
        pillar=pillar,
        indicator=indicator,
        indicator_code=indicator_code,
        value_numeric=value_numeric,
        observation_date=observation_date,
        source_name=source_name,
        source_url=source_url,
        confidence=confidence,
        original_text=original_text,
        notes=notes,
        collected_by=collected_by,
        collection_date=collection_date,
    )

# Placeholder cell: no observations are created here yet.
# Real values should come from sources such as ITU, GSMA, or NBE reports.

print("Adding new observations...")
print("Note: Replace sample data with actual data from sources")

### 8.2 Add Additional Events

Add important events that aren't yet captured:
- Regulatory changes
- Infrastructure investments
- Partnership announcements
- Market milestones

In [None]:
def add_event(record_type, category, event_date, event_name, description,
             source_name, source_url, confidence='medium', original_text='',
             notes='', collected_by='Data Team', collection_date='2026-02-01'):
    """Build an event record as a dict keyed by schema column name.

    The record is only returned; this function does not append it to any
    collection. No 'pillar' key is set, because pillar is meant to stay
    empty for events.
    """
    return {
        # What happened and when
        'record_type': record_type,
        'category': category,
        'event_date': event_date,
        'event_name': event_name,
        'description': description,
        # Provenance
        'source_name': source_name,
        'source_url': source_url,
        'confidence': confidence,
        'original_text': original_text,
        'notes': notes,
        # Collection metadata
        'collected_by': collected_by,
        'collection_date': collection_date,
    }

print("Adding new events...")
print("Note: Replace sample data with actual events from news, reports, etc.")

### 8.3 Add Additional Impact Links

Create relationships between events and indicators based on:
- Comparable country evidence
- Expert analysis
- Historical patterns

In [None]:
def add_impact_link(record_type, parent_id, pillar, related_indicator, 
                    impact_direction, impact_magnitude, lag_months, evidence_basis,
                    source_name, source_url, confidence='medium', original_text='',
                    notes='', collected_by='Data Team', collection_date='2026-02-01'):
    """Build an impact-link record as a dict keyed by schema column name.

    The record is only returned; this function does not append it to any
    collection, so the caller decides where to store it.
    """
    # Column names and their values, kept in the same order
    columns = (
        'record_type', 'parent_id', 'pillar', 'related_indicator',
        'impact_direction', 'impact_magnitude', 'lag_months', 'evidence_basis',
        'source_name', 'source_url', 'confidence', 'original_text',
        'notes', 'collected_by', 'collection_date',
    )
    values = (
        record_type, parent_id, pillar, related_indicator,
        impact_direction, impact_magnitude, lag_months, evidence_basis,
        source_name, source_url, confidence, original_text,
        notes, collected_by, collection_date,
    )
    return dict(zip(columns, values))

print("Adding new impact links...")
print("Note: Replace with actual impact estimates based on evidence")

## 9. Save Enriched Dataset

In [None]:
# Append any collected records to the enriched dataset, then write it to disk.
if new_records:
    new_df = pd.DataFrame(new_records)
    # pd.concat aligns on column names and fills columns absent from one frame
    # with missing values, so new_df does not need manual padding. Padding with
    # all-None object columns is redundant and, depending on the pandas version,
    # can influence the dtype chosen for the combined column.
    df_enriched = pd.concat([df_enriched, new_df], ignore_index=True)
    print(f"Added {len(new_records)} new records")
    print(f"Total records: {len(df_enriched)}")
else:
    print("No new records added yet. Add records using the helper functions above.")

# Save enriched dataset (written even when no new records were added)
enriched_path = PROCESSED_DIR / 'ethiopia_fi_unified_data_enriched.csv'
df_enriched.to_csv(enriched_path, index=False)
print(f"\nEnriched dataset saved to {enriched_path}")