# Task 1: Data Exploration and Enrichment

# Load & Inspect the Data

# Load Files

In [1]:
import pandas as pd

data_path = "../data/raw/ethiopia_fi_unified_data.csv"
ref_path = "../data/raw/reference_codes.csv"

df = pd.read_csv(data_path)
ref = pd.read_csv(ref_path)

print(df.shape)
print(ref.shape)

(43, 34)
(71, 4)


# Schema Inspection

In [2]:
df.info()
df.head()

<class 'pandas.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   record_id            43 non-null     str    
 1   record_type          43 non-null     str    
 2   category             10 non-null     str    
 3   pillar               33 non-null     str    
 4   indicator            43 non-null     str    
 5   indicator_code       43 non-null     str    
 6   indicator_direction  33 non-null     str    
 7   value_numeric        33 non-null     float64
 8   value_text           10 non-null     str    
 9   value_type           43 non-null     str    
 10  unit                 33 non-null     str    
 11  observation_date     43 non-null     str    
 12  period_start         10 non-null     str    
 13  period_end           10 non-null     str    
 14  fiscal_year          43 non-null     str    
 15  gender               43 non-null     str    
 16  loc

Unnamed: 0,record_id,record_type,category,pillar,indicator,indicator_code,indicator_direction,value_numeric,value_text,value_type,...,impact_direction,impact_magnitude,impact_estimate,lag_months,evidence_basis,comparable_country,collected_by,collection_date,original_text,notes
0,REC_0001,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,22.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Baseline year,
1,REC_0002,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,35.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,,
2,REC_0003,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,46.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,,
3,REC_0004,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,56.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Gender disaggregated,
4,REC_0005,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,36.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Gender disaggregated,


# 3️⃣ Exploratory Data Analysis (EDA)

## Step 3.1: Record Counts

In [3]:
df['record_type'].value_counts()

record_type
observation    30
event          10
target          3
Name: count, dtype: int64

In [4]:
df.groupby('pillar')['record_type'].count()

pillar
ACCESS           16
AFFORDABILITY     1
GENDER            5
USAGE            11
Name: record_type, dtype: int64

In [5]:
df['confidence'].value_counts()

confidence
high      40
medium     3
Name: count, dtype: int64

## Step 3.2: Time Coverage

In [6]:
df['observation_date'] = pd.to_datetime(df['observation_date'], errors='coerce')

df.groupby('record_type')['observation_date'].agg(['min','max'])

Unnamed: 0_level_0,min,max
record_type,Unnamed: 1_level_1,Unnamed: 2_level_1
event,2021-05-17,2025-12-18
observation,2014-12-31,2025-12-31
target,2025-12-31,2030-12-31


## Step 3.3: Indicator Inventory

In [7]:
indicators = (
    df[df['record_type'] == 'observation']
    [['indicator_code', 'indicator', 'pillar']]
    .drop_duplicates()
)

indicators

Unnamed: 0,indicator_code,indicator,pillar
0,ACC_OWNERSHIP,Account Ownership Rate,ACCESS
6,ACC_MM_ACCOUNT,Mobile Money Account Rate,ACCESS
8,ACC_4G_COV,4G Population Coverage,ACCESS
10,ACC_MOBILE_PEN,Mobile Subscription Penetration,ACCESS
11,ACC_FAYDA,Fayda Digital ID Enrollment,ACCESS
14,USG_P2P_COUNT,P2P Transaction Count,USAGE
16,USG_P2P_VALUE,P2P Transaction Value,USAGE
17,USG_ATM_COUNT,ATM Transaction Count,USAGE
18,USG_ATM_VALUE,ATM Transaction Value,USAGE
19,USG_CROSSOVER,P2P/ATM Crossover Ratio,USAGE


### Coverage by year:

In [8]:
df[df['record_type'] == 'observation'] \
  .groupby(['indicator_code', df['observation_date'].dt.year]) \
  .size()

indicator_code      observation_date
ACC_4G_COV          2023                1
                    2025                1
ACC_FAYDA           2024                1
                    2025                2
ACC_MM_ACCOUNT      2021                1
                    2024                1
ACC_MOBILE_PEN      2025                1
ACC_OWNERSHIP       2014                1
                    2017                1
                    2021                3
                    2024                1
AFF_DATA_INCOME     2024                1
GEN_GAP_ACC         2021                1
                    2024                1
GEN_GAP_MOBILE      2024                1
GEN_MM_SHARE        2024                1
USG_ACTIVE_RATE     2024                1
USG_ATM_COUNT       2025                1
USG_ATM_VALUE       2025                1
USG_CROSSOVER       2025                1
USG_MPESA_ACTIVE    2024                1
USG_MPESA_USERS     2024                1
USG_P2P_COUNT       2024               

## Step 3.4: Events Review

In [10]:
df.columns

Index(['record_id', 'record_type', 'category', 'pillar', 'indicator',
       'indicator_code', 'indicator_direction', 'value_numeric', 'value_text',
       'value_type', 'unit', 'observation_date', 'period_start', 'period_end',
       'fiscal_year', 'gender', 'location', 'region', 'source_name',
       'source_type', 'source_url', 'confidence', 'related_indicator',
       'relationship_type', 'impact_direction', 'impact_magnitude',
       'impact_estimate', 'lag_months', 'evidence_basis', 'comparable_country',
       'collected_by', 'collection_date', 'original_text', 'notes'],
      dtype='str')

In [12]:
events = df[df['record_type'] == 'event'].copy()
events[['record_id', 'indicator', 'category', 'observation_date']]


Unnamed: 0,record_id,indicator,category,observation_date
33,EVT_0001,Telebirr Launch,product_launch,2021-05-17
34,EVT_0002,Safaricom Ethiopia Commercial Launch,market_entry,2022-08-01
35,EVT_0003,M-Pesa Ethiopia Launch,product_launch,2023-08-01
36,EVT_0004,Fayda Digital ID Program Rollout,infrastructure,2024-01-01
37,EVT_0005,Foreign Exchange Liberalization,policy,2024-07-29
38,EVT_0006,P2P Transaction Count Surpasses ATM,milestone,2024-10-01
39,EVT_0007,M-Pesa EthSwitch Integration,partnership,2025-10-27
40,EVT_0008,EthioPay Instant Payment System Launch,infrastructure,2025-12-18
41,EVT_0009,NFIS-II Strategy Launch,policy,2021-09-01
42,EVT_0010,Safaricom Ethiopia Price Increase,pricing,2025-12-15


## Step 3.5: Impact Links Review

In [14]:
impact_links = df[df['record_type'] == 'impact_link']

impact_links[['record_id','pillar','related_indicator','impact_direction','lag_months']]

Unnamed: 0,record_id,pillar,related_indicator,impact_direction,lag_months


# 5️⃣ Data Enrichment — OBSERVATIONS
## Step 5.1: Create New Observations (Example)

In [15]:
new_observations = pd.DataFrame([
    {
        "record_type": "observation",
        "pillar": "usage",
        "indicator": "Smartphone penetration",
        "indicator_code": "smartphone_penetration",
        "value_numeric": 41,
        "observation_date": "2023-12-31",
        "source_name": "GSMA Mobile Economy SSA",
        "source_url": "https://www.gsma.com",
        "confidence": "medium",
        "original_text": "Smartphone adoption in Ethiopia reached approximately 41% in 2023.",
        "collected_by": "Kalkidan Alayu",
        "collection_date": "2026-01-29",
        "notes": "Key prerequisite for digital payments"
    }
])


### Append:

In [16]:
df = pd.concat([df, new_observations], ignore_index=True)

# 6️⃣ Data Enrichment — EVENTS
## Step 6.1: Add New Events

In [17]:
new_events = pd.DataFrame([
    {
        "record_type": "event",
        "event_name": "EthSwitch Interoperability Expansion",
        "category": "infrastructure",
        "event_date": "2023-06-01",
        "pillar": None,
        "source_name": "National Bank of Ethiopia",
        "source_url": "https://www.nbe.gov.et",
        "confidence": "high",
        "original_text": "EthSwitch enabled interoperable P2P transfers across banks and mobile money.",
        "collected_by": "Kalkidan Alayu",
        "collection_date": "2026-01-29",
        "notes": "Major catalyst for digital payment usage"
    }
])

### Append:

In [18]:
df = pd.concat([df, new_events], ignore_index=True)

# 7️⃣ Data Enrichment — IMPACT LINKS
## Step 7.1: Link Event → Indicator

In [19]:
event_id = df[df['event_name'] == "EthSwitch Interoperability Expansion"].index[0]

new_links = pd.DataFrame([
    {
        "record_type": "impact_link",
        "parent_id": event_id,
        "pillar": "usage",
        "related_indicator": "digital_payment_adoption",
        "impact_direction": "positive",
        "impact_magnitude": "large",
        "lag_months": 6,
        "evidence_basis": "Observed increases in interoperable payment usage in Kenya and India",
        "confidence": "medium"
    }
])

### Append:

In [20]:
df = pd.concat([df, new_links], ignore_index=True)

# 8️⃣ Validation Checks

In [22]:
ref.columns

Index(['field', 'code', 'description', 'applies_to'], dtype='str')

In [23]:
valid_categories = set(
    ref[ref['field'] == 'category']['code']
)

invalid_categories = set(df['category'].dropna()) - valid_categories
invalid_categories


set()

### Validate pillar

In [24]:
valid_pillars = set(
    ref[ref['field'] == 'pillar']['code']
)

set(df['pillar'].dropna()) - valid_pillars

{'usage'}

### Validate confidence

In [25]:
valid_confidence = set(
    ref[ref['field'] == 'confidence']['code']
)

set(df['confidence'].dropna()) - valid_confidence

set()

### Validate source_type

In [26]:
valid_sources = set(
    ref[ref['field'] == 'source_type']['code']
)

set(df['source_type'].dropna()) - valid_sources

set()

In [29]:
impact_links = df[df['record_type'] == 'impact_link']

orphan_links = impact_links.loc[
    ~impact_links['parent_id'].isin(valid_events)
]

orphan_links

Unnamed: 0,record_id,record_type,category,pillar,indicator,indicator_code,indicator_direction,value_numeric,value_text,value_type,...,lag_months,evidence_basis,comparable_country,collected_by,collection_date,original_text,notes,event_name,event_date,parent_id


In [28]:
events['category'].value_counts()

category
product_launch    2
infrastructure    2
policy            2
market_entry      1
milestone         1
partnership       1
pricing           1
Name: count, dtype: int64

# 9️⃣ Save Processed Dataset

In [30]:
output_path = "../data/processed/ethiopia_fi_unified_data_enriched.csv"
df.to_csv(output_path, index=False)