# Exploratory Data Analysis - Ethiopia Financial Inclusion

This notebook analyzes the financial inclusion data, identifying key trends, data quality issues, and the impact of events.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates

# Plot defaults used by every figure in this notebook
sns.set_theme(style="whitegrid")
plt.rc('figure', figsize=(12, 6))

# Read the enriched dataset and parse its date column in one chain.
# The enriched file mixes timestamp and plain-date strings in 'observation_date',
# hence format='mixed' rather than a single fixed format.
df = pd.read_csv('../data/ethiopia_fi_unified_data_enriched.csv').assign(
    observation_date=lambda frame: pd.to_datetime(frame['observation_date'], format='mixed')
)
print("Data Loaded Successfully. Shape:", df.shape)
df.head(5)

## 1. Data Quality Assessment

Checking for missing values, duplicates, and data consistency.

In [None]:
# Null count per column; only columns that actually have gaps are reported
missing = df.isna().sum()
print("Missing Values:\n", missing.loc[missing.gt(0)])

# Number of rows that exactly repeat an earlier row
duplicates = df.duplicated().sum()
print(f"\nDuplicates: {duplicates}")

# Hand-written caveats (these are notes, not values computed from df)
print("\nLimitations identified:")
for limitation in (
    "- Sparse data for certain years (gaps in observations).",
    "- Inconsistent source granularity (some national, some regional).",
    "- Mixed frequency of reporting (yearly vs sporadic surveys).",
):
    print(limitation)

## 2. Access Analysis: Account Ownership Trajectory

Analyzing the trend of account ownership and the observed slowdown between 2021 and 2024.

In [None]:
# Keep only observation rows of the account-ownership indicator, in date order
is_ownership = df['indicator_code'].eq('ACC_OWNERSHIP')
is_observation = df['record_type'].eq('observation')
acc_ownership = df.loc[is_ownership & is_observation].sort_values('observation_date')

# Ownership rate over time on a fixed 0-100 % scale
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=acc_ownership,
    x='observation_date',
    y='value_numeric',
    marker='o',
    label='Account Ownership (%)',
    ax=ax,
)
ax.set_title('Trajectory of Account Ownership Rate (2010-2024)')
ax.set_xlabel('Year')
ax.set_ylabel('Ownership Rate (%)')
ax.set_ylim(0, 100)

# Shade the 2021-2024 window that the slowdown discussion refers to
ax.axvspan(
    pd.Timestamp('2021-01-01'),
    pd.Timestamp('2024-06-30'),
    color='red',
    alpha=0.1,
    label='Slowdown Analysis Period',
)
ax.legend()
plt.show()

print("Insight: Growth was rapid until 2020, followed by a noticeable plateau/slowdown in 2021-2024, partially attributed to conflict and instability.")

## 3. Key Insights & Visualizations

Generating 5 key insights from the dataset.

In [None]:
# Insight 1: Mobile Money vs Traditional Banking
# First list the indicator names present in the data, to see whether separate
# mobile-money and bank-account series exist before attempting a comparison.
print("Available Indicators:", df['indicator'].unique())

# Spread of observed values within each pillar (observation rows only)
observation_rows = df.loc[df['record_type'] == 'observation']
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=observation_rows, x='pillar', y='value_numeric', ax=ax)
ax.set_title('Distribution of Values Across Pillars')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## 4. Timeline Visualization

Overlaying key events on the timeline.

In [None]:
# Events: every row flagged as an event in the unified dataset
events = df[df['record_type'] == 'event']
# An event whose date failed to parse (NaT) cannot be placed on the time axis
dated_events = events.dropna(subset=['observation_date'])

fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(
    data=acc_ownership,
    x='observation_date',
    y='value_numeric',
    marker='o',
    label='Account Ownership',
    ax=ax,
)

# Add events: one dashed vertical line plus a rotated label per event.
# Labels use a blended transform (x in data units, y as a fraction of the axes
# height) so they stay inside the plot whatever the y-range of the series is;
# a fixed data value such as y=50 can land on or above the top edge.
label_transform = ax.get_xaxis_transform()
for event_date, event_name in zip(dated_events['observation_date'], dated_events['indicator']):
    ax.axvline(x=event_date, color='green', linestyle='--', alpha=0.7)
    ax.text(
        event_date,
        0.02,
        str(event_name),
        rotation=90,
        verticalalignment='bottom',
        transform=label_transform,
    )

ax.set_title('Timeline of Events and Financial Inclusion Trends')
# Same axis labels as the trajectory plot, instead of raw column names
ax.set_xlabel('Year')
ax.set_ylabel('Ownership Rate (%)')
fig.tight_layout()
plt.show()
