# 01. Geographic Disparity - Preprocessing

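This notebook prepares the data for geographic analysis. It loads the enrollment, demographic-update, and biometric-update datasets, aggregates each one by state, district, and pincode, merges the aggregated tables, and saves the result for later analysis.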

In [None]:
import pandas as pd
import sys
import os

# Put the directory two levels above the current working directory on
# sys.path so that the `utils` package can be imported from this notebook.
_project_root = os.path.abspath('../../')
sys.path.append(_project_root)
from utils.data_loader import (
    load_biometric_data,
    load_demographic_data,
    load_enrollment_data,
)

print("Libraries imported successfully.")

## 1. Load Datasets

In [None]:
# Every loader is handed the same relative base path; what each one reads
# from there is decided inside utils.data_loader.
base_path = '../../'
enr_df = load_enrollment_data(base_path)
demo_df = load_demographic_data(base_path)
bio_df = load_biometric_data(base_path)

# Report how many rows each dataset holds as a quick sanity check.
record_counts = (
    ("Enrollment records", len(enr_df)),
    ("Demographic update records", len(demo_df)),
    ("Biometric update records", len(bio_df)),
)
for label, count in record_counts:
    print(f"{label}: {count}")

## 2. Aggregate Data
We aggregate each dataset by State, District, and Pincode (the `key_cols` defined below) to find geographic trends.

In [None]:
# Columns that identify one geographic unit; every dataset is grouped on them.
key_cols = ['state', 'district', 'pincode']


def _sum_by_geography(frame):
    """Sum the numeric columns of `frame` for each combination of key_cols."""
    # NOTE(review): groupby's default dropna=True leaves out rows whose key
    # is NaN - confirm that is acceptable for this data.
    grouped = frame.groupby(key_cols)
    # numeric_only=True restricts the sum to numeric columns; the notebook's
    # author flags this as critical for aggregation speed.
    totals = grouped.sum(numeric_only=True)
    # Turn the group keys back into ordinary columns so they can be merged on.
    return totals.reset_index()


print("Aggregating data... (Optimized for speed)")

enr_geo = _sum_by_geography(enr_df)
demo_geo = _sum_by_geography(demo_df)
bio_geo = _sum_by_geography(bio_df)

print("Aggregation complete.")

## 3. Merge Datasets

In [None]:
# Outer joins keep every key combination that appears in any of the three
# aggregated tables. The suffixes only take effect when two tables share a
# non-key column name.
enr_with_demo = enr_geo.merge(demo_geo, on=key_cols, how='outer', suffixes=('_enr', '_demo'))
merged_geo = enr_with_demo.merge(bio_geo, on=key_cols, how='outer', suffixes=('', '_bio'))

# A key missing from one table shows up as NaN after the outer join; treat
# that as zero enrollments/updates for the area.
merged_geo = merged_geo.fillna(0)

# Composite metrics: row-wise totals over the update and enrollment columns.
update_cols = ['demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_']
enrollment_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
merged_geo['total_updates'] = merged_geo[update_cols].sum(axis=1)
merged_geo['total_enrollments'] = merged_geo[enrollment_cols].sum(axis=1)

print("Merge complete.")

## 4. Save Processed Data

In [None]:
# Make sure the output folder exists, then write the merged table to CSV
# without the DataFrame index.
output_dir = '../../processed_data'
os.makedirs(output_dir, exist_ok=True)
merged_geo.to_csv(f'{output_dir}/geographic_data.csv', index=False)
print("Processed data saved to processed_data/geographic_data.csv")