# 02. Age Demographics - Preprocessing

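This notebook prepares the data for age-based demographic analysis. It sums the age-group columns of the enrollment, demographic, and biometric datasets per state, district, and pincode, merges the three results into a single table, and saves that table as a CSV file.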

In [None]:
import pandas as pd
import sys
import os

# Make the directory two levels above the current working directory importable
# so that the `utils` package can be found. This line must run before the
# `utils.data_loader` import below, otherwise that import cannot be resolved
# from this location.
sys.path.append(os.path.abspath('../../'))
from utils.data_loader import load_enrollment_data, load_demographic_data, load_biometric_data

# Simple confirmation that the cell ran to completion.
print("Libraries imported successfully.")

## 1. Load Datasets

In [None]:
# Every loader receives the same relative location (two directory levels up).
# NOTE(review): presumably this is the base folder the loaders search for the
# raw files -- confirm against utils.data_loader.
base_path = '../../'

enr_df = load_enrollment_data(base_path)
demo_df = load_demographic_data(base_path)
bio_df = load_biometric_data(base_path)

print("Data loaded.")

## 2. Consolidate Age Columns
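We want to compare each age group's counts across the enrollment data and the demographic and biometric update data. To do that, we sum the age columns of each dataset per state, district, and pincode, outer-merge the three results on those keys, and fill the gaps with 0.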

In [None]:
# Columns that identify one location; every dataset is grouped and joined on them.
key_cols = ['state', 'district', 'pincode']


def _sum_age_columns(frame, age_cols):
    """Return per-location totals of `age_cols`, with `key_cols` as regular columns."""
    per_location = frame.groupby(key_cols)[age_cols]
    return per_location.sum(numeric_only=True).reset_index()


print("Aggregating data... (Optimized for speed)")

enr_age = _sum_age_columns(enr_df, ['age_0_5', 'age_5_17', 'age_18_greater'])
demo_age = _sum_age_columns(demo_df, ['demo_age_5_17', 'demo_age_17_'])
bio_age = _sum_age_columns(bio_df, ['bio_age_5_17', 'bio_age_17_'])

# Outer joins keep every key combination that occurs in at least one dataset.
# Columns coming from a dataset that lacks a given combination are NaN after
# the join, so all remaining NaN values are replaced with 0.
age_data = (
    enr_age
    .merge(demo_age, on=key_cols, how='outer')
    .merge(bio_age, on=key_cols, how='outer')
    .fillna(0)
)

print("Age consolidation complete.")

## 3. Save Processed Data

In [None]:
# Destination folder and file for the consolidated age table.
output_dir = '../../processed_data'
output_csv = '../../processed_data/age_demographics_data.csv'

# Create the folder when it is missing; an existing folder is left as is.
os.makedirs(output_dir, exist_ok=True)

# Write the table without the DataFrame index column.
age_data.to_csv(output_csv, index=False)
print("Processed data saved to processed_data/age_demographics_data.csv")