# 05. Predictive Analytics - Preprocessing

This notebook prepares the data for predictive modeling. Since we are avoiding temporal trends, we focus on saturation and growth metrics across regions.

In [None]:
# pandas does the aggregation/merging below; sys and os are used for the
# import-path tweak and for filesystem calls.
import pandas as pd
import sys
import os

# Put the directory two levels above the current working directory on the
# import path. This has to run before the `utils` import on the next line;
# presumably that directory is where the `utils` package lives (verify
# against the repository layout).
sys.path.append(os.path.abspath('../../'))
from utils.data_loader import load_enrollment_data, load_demographic_data, load_biometric_data

print("Libraries imported successfully.")

## 1. Load and Merge Data

In [None]:
# Read the three source tables; every loader is handed the same relative
# root path.
enr_df = load_enrollment_data('../../')
demo_df = load_demographic_data('../../')
bio_df = load_biometric_data('../../')

# Each table is collapsed to one row per (state, district, pincode).
key_cols = ['state', 'district', 'pincode']
print("Aggregating data... (Optimized)")


def _sum_by_region(frame, component_cols, total_col):
    """Sum the numeric columns of *frame* per region and append a total.

    Rows are grouped on ``key_cols`` and only numeric columns are summed.
    The columns named in ``component_cols`` are then added together, left
    to right, and stored in a new column called ``total_col``.
    """
    summed = frame.groupby(key_cols).sum(numeric_only=True).reset_index()
    running_total = summed[component_cols[0]]
    for column in component_cols[1:]:
        running_total = running_total + summed[column]
    summed[total_col] = running_total
    return summed


enr_agg = _sum_by_region(
    enr_df,
    ['age_0_5', 'age_5_17', 'age_18_greater'],
    'total_enrollments',
)
demo_agg = _sum_by_region(
    demo_df,
    ['demo_age_5_17', 'demo_age_17_'],
    'total_demo_updates',
)
bio_agg = _sum_by_region(
    bio_df,
    ['bio_age_5_17', 'bio_age_17_'],
    'total_bio_updates',
)

# Outer joins keep every region that occurs in at least one of the three
# aggregated tables; afterwards every missing value in the merged frame is
# replaced with 0.
predictive_data = (
    enr_agg
    .merge(demo_agg, on=key_cols, how='outer')
    .merge(bio_agg, on=key_cols, how='outer')
    .fillna(0)
)

print("Data merged for predictive analysis.")

## 2. Feature Engineering for Prediction
We calculate features that can help predict future enrollment needs.

In [None]:
# Update intensity: combined demographic and biometric updates divided by
# (total enrollments + 1). The result is a plain ratio; nothing here scales
# it to a percentage.
combined_updates = predictive_data['total_demo_updates'] + predictive_data['total_bio_updates']

# Shared denominator for both features. The +1 keeps it non-zero for rows
# whose total_enrollments is 0.
enrollments_plus_one = predictive_data['total_enrollments'] + 1

predictive_data['update_intensity'] = combined_updates / enrollments_plus_one

# Child enrollment share: age_0_5 enrollments over the same denominator
# (intended as an indicator of future adult updates).
predictive_data['child_enr_share'] = predictive_data['age_0_5'] / enrollments_plus_one

print("Predictive features engineered.")

## 3. Save Processed Data

In [None]:
# Output locations, relative to the current working directory.
output_dir = '../../processed_data'
output_csv = '../../processed_data/predictive_data.csv'

# exist_ok=True means no error is raised if the folder is already there,
# so the cell can be re-run.
os.makedirs(output_dir, exist_ok=True)

# index=False leaves the DataFrame's row index out of the CSV.
predictive_data.to_csv(output_csv, index=False)
print("Processed data saved to processed_data/predictive_data.csv")