In [None]:
# ===============================
# Notebook 02 — Ghost Cohort & RaR Modeling
# Project Antyodaya (UIDAI)
# ===============================

import pandas as pd
import numpy as np
import os

# Never truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None


In [None]:
# -------------------------------
# Load Cleaned Data
# -------------------------------

df = pd.read_csv("../data/processed/combined_uidai_data.csv")

# Fail fast with a readable message if a column used by the later cells is
# absent, instead of hitting a bare KeyError part-way through the notebook.
required_columns = [
    "state",
    "district",
    "pincode_count",
    "child_updates_5_17",
    "age_17_updates",
]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        "Input data is missing required columns: " + ", ".join(missing_columns)
    )

print("Loaded data shape:", df.shape)

# Bare last expression gives the rich table display, as in the other cells.
df.head()

In [None]:
# Guard against division by zero: a pincode count of 0 is replaced by 1.
# The replacement is written back to the column itself, so every later use
# of `pincode_count` sees the adjusted values.
pincode_count_safe = df["pincode_count"].replace(0, 1)
df["pincode_count"] = pincode_count_safe

# Update density (per center proxy): `child_updates_5_17` per pincode.
df["update_density"] = df["child_updates_5_17"] / df["pincode_count"]

df.head()


In [None]:
# Rows whose update density is strictly below the dataset-wide median are
# treated as risk-prone and flagged 1; all other rows (including rows with a
# missing density, which compare as False) get 0.
national_median_density = df["update_density"].median()

below_median = df["update_density"] < national_median_density
df["ghost_cohort_flag"] = below_median.astype(int)

# How many rows landed on each side of the median.
df["ghost_cohort_flag"].value_counts()


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Source columns and the names of their min-max scaled counterparts,
# listed in matching order (first source -> first output, and so on).
raw_feature_columns = ["update_density", "pincode_count", "age_17_updates"]
normalized_columns = [
    "norm_update_density",
    "norm_pincode_density",
    "norm_age17_pressure",
]

# Fit the scaler on the three source columns and store the scaled values
# as new columns alongside the originals.
scaler = MinMaxScaler()
df[normalized_columns] = scaler.fit_transform(df[raw_feature_columns])

df.head()


In [None]:
# RaR score = weighted sum of three normalized signals (weights 0.4/0.4/0.2).
# The two density terms are inverted (1 - x) so that a LOWER density
# contributes a HIGHER score; the age-17 term is used as-is.
update_density_term = 0.4 * (1 - df["norm_update_density"])
pincode_density_term = 0.4 * (1 - df["norm_pincode_density"])
age17_pressure_term = 0.2 * df["norm_age17_pressure"]

df["RaR_score"] = update_density_term + pincode_density_term + age17_pressure_term


In [None]:
# Bucket the RaR score into three risk bands.
# pd.cut builds right-closed intervals and, by default, leaves the first
# interval open on the left, so a score of exactly 0 would fall outside
# (0, 0.33] and come back as NaN. include_lowest=True closes that edge,
# giving the bands [0, 0.33], (0.33, 0.66], (0.66, 1.0].
df["risk_category"] = pd.cut(
    df["RaR_score"],
    bins=[0, 0.33, 0.66, 1.0],
    labels=["Low Risk", "Medium Risk", "High Risk"],
    include_lowest=True,
)

# Number of rows per risk band.
df["risk_category"].value_counts()


In [None]:
# Collapse the row-level table to one row per (state, district).
district_keys = ["state", "district"]

# Per-column aggregation: scores and pincode counts are averaged,
# flags and update counts are totalled.
district_aggregations = {
    "RaR_score": "mean",
    "ghost_cohort_flag": "sum",
    "child_updates_5_17": "sum",
    "age_17_updates": "sum",
    "pincode_count": "mean",
}

# as_index=False keeps the grouping keys as ordinary columns in the result.
district_risk = df.groupby(district_keys, as_index=False).agg(district_aggregations)

district_risk.head()



In [None]:
# Persist the district-level table as CSV; index=False omits the row index
# so the file contains only the data columns.
output_file = "../data/processed/rar_district_scores.csv"
district_risk.to_csv(path_or_buf=output_file, index=False)

print("Saved RaR scores to:", output_file)

