In [None]:
# ===============================
# Notebook 01 — Data Ingestion & Cleaning
# Project Antyodaya (UIDAI)
# ===============================

import pandas as pd
import numpy as np
import os
from glob import glob

pd.set_option("display.max_columns", None)

# -------------------------------
# Step 1 — Load All Raw UIDAI CSV Files
# -------------------------------
# Reads every *.csv under RAW_DATA_PATH and stacks them into one frame.

RAW_DATA_PATH = "../data/raw/"

# glob() returns paths in arbitrary (filesystem-dependent) order; sorting keeps
# the row order of the combined frame reproducible across runs and machines.
raw_files = sorted(glob(os.path.join(RAW_DATA_PATH, "*.csv")))

print(f"Found {len(raw_files)} raw files")

# Fail early with an actionable message instead of letting pd.concat raise
# "No objects to concatenate" on an empty list.
if not raw_files:
    raise FileNotFoundError(
        f"No CSV files found in {os.path.abspath(RAW_DATA_PATH)!r}; "
        "check RAW_DATA_PATH and the notebook's working directory."
    )

df_list = [pd.read_csv(file) for file in raw_files]

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
uidai_raw = pd.concat(df_list, ignore_index=True)

print("Combined raw shape:", uidai_raw.shape)
uidai_raw.head()


In [None]:
# -------------------------------
# Step 2 — Standardize Column Names
# -------------------------------
# Lower-case every header, then trim surrounding whitespace, so later cells
# can refer to columns by one canonical spelling.

uidai_raw.columns = uidai_raw.columns.str.lower().str.strip()

print("Columns after standardization:", uidai_raw.columns, sep="\n")


In [None]:
# -------------------------------
# Step 3 — Rename Columns for Clarity
# -------------------------------
# NOTE:
# bio_age_5_17 counts biometric updates for the whole 5–17 age band
# (children aged 5 through 17), not only for age 5.

uidai_raw.rename(
    columns=dict(
        bio_age_5_17="child_updates_5_17",
        bio_age_17_="age_17_updates",
    ),
    inplace=True,
)

print(uidai_raw.columns)

In [None]:
# -------------------------------
# Step 4 — Basic Data Cleaning
# -------------------------------

# Convert date → datetime; values that cannot be parsed become NaT.
# NOTE(review): if the raw files store dates day-first (DD-MM-YYYY), pass
# dayfirst=True or an explicit format= here — confirm against the raw CSVs.
uidai_raw["date"] = pd.to_datetime(uidai_raw["date"], errors="coerce")

# Handle missing numeric values: a missing count is treated as zero updates.
numeric_cols = [
    "child_updates_5_17",
    "age_17_updates"
]

for col in numeric_cols:
    uidai_raw[col] = uidai_raw[col].fillna(0)

# Rows without a usable date would get a NaN year, which turns the year column
# into floats and makes the district-year groupby drop those rows silently.
# Report how many there are and remove them explicitly instead.
bad_date_count = int(uidai_raw["date"].isna().sum())
print("Rows with missing/unparseable date (dropped):", bad_date_count)

# Remove rows with missing date or missing geography
uidai_raw.dropna(subset=["date", "state", "district"], inplace=True)

# Always derive year from the parsed date (this overwrites any existing
# "year" column). No NaT remains at this point, so the result is a clean int.
uidai_raw["year"] = uidai_raw["date"].dt.year.astype(int)

print("Cleaned raw shape:", uidai_raw.shape)


In [None]:
# -------------------------------
# Step 5 — District-Year Aggregation
# -------------------------------
# Core dataset for everything downstream: one row per (state, district, year)
# with summed update counts and the number of distinct pincodes seen.

district_year_df = uidai_raw.groupby(
    ["state", "district", "year"], as_index=False
).agg(
    child_updates_5_17=("child_updates_5_17", "sum"),
    age_17_updates=("age_17_updates", "sum"),
    pincode_count=("pincode", "nunique"),
)

print("District-Year shape:", district_year_df.shape)
district_year_df.head()


In [None]:
# -------------------------------
# Step 6 — Save Processed Dataset
# -------------------------------
# Writes the district-year table to the processed-data folder as CSV.

PROCESSED_PATH = "../data/processed/"
output_file = os.path.join(PROCESSED_PATH, "combined_uidai_data.csv")

# Create the target folder on first run; a no-op when it already exists.
os.makedirs(PROCESSED_PATH, exist_ok=True)

# index=False keeps the positional row index out of the file.
district_year_df.to_csv(output_file, index=False)

print("Saved processed file to:", output_file)
print("Files in processed folder:", os.listdir(PROCESSED_PATH))
