In [None]:
# Combining all the 4 biometric datasets csv into one Biometric_updates dataset for 2025
import pandas as pd

# List of biometric update CSV files
bio_files = [
    "/content/api_data_aadhar_biometric_0_500000.csv",
    "/content/api_data_aadhar_biometric_1000000_1500000.csv",
    "/content/api_data_aadhar_biometric_1500000_1861108.csv",
    "/content/api_data_aadhar_biometric_500000_1000000.csv"
]

# Read and combine all files
bio_df = pd.concat(
    [pd.read_csv(file) for file in bio_files],
    ignore_index=True
)

# Standardize column names (safety step)
bio_df.columns = bio_df.columns.str.strip().str.lower()

# Convert date to datetime
bio_df["date"] = pd.to_datetime(bio_df["date"], dayfirst=True)

# Optional: sort for consistency
bio_df = bio_df.sort_values(["state", "district", "pincode", "date"])



In [None]:
# Combining all the 5 demographic datasets csv into one Demographic_updates dataset for 2025
import pandas as pd

# List of demographic update CSV files
demo_files = [
    "/content/api_data_aadhar_demographic_0_500000.csv",
    "/content/api_data_aadhar_demographic_1000000_1500000.csv",
    "/content/api_data_aadhar_demographic_1500000_2000000.csv",
    "/content/api_data_aadhar_demographic_2000000_2071700.csv",
    "/content/api_data_aadhar_demographic_500000_1000000.csv"
]

# Read and combine all files
demo_df = pd.concat(
    [pd.read_csv(file) for file in demo_files],
    ignore_index=True
)

# Standardize column names
demo_df.columns = demo_df.columns.str.strip().str.lower()

# Convert date to datetime
demo_df["date"] = pd.to_datetime(demo_df["date"], dayfirst=True)

# Optional: sort for consistency
demo_df = demo_df.sort_values(["state", "district", "pincode", "date"])



In [None]:
# Save combined dataset
demo_df.to_csv("demographic_updates_2025_df.csv", index=False)

print("Demographic datasets combined successfully.")
print("Shape:", demo_df.shape)


Demographic datasets combined successfully.
Shape: (2071700, 6)


In [None]:
# Save combined file
bio_df.to_csv("biometric_updates_2025_df.csv", index=False)

print("Biometric datasets combined successfully.")
print("Shape:", bio_df.shape)


Biometric datasets combined successfully.
Shape: (1861108, 6)


In [None]:
from google.colab import files
files.download('/content/demographic_updates_2025_df.csv')
files.download('/content/biometric_updates_2025_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Statewise Monthly Update Aggregation (Biometric & Demographic)**


This step aggregates Aadhaar **update activity at the state level**, month by month, for the year 2025.

**Aggregation procedure:**
- Biometric and demographic update datasets are processed separately.
- For each state, total update counts are calculated for every month.
- An additional **cumulative column** is added to capture the total updates across all months.

**Analysis:**
- Helps identify states with consistently high or low update activity.
- Reveals seasonal or policy-driven spikes in update behaviour.
- Acts as a baseline to compare update intensity across states.

**Output tables created:**
- Statewise monthly biometric update table (12 months + cumulative total)
- Statewise monthly demographic update table (12 months + cumulative total)

These tables form the foundation for deeper cohort-level and risk analysis.


In [2]:
import pandas as pd

# Load datasets
bio_df = pd.read_csv("biometric_updates_2025_df.csv")
demo_df = pd.read_csv("demographic_updates_2025_df.csv")

# date parsing for mixed formats
bio_df["date"] = pd.to_datetime(bio_df["date"], format="mixed", dayfirst=True)
demo_df["date"] = pd.to_datetime(demo_df["date"], format="mixed", dayfirst=True)

# Extract month name
bio_df["month"] = bio_df["date"].dt.strftime("%b")
demo_df["month"] = demo_df["date"].dt.strftime("%b")



In [3]:
#Create TOTAL updates column (all cohorts combined)

# Total biometric updates (all ages)
bio_df["bio_total_updates"] = (
    bio_df["bio_age_5_17"] + bio_df["bio_age_17_"]
)

# Total demographic updates (all ages)
demo_df["demo_total_updates"] = (
    demo_df["demo_age_5_17"] + demo_df["demo_age_17_"]
)


In [4]:
# Statewise MONTHLY aggregation (Biometric)

bio_state_monthly = (
    bio_df
    .groupby(["state", "month"])["bio_total_updates"]
    .sum()
    .reset_index()
)

bio_state_pivot = (
    bio_state_monthly
    .pivot(index="state", columns="month", values="bio_total_updates")
    .fillna(0)
)

# Ensure correct month order
month_order = ["Jan","Feb","Mar","Apr","May","Jun",
               "Jul","Aug","Sep","Oct","Nov","Dec"]
bio_state_pivot = bio_state_pivot.reindex(columns=month_order, fill_value=0)

# Add cumulative column
bio_state_pivot["Total_2025"] = bio_state_pivot.sum(axis=1)

bio_state_pivot.head()


month,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total_2025
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Andaman & Nicobar Islands,0,0,209.0,184.0,180.0,140.0,259.0,0,429.0,252.0,303.0,428.0,2384.0
Andaman and Nicobar Islands,0,0,2494.0,2560.0,1715.0,1804.0,2569.0,0,2324.0,1242.0,1519.0,2087.0,18314.0
Andhra Pradesh,0,0,403296.0,608589.0,500660.0,508472.0,334226.0,0,225491.0,356227.0,394700.0,382931.0,3714592.0
Arunachal Pradesh,0,0,7400.0,8375.0,8305.0,7138.0,8442.0,0,7740.0,6292.0,10634.0,8068.0,72394.0
Assam,0,0,92931.0,139659.0,83841.0,71576.0,95172.0,0,98559.0,134076.0,114216.0,152692.0,982722.0


In [5]:
# Statewise MONTHLY aggregation (Demographic)

demo_state_monthly = (
    demo_df
    .groupby(["state", "month"])["demo_total_updates"]
    .sum()
    .reset_index()
)

demo_state_pivot = (
    demo_state_monthly
    .pivot(index="state", columns="month", values="demo_total_updates")
    .fillna(0)
)

# Ensure correct month order
demo_state_pivot = demo_state_pivot.reindex(columns=month_order, fill_value=0)

# Add cumulative column
demo_state_pivot["Total_2025"] = demo_state_pivot.sum(axis=1)

demo_state_pivot.head()


month,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total_2025
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100000,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2.0,2.0
Andaman & Nicobar Islands,0,0,0.0,0.0,0.0,0.0,0.0,0,234.0,114.0,320.0,391.0,1059.0
Andaman and Nicobar Islands,0,0,1338.0,0.0,0.0,0.0,438.0,0,804.0,567.0,1249.0,1791.0,6187.0
Andhra Pradesh,0,0,513040.0,48881.0,155804.0,132318.0,87491.0,0,269255.0,239167.0,402102.0,447447.0,2295505.0
Arunachal Pradesh,0,0,7809.0,0.0,0.0,0.0,0.0,0,11035.0,4835.0,6674.0,6090.0,36443.0


In [10]:
#Save both statewise monthly updates aggregation tables

bio_state_pivot.to_csv(
    "statewise_bio_updates2025_monthly_aggregated.csv"
)

demo_state_pivot.to_csv(
    "statewise_demo_updates2025_monthly_aggregated.csv"
)

print("Statewise 12-month + cumulative tables created successfully.")


Statewise 12-month + cumulative tables created successfully.


## **Cohort-wise State Aggregation of Updates**

This step breaks down statewise update activity by **age cohorts**, separately for demographic and biometric updates.

**Age cohorts analysed:**

- 5–17 years
- 18+ years

**Aggregation procedure:**
- Monthly update counts are aggregated at the state level for each cohort, biometric and demographic updates separately
- Separate aggregation tables are created for each cohort  
- Each table captures how updates are distributed across age groups over time.

**Analysis:**
- Highlights lifecycle-specific update behaviour.
- Helps detect delayed updates and potential compliance gaps.
- Enables targeted analysis of child,adolescent, and adult update patterns.

**Outputs created (4 tables total):**
- Demographic updates: 2 cohort-wise state tables
- Biometric updates: 2 cohort-wise state tables

These cohort-level tables are key inputs for update risk and trend analysis .




In [13]:
import pandas as pd

# Load datasets
bio_df = pd.read_csv("biometric_updates_2025_df.csv")
demo_df = pd.read_csv("demographic_updates_2025_df.csv")

# date parsing for mixed formats
bio_df["date"] = pd.to_datetime(bio_df["date"], format="mixed", dayfirst=True)
demo_df["date"] = pd.to_datetime(demo_df["date"], format="mixed", dayfirst=True)

# Extract month name
bio_df["month"] = bio_df["date"].dt.strftime("%b")
demo_df["month"] = demo_df["date"].dt.strftime("%b")

month_order = ["Jan","Feb","Mar","Apr","May","Jun",
               "Jul","Aug","Sep","Oct","Nov","Dec"]


DEMOGRAPHIC- COHORTWISE TABLES

In [16]:
# 1. Demographic – Age 5–17
demo_5_17 = (
    demo_df
    .groupby(["state", "month"])["demo_age_5_17"]
    .sum()
    .reset_index()
)

demo_5_17_pivot = (
    demo_5_17
    .pivot(index="state", columns="month", values="demo_age_5_17")
    .fillna(0)
    .reindex(columns=month_order, fill_value=0)
)

demo_5_17_pivot["Total_2025"] = demo_5_17_pivot.sum(axis=1)


In [18]:
# 3. Demographic – Age 18+

demo_18_plus = (
    demo_df
    .groupby(["state", "month"])["demo_age_17_"]
    .sum()
    .reset_index()
)

demo_18_plus_pivot = (
    demo_18_plus
    .pivot(index="state", columns="month", values="demo_age_17_")
    .fillna(0)
    .reindex(columns=month_order, fill_value=0)
)

demo_18_plus_pivot["Total_2025"] = demo_18_plus_pivot.sum(axis=1)


BIOMETRIC- COHORTWISE TABLES

In [21]:
# 1.Biometric – Age 5-17

bio_5_17 = (
    bio_df
    .groupby(["state", "month"])["bio_age_5_17"]
    .sum()
    .reset_index()
)

bio_5_17_pivot = (
    bio_5_17
    .pivot(index="state", columns="month", values="bio_age_5_17")
    .fillna(0)
    .reindex(columns=month_order, fill_value=0)
)

bio_5_17_pivot["Total_2025"] = bio_5_17_pivot.sum(axis=1)



In [22]:
# 2. Biometric – Age 18+

bio_18_plus = (
    bio_df
    .groupby(["state", "month"])["bio_age_17_"]
    .sum()
    .reset_index()
)

bio_18_plus_pivot = (
    bio_18_plus
    .pivot(index="state", columns="month", values="bio_age_17_")
    .fillna(0)
    .reindex(columns=month_order, fill_value=0)
)

bio_18_plus_pivot["Total_2025"] = bio_18_plus_pivot.sum(axis=1)


In [25]:
# Save all 4 cohortwise aggregation tables


demo_5_17_pivot.to_csv("cohort_5_17_demo_updates_aggregated.csv")
demo_18_plus_pivot.to_csv("cohort_18_plus_demo_updates_aggregated.csv")


bio_5_17_pivot.to_csv("cohort_5_17_bio_updates_aggregated.csv")
bio_18_plus_pivot.to_csv("cohort_18_plus_bio_updates_aggregated.csv")


The 6 aggregated tables are preprocessed and cleaned in excel :


*   Inconsistent naming of sates is corrected
*   Aggregation recalculation is done

As a result,aggregated tables obtained are ,

- Demographic updates — Age 5–17
- Demographic updates — Age 18+
- Biometric updates — Age 5–17
- Biometric updates — Age 18+
- Statewise monthly total demographic updates( all cohorts together)
- Statewise monthly total biometric updates ( all cohorts together)

These 6 aggregated tables form the foundation for updates-risk and trend analysis



