In [1]:
import os
import pandas as pd
import glob

# Report the directory the kernel was started in; the relative paths used
# later in the notebook are resolved against it.
print("Current working directory:", os.getcwd(), sep="\n")


Current working directory:
D:\PROJECT\UIDAI hackathon\notebook


In [2]:
import sys
# Show the path of the Python interpreter that is running this kernel.
print(sys.executable)



D:\PROJECT\UIDAI hackathon\.venv\Scripts\python.exe


In [3]:
# Folder holding the demographic CSV exports (one file per row-range chunk).
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable project root so the notebook runs elsewhere.
DEMOGRAPHIC_PATH = r"D:\PROJECT\UIDAI hackathon\data\api_data_aadhar_demographic"

# glob returns matches in arbitrary, OS-dependent order; sorting makes the
# row order of the concatenated frame reproducible from run to run.
files = sorted(glob.glob(os.path.join(DEMOGRAPHIC_PATH, "*.csv")))

print(f"Found {len(files)} files")

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when the folder is missing or empty.
if not files:
    raise FileNotFoundError(f"No CSV files found in {DEMOGRAPHIC_PATH!r}")

# Read every chunk, then stack them once into a single frame.
dfs = []
for f in files:
    print("Loading:", os.path.basename(f))
    dfs.append(pd.read_csv(f))

# ignore_index=True discards the per-file row labels and renumbers 0..n-1.
demographic_df = pd.concat(dfs, ignore_index=True)

demographic_df.head()


Found 5 files
Loading: api_data_aadhar_demographic_0_500000.csv
Loading: api_data_aadhar_demographic_1000000_1500000.csv
Loading: api_data_aadhar_demographic_1500000_2000000.csv
Loading: api_data_aadhar_demographic_2000000_2071700.csv
Loading: api_data_aadhar_demographic_500000_1000000.csv


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


In [4]:
# Parse the day-first date strings; anything that does not match the
# format becomes NaT instead of raising.
parsed_dates = pd.to_datetime(
    demographic_df['date'],
    format="%d-%m-%Y",
    errors='coerce'
)

# Make the data loss visible: coerced values are dropped just below.
invalid_date_rows = int(parsed_dates.isna().sum())
print(f"Dropping {invalid_date_rows} rows with missing/invalid dates")

# Build the cleaned frame in one chain. `assign` works on a copy, so no
# column is written onto the result of `dropna` (which can raise
# SettingWithCopyWarning).
demographic_df = (
    demographic_df
    .assign(date=parsed_dates)
    # Drop invalid dates
    .dropna(subset=['date'])
    .assign(
        # Normalize state names: trim, collapse repeated inner whitespace
        # (e.g. "West  Bengal"), then title-case.
        state=lambda d: (
            d['state']
            .str.strip()
            .str.replace(r"\s+", " ", regex=True)
            .str.title()
        ),
        # Create year & month (safe to cast: no NaT remains after dropna).
        year=lambda d: d['date'].dt.year.astype(int),
        month=lambda d: d['date'].dt.month.astype(int),
    )
)

demographic_df.head()


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,year,month
0,2025-03-01,Uttar Pradesh,Gorakhpur,273213,49,529,2025,3
1,2025-03-01,Andhra Pradesh,Chittoor,517132,22,375,2025,3
2,2025-03-01,Gujarat,Rajkot,360006,65,765,2025,3
3,2025-03-01,Andhra Pradesh,Srikakulam,532484,24,314,2025,3
4,2025-03-01,Rajasthan,Udaipur,313801,45,785,2025,3


In [5]:
# Sum both age-group update counts for each month number, ordered by month.
age_group_columns = ['demo_age_5_17', 'demo_age_17_']
updates_by_month = demographic_df.groupby('month')[age_group_columns]
monthly_demo = updates_by_month.sum().sort_index()

monthly_demo


Unnamed: 0_level_0,demo_age_5_17,demo_age_17_
month,Unnamed: 1_level_1,Unnamed: 2_level_1
3,976095,10171463
4,198744,1316928
5,204893,1361394
6,197767,1487795
7,288449,1932266
9,770788,6553272
10,479232,4531734
11,821273,8566328
12,926183,8510583


In [9]:
# Row count per state label, most frequent first (up to 100 labels shown).
state_label_counts = demographic_df['state'].value_counts()
state_label_counts.head(100)


state
Andhra Pradesh                              207740
Tamil Nadu                                  196857
West Bengal                                 168727
Uttar Pradesh                               167889
Maharashtra                                 162242
Karnataka                                   153957
Kerala                                      105515
Bihar                                        97621
Gujarat                                      96399
Odisha                                       92198
Rajasthan                                    89508
Telangana                                    89086
Madhya Pradesh                               76364
Assam                                        62834
Punjab                                       49611
Jharkhand                                    39653
Chhattisgarh                                 35726
Haryana                                      28554
Himachal Pradesh                             28037
Uttarakhand              

In [10]:
# Maps variant state/UT labels (keys) to the canonical label (values).
# Keys are written in title case with single spaces unless noted.
STATE_FIX = {
    # Former names
    "Orissa": "Odisha",
    "Pondicherry": "Puducherry",
    # Misspellings / spacing variants of West Bengal
    "Westbengal": "West Bengal",
    "West Bangal": "West Bengal",
    "West  Bengal": "West Bengal",
    "West Bengli": "West Bengal",
    # "&" spelled out as "And"
    "Jammu & Kashmir": "Jammu And Kashmir",
    "Andaman & Nicobar Islands": "Andaman And Nicobar Islands",
    # The two former union territories are folded into the merged one.
    # Both "&" and "And" spellings are covered for each of them.
    "Daman & Diu": "Dadra And Nagar Haveli And Daman And Diu",
    "Daman And Diu": "Dadra And Nagar Haveli And Daman And Diu",
    "Dadra & Nagar Haveli": "Dadra And Nagar Haveli And Daman And Diu",
    "Dadra And Nagar Haveli": "Dadra And Nagar Haveli And Daman And Diu",
}


In [11]:
# Swap every state label that appears as a key in STATE_FIX for its
# canonical spelling; labels not listed there are left unchanged.
canonical_states = demographic_df['state'].replace(STATE_FIX)
demographic_df['state'] = canonical_states


In [12]:
# A state label is kept only if it occurs in more than this many rows.
# NOTE(review): presumably meant to filter out rare misspelled labels;
# confirm it does not also exclude genuinely small territories.
MIN_ROWS_PER_STATE = 1000

state_counts = demographic_df['state'].value_counts()

has_enough_rows = state_counts > MIN_ROWS_PER_STATE
valid_states = state_counts[has_enough_rows].index


In [13]:
# Keep only the rows whose state label passed the row-count filter.
in_valid_state = demographic_df['state'].isin(valid_states)
demographic_clean = demographic_df.loc[in_valid_state]


In [14]:
# Adult share of demographic updates per state, computed on the cleaned rows.
# Despite the variable name, the totals are summed over all dates per state.

state_age_totals = (
    demographic_clean
    .groupby('state')[['demo_age_5_17', 'demo_age_17_']]
    .sum()
)

# total_updates = both age groups combined; adult_ratio = 17+ share of it.
state_demo_yearly = state_age_totals.assign(
    total_updates=lambda t: t['demo_age_5_17'] + t['demo_age_17_'],
    adult_ratio=lambda t: t['demo_age_17_'] / t['total_updates'],
)

# Ten states with the highest adult share.
state_demo_yearly.sort_values('adult_ratio', ascending=False).head(10)


Unnamed: 0_level_0,demo_age_5_17,demo_age_17_,total_updates,adult_ratio
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Maharashtra,273322,4781280,5054602,0.945926
West Bengal,242561,3630176,3872737,0.937367
Punjab,56866,825029,881895,0.935518
Jharkhand,99376,1301813,1401189,0.929077
Sikkim,1555,18785,20340,0.92355
Bihar,380023,4434327,4814350,0.921065
Kerala,61064,683888,744952,0.91803
Chhattisgarh,165207,1840227,2005434,0.91762
Assam,84480,928098,1012578,0.916569
Andaman And Nicobar Islands,617,6629,7246,0.91485


In [15]:
# ---------------------------------------------
# STATE-WISE DEMOGRAPHIC LOAD ANALYSIS
# ---------------------------------------------
# Rank states by their total number of demographic updates
# (both age groups combined). The analysis treats this total
# as a measure of voluntary update demand.

state_demo_load = (
    demographic_clean
    .groupby('state')[['demo_age_5_17', 'demo_age_17_']]
    .sum()
    # Total updates per state = child updates + adult updates
    .assign(total_updates=lambda t: t['demo_age_5_17'] + t['demo_age_17_'])
    # Heaviest load first
    .sort_values('total_updates', ascending=False)
)

# Top 10 states by demographic update load
state_demo_load.head(10)


Unnamed: 0_level_0,demo_age_5_17,demo_age_17_,total_updates
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Uttar Pradesh,790308,7752020,8542328
Maharashtra,273322,4781280,5054602
Bihar,380023,4434327,4814350
West Bengal,242561,3630176,3872737
Madhya Pradesh,407098,2505840,2912938
Rajasthan,257224,2560391,2817615
Andhra Pradesh,321148,1974434,2295582
Tamil Nadu,315638,1896590,2212228
Chhattisgarh,165207,1840227,2005434
Gujarat,208474,1615853,1824327


In [16]:
# ---------------------------------------------
# CHILD SHARE IN DEMOGRAPHIC UPDATES
# ---------------------------------------------
# Find the states where children (5-17) make up the largest
# share of demographic updates.

# Adds a child_ratio column to state_demo_yearly in place:
# child updates divided by total updates.
child_updates = state_demo_yearly['demo_age_5_17']
state_demo_yearly['child_ratio'] = child_updates.div(
    state_demo_yearly['total_updates']
)

# Ten states with the highest child share
state_demo_yearly.sort_values('child_ratio', ascending=False).head(10)


Unnamed: 0_level_0,demo_age_5_17,demo_age_17_,total_updates,adult_ratio,child_ratio
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Arunachal Pradesh,5783,30660,36443,0.841314,0.158686
Chandigarh,13133,70228,83361,0.842456,0.157544
Karnataka,264981,1430304,1695285,0.843695,0.156305
Telangana,242259,1387649,1629908,0.851366,0.148634
Puducherry,4696,28067,32763,0.856668,0.143332
Tamil Nadu,315638,1896590,2212228,0.857321,0.142679
Jammu And Kashmir,57873,349329,407202,0.857876,0.142124
Andhra Pradesh,321148,1974434,2295582,0.860102,0.139898
Madhya Pradesh,407098,2505840,2912938,0.860245,0.139755
Manipur,41464,260085,301549,0.862497,0.137503


In [17]:
# ---------------------------------------------
# MONTH-WISE ADULT DOMINANCE CHECK
# ---------------------------------------------
# Check whether adults account for most demographic updates
# in every month, not just overall.
# NOTE(review): rows are grouped by month number only, so the same
# month from different years would be merged - confirm the data
# covers a single year.

monthly_age_totals = (
    demographic_clean
    .groupby('month')[['demo_age_5_17', 'demo_age_17_']]
    .sum()
)

# Adult share per month = adult updates / (child updates + adult updates)
adult_updates = monthly_age_totals['demo_age_17_']
all_updates = monthly_age_totals['demo_age_5_17'] + adult_updates

monthly_demo_ratio = monthly_age_totals.assign(
    adult_ratio=adult_updates / all_updates
)

monthly_demo_ratio


Unnamed: 0_level_0,demo_age_5_17,demo_age_17_,adult_ratio
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,975297,10169053,0.912485
4,198588,1315962,0.86888
5,204893,1361394,0.869186
6,197767,1487795,0.88267
7,288449,1932266,0.87011
9,770340,6551801,0.894793
10,478940,4530829,0.904399
11,820786,8564652,0.912547
12,925509,8507900,0.90189


In [19]:
# ---------------------------------------------
# SAVE FINAL DEMOGRAPHIC INSIGHTS
# ---------------------------------------------
# Goal:
# Save clean, final tables for reporting
# and later visualization/dashboarding.

# Relative to the current working directory. to_csv does not create
# missing folders, so make sure the target directory exists first.
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Per-state totals with the ratio columns computed in the earlier cells.
state_demo_yearly.to_csv(
    os.path.join(OUTPUT_DIR, "demographic_adult_ratio_by_state.csv"),
    index_label="state"
)

# Per-state totals sorted by total update load.
state_demo_load.to_csv(
    os.path.join(OUTPUT_DIR, "demographic_state_load.csv"),
    index_label="state"
)

# Per-month totals with the adult share.
monthly_demo_ratio.to_csv(
    os.path.join(OUTPUT_DIR, "demographic_monthly_adult_ratio.csv"),
    index_label="month"
)
