In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: whitegrid theme and a 10 x 6 figure size.
plt.style.use("seaborn-v0_8-whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})


In [1]:
import pandas as pd
import glob
import os

# Folder holding the raw enrolment CSV exports. The original local path stays
# the default; set the ENROLMENT_PATH environment variable to run the notebook
# on another machine without editing this cell.
ENROLMENT_PATH = os.environ.get(
    "ENROLMENT_PATH",
    r"D:\PROJECT\UIDAI hackathon\data\api_data_aadhar_enrolment",
)

# glob makes no promise about result order, so sort the paths to keep the
# concatenated row order (and the regenerated index) the same on every run.
files = sorted(glob.glob(os.path.join(ENROLMENT_PATH, "*.csv")))

print(f"Found {len(files)} enrolment files")

# Fail with an actionable message instead of pd.concat's
# "No objects to concatenate" when the folder is wrong or empty.
if not files:
    raise FileNotFoundError(
        f"No enrolment CSV files found in {ENROLMENT_PATH!r}"
    )

# Read each chunk file, then stack them into one frame with a fresh 0..n-1 index.
dfs = []
for f in files:
    print("Loading:", os.path.basename(f))
    df = pd.read_csv(f)
    dfs.append(df)

enrolment_df = pd.concat(dfs, ignore_index=True)

print("Enrolment DF shape:", enrolment_df.shape)

enrolment_df.head()


Found 3 enrolment files
Loading: api_data_aadhar_enrolment_0_500000.csv
Loading: api_data_aadhar_enrolment_1000000_1006029.csv
Loading: api_data_aadhar_enrolment_500000_1000000.csv
Enrolment DF shape: (1006029, 7)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [2]:
# Structural overview of the loaded frame: per-column non-null counts, dtypes
# and memory usage are printed by info().
enrolment_df.info()
# Print the column labels explicitly for quick reference.
print(enrolment_df.columns)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006029 entries, 0 to 1006028
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   date            1006029 non-null  object
 1   state           1006029 non-null  object
 2   district        1006029 non-null  object
 3   pincode         1006029 non-null  int64 
 4   age_0_5         1006029 non-null  int64 
 5   age_5_17        1006029 non-null  int64 
 6   age_18_greater  1006029 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 53.7+ MB
Index(['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17',
       'age_18_greater'],
      dtype='object')


In [3]:
# Parse the day-month-year strings into datetimes. Values that do not match
# the format become NaT (errors='coerce') instead of raising.
parsed_dates = pd.to_datetime(
    enrolment_df['date'],
    format="%d-%m-%Y",
    errors='coerce'
)

# Report how many rows are about to be discarded so the loss is not silent.
invalid_date_count = int(parsed_dates.isna().sum())
if invalid_date_count:
    print(f"Dropping {invalid_date_count} rows with missing or unparseable dates")

# Build the cleaned frame in one chain. Each step returns a new DataFrame, so
# no column is assigned onto the result of dropna (which can trigger
# SettingWithCopyWarning).
# NOTE(review): a later output table lists a state labelled "100000"; the
# strip/title normalisation below does not remove such junk labels - confirm
# whether they should be filtered out.
enrolment_df = (
    enrolment_df
    .assign(date=parsed_dates)
    # Drop invalid dates (if any)
    .dropna(subset=['date'])
    .assign(
        # Normalize state names: trim whitespace, title-case
        state=lambda frame: frame['state'].str.strip().str.title(),
        # Extract year and month as plain integers
        year=lambda frame: frame['date'].dt.year.astype(int),
        month=lambda frame: frame['date'].dt.month.astype(int),
    )
)

enrolment_df.head()


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,year,month
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37,2025,3
1,2025-03-09,Karnataka,Bengaluru Urban,560043,14,33,39,2025,3
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001,29,82,12,2025,3
3,2025-03-09,Uttar Pradesh,Aligarh,202133,62,29,15,2025,3
4,2025-03-09,Karnataka,Bengaluru Urban,560016,14,16,21,2025,3


In [4]:
# Total enrolments per age bucket, summed over every row of the dataset.
age_columns = ['age_0_5', 'age_5_17', 'age_18_greater']
enrolment_age_summary = enrolment_df.loc[:, age_columns].sum()

enrolment_age_summary


age_0_5           3546965
age_5_17          1720384
age_18_greater     168353
dtype: int64

In [5]:
# Grand total across the three age buckets.
total_enrolments = enrolment_age_summary.sum()

# Share of each age bucket in the grand total, as a percentage with 2 decimals.
age_share = enrolment_age_summary.div(total_enrolments)
enrolment_age_percentage = age_share.mul(100).round(2)

enrolment_age_percentage


age_0_5           65.25
age_5_17          31.65
age_18_greater     3.10
dtype: float64

In [None]:
# Turn the percentage Series into a two-column frame for seaborn.
# The Series is named `enrolment_age_percentage`; the previous reference to
# `enrolment_age_summary_percentage` was an undefined name and raised NameError.
age_dist = enrolment_age_percentage.reset_index()
age_dist.columns = ["Age Group", "Percentage"]

# Explicit figure/axes so the bar plot and its labels target the same axes.
fig, ax = plt.subplots()
sns.barplot(data=age_dist, x="Age Group", y="Percentage", ax=ax)
ax.set_title("Age-wise Aadhaar Enrolment Distribution (%)")
ax.set_ylabel("Percentage of Enrolments")
ax.set_xlabel("Age Group")
plt.show()


In [6]:
# Enrolments per calendar month, one column per age bucket.
age_columns = ['age_0_5', 'age_5_17', 'age_18_greater']
by_month = enrolment_df.groupby('month')
monthly_enrolment = by_month[age_columns].sum()

monthly_enrolment



Unnamed: 0_level_0,age_0_5,age_5_17,age_18_greater
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,5367,7407,3808
4,141154,91371,24913
5,95342,71690,16584
6,98943,99911,16880
7,318352,263333,35183
9,995612,465401,14866
10,562856,238958,16106
11,769156,297658,25193
12,560183,184655,14820


In [None]:
# Per-month totals: sum each age bucket within a month, then add the buckets.
age_columns = ["age_0_5", "age_5_17", "age_18_greater"]
monthly_by_age = enrolment_df.groupby("month")[age_columns].sum()
monthly_enrol = monthly_by_age.sum(axis=1)

# Line chart of the monthly totals with a marker on every month that has data.
fig, ax = plt.subplots()
monthly_enrol.plot(marker="o", ax=ax)
ax.set_title("Monthly Aadhaar Enrolment Trend")
ax.set_ylabel("Total Enrolments")
ax.set_xlabel("Month")
plt.show()


In [7]:
# Enrolments per state, one column per age bucket.
age_columns = ['age_0_5', 'age_5_17', 'age_18_greater']
state_enrolment = enrolment_df.groupby('state')[age_columns].sum()

# Row-wise total across the three age buckets.
state_enrolment['total_enrolments'] = state_enrolment[age_columns].sum(axis=1)

# Ten states with the highest total enrolment, largest first.
states_by_volume = state_enrolment.sort_values(
    by='total_enrolments', ascending=False
)
states_by_volume.head(10)


Unnamed: 0_level_0,age_0_5,age_5_17,age_18_greater,total_enrolments
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Uttar Pradesh,521045,479682,17902,1018629
Bihar,262875,334802,11908,609585
Madhya Pradesh,367990,116381,9599,493970
West Bengal,275400,91398,8510,375308
Maharashtra,278814,82116,8209,369139
Rajasthan,229780,113123,5555,348458
Gujarat,193031,71182,16336,280549
Assam,141235,66085,22877,230197
Karnataka,179262,33863,10110,223235
Tamil Nadu,182313,37227,1249,220789


Unnamed: 0_level_0,age_0_5,age_5_17,age_18_greater,total_enrolments,adult_ratio
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100000,0,1,217,218,0.995413
Meghalaya,21179,53305,35287,109771,0.32146
Dadra And Nagar Haveli And Daman And Diu,131,21,21,173,0.121387
Assam,141235,66085,22877,230197,0.09938
Mizoram,4136,1295,495,5926,0.08353
Nagaland,4512,9953,1122,15587,0.071983
Goa,1916,254,163,2333,0.069867
Gujarat,193031,71182,16336,280549,0.058229
Sikkim,1054,1047,106,2207,0.048029
Karnataka,179262,33863,10110,223235,0.045289


In [None]:
# Per-state totals: sum each age bucket within a state, then add the buckets.
age_columns = ["age_0_5", "age_5_17", "age_18_greater"]
state_enrol = enrolment_df.groupby("state")[age_columns].sum()
state_enrol["total"] = state_enrol[age_columns].sum(axis=1)

# Keep the ten largest states by total enrolment, largest first.
top10 = state_enrol.sort_values("total", ascending=False).iloc[:10]

# Horizontal bar chart: states on the y-axis, totals on the x-axis.
fig, ax = plt.subplots()
sns.barplot(y=top10.index, x=top10["total"], ax=ax)
ax.set_title("Top 10 States by Aadhaar Enrolment Volume")
ax.set_xlabel("Total Enrolments")
ax.set_ylabel("State")
plt.show()


In [10]:
# Persist the age-bucket percentage table for downstream use.
# to_csv does not create missing directories, so make sure the target folder
# exists; exist_ok=True keeps re-runs from failing when it already does.
os.makedirs("outputs", exist_ok=True)

enrolment_age_percentage.to_csv(
    "outputs/enrolment_age_distribution_percentage.csv"
)
