In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

In [2]:
import os

biometric_path = "../final_pipeline_output/biometric"
biometric_files = os.listdir(biometric_path)

biometric_dfs = []

for file in biometric_files:
    df = pd.read_csv(biometric_path + f"/{file}")
    biometric_dfs.append(df)

biometric_df = pd.concat(biometric_dfs, ignore_index=True)

In [3]:
biometric_df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu And Kashmir,Poonch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [4]:
biometric_df['state'].unique()

array(['Haryana', 'Bihar', 'Jammu And Kashmir', 'Tamil Nadu',
       'Maharashtra', 'Gujarat', 'Odisha', 'West Bengal', 'Kerala',
       'Rajasthan', 'Punjab', 'Himachal Pradesh', 'Uttar Pradesh',
       'Assam', 'Uttarakhand', 'Madhya Pradesh', 'Karnataka',
       'Andhra Pradesh', 'Telangana', 'Goa', 'Nagaland', 'Jharkhand',
       'Chhattisgarh', 'Meghalaya', 'Chandigarh', 'Puducherry', 'Ladakh',
       'Manipur', 'Delhi', 'Tripura', 'Mizoram', 'Arunachal Pradesh',
       'The Dadra And Nagar Haveli And Daman And Diu',
       'Andaman And Nicobar Islands', 'Sikkim'], dtype=object)

In [5]:
biometric_df['date'] = pd.to_datetime(
    biometric_df['date'],
    dayfirst=True,
    errors='coerce'
)

In [6]:
bio_cols = ['bio_age_5_17', 'bio_age_17_']
biometric_df[bio_cols] = biometric_df[bio_cols].fillna(0)

biometric_df = biometric_df.dropna(subset=['date', 'state', 'district'])

biometric_df['year'] = biometric_df['date'].dt.year
biometric_df['month'] = biometric_df['date'].dt.month

biometric_df['total_biometric_updates'] = (
    biometric_df['bio_age_5_17'] + biometric_df['bio_age_17_']
)

In [7]:
biometric_df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,year,month,total_biometric_updates
0,2025-03-01,Haryana,Mahendragarh,123029,280,577,2025,3,857
1,2025-03-01,Bihar,Madhepura,852121,144,369,2025,3,513
2,2025-03-01,Jammu And Kashmir,Poonch,185101,643,1091,2025,3,1734
3,2025-03-01,Bihar,Bhojpur,802158,256,980,2025,3,1236
4,2025-03-01,Tamil Nadu,Madurai,625514,271,815,2025,3,1086


In [8]:
demo_path = "../final_pipeline_output/demographic"
demo_files = os.listdir(demo_path)

demo_dfs = []

for file in demo_files:
    df = pd.read_csv(demo_path + f"/{file}")
    demo_dfs.append(df)

demographic_df = pd.concat(demo_dfs, ignore_index=True)

In [9]:
demographic_df['state'].unique()

array(['Uttar Pradesh', 'Andhra Pradesh', 'Gujarat', 'Rajasthan',
       'Karnataka', 'West Bengal', 'Telangana', 'Odisha', 'Maharashtra',
       'Kerala', 'Bihar', 'Tamil Nadu', 'Madhya Pradesh', 'Assam',
       'Tripura', 'Arunachal Pradesh', 'Punjab', 'Jharkhand',
       'Chandigarh', 'Jammu And Kashmir', 'Mizoram', 'Nagaland',
       'Himachal Pradesh', 'Goa', 'Haryana', 'Meghalaya', 'Chhattisgarh',
       'Uttarakhand', 'Manipur',
       'The Dadra And Nagar Haveli And Daman And Diu', 'Delhi',
       'Puducherry', 'Ladakh', 'Andaman And Nicobar Islands', 'Sikkim'],
      dtype=object)

In [10]:
demographic_df['date'] = pd.to_datetime(
    demographic_df['date'],
    dayfirst=True,
    errors='coerce'
)

In [11]:
demographic_df.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,2025-03-01,Uttar Pradesh,Gorakhpur,273213,49,529
1,2025-03-01,Andhra Pradesh,Chittoor,517132,22,375
2,2025-03-01,Gujarat,Rajkot,360006,65,765
3,2025-03-01,Andhra Pradesh,Srikakulam,532484,24,314
4,2025-03-01,Rajasthan,Udaipur,313801,45,785


In [12]:
demo_cols = ['demo_age_5_17', 'demo_age_17_']
demographic_df[demo_cols] = demographic_df[demo_cols].fillna(0)

demographic_df = demographic_df.dropna(subset=['date', 'state', 'district'])

demographic_df['year'] = demographic_df['date'].dt.year
demographic_df['month'] = demographic_df['date'].dt.month

demographic_df['total_demo_updates'] = (
    demographic_df['demo_age_5_17'] +
    demographic_df['demo_age_17_']
)

In [13]:
demographic_df.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,year,month,total_demo_updates
0,2025-03-01,Uttar Pradesh,Gorakhpur,273213,49,529,2025,3,578
1,2025-03-01,Andhra Pradesh,Chittoor,517132,22,375,2025,3,397
2,2025-03-01,Gujarat,Rajkot,360006,65,765,2025,3,830
3,2025-03-01,Andhra Pradesh,Srikakulam,532484,24,314,2025,3,338
4,2025-03-01,Rajasthan,Udaipur,313801,45,785,2025,3,830


In [14]:
enrolment_path = "../data/enrolment/"
enrolment_files = os.listdir(enrolment_path)

enrolment_dfs = []

for file in enrolment_files:
    df = pd.read_csv(enrolment_path + file)
    enrolment_dfs.append(df)

enrolment_df = pd.concat(enrolment_dfs, ignore_index=True)

In [15]:
enrolment_df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [24]:
enrolment_df['state'].unique()

array(['Meghalaya', 'Karnataka', 'Uttar Pradesh', 'Bihar', 'Maharashtra',
       'Haryana', 'Rajasthan', 'Punjab', 'Delhi', 'Madhya Pradesh',
       'West Bengal', 'Assam', 'Uttarakhand', 'Gujarat', 'Andhra Pradesh',
       'Tamil Nadu', 'Chhattisgarh', 'Jharkhand', 'Nagaland', 'Manipur',
       'Telangana', 'Tripura', 'Mizoram', 'Jammu and Kashmir',
       'Chandigarh', 'Sikkim', 'Odisha', 'Kerala',
       'The Dadra And Nagar Haveli And Daman And Diu',
       'Arunachal Pradesh', 'Himachal Pradesh', 'Goa',
       'Jammu And Kashmir', 'Dadra and Nagar Haveli and Daman and Diu',
       'Ladakh', 'Andaman and Nicobar Islands', 'Orissa', 'Pondicherry',
       'Puducherry', 'Lakshadweep', 'Andaman & Nicobar Islands',
       'Dadra & Nagar Haveli', 'Dadra and Nagar Haveli', 'Daman and Diu',
       'WEST BENGAL', 'Jammu & Kashmir', 'West  Bengal', 'Daman & Diu',
       'West Bangal', 'Westbengal', 'West bengal', 'andhra pradesh',
       'ODISHA', 'WESTBENGAL'], dtype=object)

In [18]:
enrolment_df = enrolment_df[
    enrolment_df['state'].astype(str) != '100000'
]

In [19]:
enrolment_df['date'] = pd.to_datetime(
    enrolment_df['date'],
    dayfirst=True,
    errors='coerce'
)

In [20]:
age_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
enrolment_df[age_cols] = enrolment_df[age_cols].fillna(0)

In [21]:
enrolment_df = enrolment_df.dropna(subset=['date', 'state', 'district'])

In [22]:
enrolment_df['year'] = enrolment_df['date'].dt.year
enrolment_df['month'] = enrolment_df['date'].dt.month

enrolment_df['total_enrolment'] = (
    enrolment_df['age_0_5'] +
    enrolment_df['age_5_17'] +
    enrolment_df['age_18_greater']
)

In [35]:
def normalize_state(s):
    s = str(s)
    s = s.strip()                 # remove leading/trailing spaces
    s = s.lower()                 # lowercase
    s = s.replace('&', 'and')     # replace &
    s = ' '.join(s.split())       # remove extra spaces
    return s

enrolment_df['state_norm'] = enrolment_df['state'].apply(normalize_state)


In [36]:
enrolment_df[['state', 'state_norm']].drop_duplicates().head(15)

Unnamed: 0,state,state_norm
0,Meghalaya,meghalaya
1,Karnataka,karnataka
2,Uttar Pradesh,uttar pradesh
5,Bihar,bihar
12,Maharashtra,maharashtra
15,Haryana,haryana
19,Rajasthan,rajasthan
21,Punjab,punjab
24,Delhi,delhi
25,Madhya Pradesh,madhya pradesh


In [37]:
state_mapping = {
    # West Bengal
    'west bengal': 'West Bengal',
    'west bangal': 'West Bengal',
    'westbengal': 'West Bengal',
    'west bengal ': 'West Bengal',

    # Odisha
    'orissa': 'Odisha',
    'odisha': 'Odisha',

    # Andhra Pradesh
    'andhra pradesh': 'Andhra Pradesh',

    # Jammu & Kashmir
    'jammu and kashmir': 'Jammu and Kashmir',
    'jammu kashmir': 'Jammu and Kashmir',

    # Dadra & Nagar Haveli + Daman & Diu
    'dadra and nagar haveli and daman and diu':
        'Dadra and Nagar Haveli and Daman and Diu',
    'dadra & nagar haveli':
        'Dadra and Nagar Haveli and Daman and Diu',
    'daman and diu':
        'Dadra and Nagar Haveli and Daman and Diu',
    'daman & diu':
        'Dadra and Nagar Haveli and Daman and Diu',
    'The Dadra And Nagar Haveli And Daman And Diu':
        'Dadra and Nagar Haveli and Daman and Diu',

    # Puducherry
    'pondicherry': 'Puducherry',
    'puducherry': 'Puducherry',

    # Andaman & Nicobar
    'andaman and nicobar islands': 'Andaman and Nicobar Islands',
    'andaman & nicobar islands': 'Andaman and Nicobar Islands',
}

enrolment_df['state_clean'] = enrolment_df['state_norm'].replace(state_mapping)

In [38]:
enrolment_df['state_clean'] = enrolment_df['state_clean'].str.title()

In [39]:
enrolment_df = enrolment_df.drop(columns=['state_norm'])

In [40]:
sorted(enrolment_df['state_clean'].unique())

['Andaman And Nicobar Islands',
 'Andhra Pradesh',
 'Arunachal Pradesh',
 'Assam',
 'Bihar',
 'Chandigarh',
 'Chhattisgarh',
 'Dadra And Nagar Haveli',
 'Dadra And Nagar Haveli And Daman And Diu',
 'Delhi',
 'Goa',
 'Gujarat',
 'Haryana',
 'Himachal Pradesh',
 'Jammu And Kashmir',
 'Jharkhand',
 'Karnataka',
 'Kerala',
 'Ladakh',
 'Lakshadweep',
 'Madhya Pradesh',
 'Maharashtra',
 'Manipur',
 'Meghalaya',
 'Mizoram',
 'Nagaland',
 'Odisha',
 'Puducherry',
 'Punjab',
 'Rajasthan',
 'Sikkim',
 'Tamil Nadu',
 'Telangana',
 'The Dadra And Nagar Haveli And Daman And Diu',
 'Tripura',
 'Uttar Pradesh',
 'Uttarakhand',
 'West Bengal']

In [41]:
enrolment_df['state_clean'].nunique()

38

In [42]:
enrolment_df['state'] = enrolment_df['state_clean']
enrolment_df = enrolment_df.drop(columns=['state_clean'])

In [43]:
enrolment_df['state'].unique() 

array(['Meghalaya', 'Karnataka', 'Uttar Pradesh', 'Bihar', 'Maharashtra',
       'Haryana', 'Rajasthan', 'Punjab', 'Delhi', 'Madhya Pradesh',
       'West Bengal', 'Assam', 'Uttarakhand', 'Gujarat', 'Andhra Pradesh',
       'Tamil Nadu', 'Chhattisgarh', 'Jharkhand', 'Nagaland', 'Manipur',
       'Telangana', 'Tripura', 'Mizoram', 'Jammu And Kashmir',
       'Chandigarh', 'Sikkim', 'Odisha', 'Kerala',
       'The Dadra And Nagar Haveli And Daman And Diu',
       'Arunachal Pradesh', 'Himachal Pradesh', 'Goa',
       'Dadra And Nagar Haveli And Daman And Diu', 'Ladakh',
       'Andaman And Nicobar Islands', 'Puducherry', 'Lakshadweep',
       'Dadra And Nagar Haveli'], dtype=object)

In [44]:
enrolment_df.to_csv("../cleaned_csv/enrolment_clean.csv", index=False)
biometric_df.to_csv("../cleaned_csv/biometric_clean.csv", index=False)
demographic_df.to_csv("../cleaned_csv/demographic_clean.csv", index=False)