# 01. Data Understanding & Profiling

In [2]:
import sys
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Add the project root (the parent of the notebook's working directory) to the
# import path so that the `src` package can be imported. The membership check
# keeps the cell idempotent: re-running it no longer appends a duplicate entry
# to sys.path each time.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.preprocessing.data_loader import load_all_datasets

# Define the data directory: <project root>/data/raw
data_dir = os.path.join(project_root, 'data', 'raw')
print(f"Loading data from: {data_dir}")

# Load all datasets.
# NOTE(review): judging by the `.keys()` call here and the lookups in later
# cells, the loader returns a dict keyed by dataset name — confirm against
# src/preprocessing/data_loader.py.
datasets = load_all_datasets(data_dir)
print("Datasets loaded:", datasets.keys())

Loading data from: c:\Users\Kaustab das\Desktop\Aadhaar Pulse AI\data\raw
Datasets loaded: dict_keys(['enrolment', 'biometric', 'demographic'])


## 1. Enrolment Data Profiling
Let's examine the structure, missing values, and basic statistics of the **Enrolment** dataset.

In [3]:
# Profile the enrolment dataset: shape, first rows, dtype / non-null info,
# per-column missing-value counts, and summary statistics.
# `.get` is used instead of `datasets['enrolment']` so that a dataset that is
# absent from the mapping reaches the "not found" message below instead of
# raising KeyError; a key that is present but holds None is handled the same way.
df_enrol = datasets.get('enrolment')

if df_enrol is None:
    print("Enrolment dataset not found.")
else:
    print("Shape:", df_enrol.shape)
    display(df_enrol.head())
    print("\n--- Info ---")
    df_enrol.info()  # prints directly; returns None, so it is not wrapped in print()
    print("\n--- Missing Values ---")
    print(df_enrol.isnull().sum())
    print("\n--- Descriptive Statistics ---")
    # NOTE(review): describe() also summarises `pincode`, which is an identifier;
    # its mean/std are not meaningful quantities.
    display(df_enrol.describe())

Shape: (1006029, 7)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37
1,2025-03-09,Karnataka,Bengaluru Urban,560043,14,33,39
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,2025-03-09,Uttar Pradesh,Aligarh,202133,62,29,15
4,2025-03-09,Karnataka,Bengaluru Urban,560016,14,16,21



--- Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006029 entries, 0 to 1006028
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   date            1006029 non-null  datetime64[ns]
 1   state           1006029 non-null  object        
 2   district        1006029 non-null  object        
 3   pincode         1006029 non-null  int64         
 4   age_0_5         1006029 non-null  int64         
 5   age_5_17        1006029 non-null  int64         
 6   age_18_greater  1006029 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 53.7+ MB

--- Missing Values ---
date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

--- Descriptive Statistics ---


Unnamed: 0,date,pincode,age_0_5,age_5_17,age_18_greater
count,1006029,1006029.0,1006029.0,1006029.0,1006029.0
mean,2025-10-23 18:26:44.387150336,518641.5,3.525709,1.710074,0.1673441
min,2025-03-02 00:00:00,100000.0,0.0,0.0,0.0
25%,2025-09-19 00:00:00,363641.0,1.0,0.0,0.0
50%,2025-10-27 00:00:00,517417.0,2.0,0.0,0.0
75%,2025-11-15 00:00:00,700104.0,3.0,1.0,0.0
max,2025-12-31 00:00:00,855456.0,2688.0,1812.0,855.0
std,,205636.0,17.53851,14.36963,3.220525


## 2. Biometric Update Data Profiling
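Now let's look at the **Biometric Update** dataset. It records counts of biometric updates (photos, fingerprints, and iris scans), broken down by age group (`bio_age_5_17`, `bio_age_17_`) for each date, state, district, and pincode.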

In [4]:
# Profile the biometric-update dataset: shape, first rows, dtype info,
# per-column missing-value counts, and summary statistics.
# `.get` is used instead of `datasets['biometric']` so that a dataset that is
# absent from the mapping reaches the "not found" message below instead of
# raising KeyError; a key that is present but holds None is handled the same way.
df_bio = datasets.get('biometric')

if df_bio is None:
    print("Biometric dataset not found.")
else:
    print("Shape:", df_bio.shape)
    display(df_bio.head())
    print("\n--- Info ---")
    df_bio.info()  # prints directly; returns None, so it is not wrapped in print()
    print("\n--- Missing Values ---")
    print(df_bio.isnull().sum())
    print("\n--- Descriptive Statistics ---")
    # NOTE(review): describe() also summarises `pincode`, which is an identifier;
    # its mean/std are not meaningful quantities.
    display(df_bio.describe())

Shape: (1861108, 6)


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,2025-03-01,Haryana,Mahendragarh,123029,280,577
1,2025-03-01,Bihar,Madhepura,852121,144,369
2,2025-03-01,Jammu and Kashmir,Punch,185101,643,1091
3,2025-03-01,Bihar,Bhojpur,802158,256,980
4,2025-03-01,Tamil Nadu,Madurai,625514,271,815



--- Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype         
---  ------        -----         
 0   date          datetime64[ns]
 1   state         object        
 2   district      object        
 3   pincode       int64         
 4   bio_age_5_17  int64         
 5   bio_age_17_   int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 85.2+ MB

--- Missing Values ---
date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64

--- Descriptive Statistics ---


Unnamed: 0,date,pincode,bio_age_5_17,bio_age_17_
count,1861108,1861108.0,1861108.0,1861108.0
mean,2025-10-22 22:32:04.288541952,521761.2,18.39058,19.09413
min,2025-03-01 00:00:00,110001.0,0.0,0.0
25%,2025-09-18 00:00:00,391175.0,1.0,1.0
50%,2025-11-04 00:00:00,522401.0,3.0,4.0
75%,2025-12-04 00:00:00,686636.2,11.0,10.0
max,2025-12-29 00:00:00,855456.0,8002.0,7625.0
std,,198162.7,83.70421,88.06502


## 3. Demographic Update Data Profiling
Finally, we profile the **Demographic Update** dataset, which tracks changes to name, address, date of birth, and similar fields. This dataset is crucial for migration analysis.

In [5]:
# Profile the demographic-update dataset: shape, first rows, dtype info,
# per-column missing-value counts, and summary statistics.
# `.get` is used instead of `datasets['demographic']` so that a dataset that is
# absent from the mapping reaches the "not found" message below instead of
# raising KeyError; a key that is present but holds None is handled the same way.
df_demo = datasets.get('demographic')

if df_demo is None:
    print("Demographic dataset not found.")
else:
    print("Shape:", df_demo.shape)
    display(df_demo.head())
    print("\n--- Info ---")
    df_demo.info()  # prints directly; returns None, so it is not wrapped in print()
    print("\n--- Missing Values ---")
    print(df_demo.isnull().sum())
    print("\n--- Descriptive Statistics ---")
    # NOTE(review): describe() also summarises `pincode`, which is an identifier;
    # its mean/std are not meaningful quantities.
    display(df_demo.describe())

Shape: (2071700, 6)


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,2025-03-01,Uttar Pradesh,Gorakhpur,273213,49,529
1,2025-03-01,Andhra Pradesh,Chittoor,517132,22,375
2,2025-03-01,Gujarat,Rajkot,360006,65,765
3,2025-03-01,Andhra Pradesh,Srikakulam,532484,24,314
4,2025-03-01,Rajasthan,Udaipur,313801,45,785



--- Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071700 entries, 0 to 2071699
Data columns (total 6 columns):
 #   Column         Dtype         
---  ------         -----         
 0   date           datetime64[ns]
 1   state          object        
 2   district       object        
 3   pincode        int64         
 4   demo_age_5_17  int64         
 5   demo_age_17_   int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 94.8+ MB

--- Missing Values ---
date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64

--- Descriptive Statistics ---


Unnamed: 0,date,pincode,demo_age_5_17,demo_age_17_
count,2071700,2071700.0,2071700.0,2071700.0
mean,2025-11-02 19:25:06.470627840,527831.8,2.347552,21.44701
min,2025-03-01 00:00:00,100000.0,0.0,0.0
25%,2025-10-15 00:00:00,396469.0,0.0,2.0
50%,2025-11-08 00:00:00,524322.0,1.0,6.0
75%,2025-12-06 00:00:00,695507.0,2.0,15.0
max,2025-12-29 00:00:00,855456.0,2690.0,16166.0
std,,197293.3,14.90355,125.2498


## 4. Initial Observations
*   **Granularity:** Are the datasets consistent in terms of State/District names?
*   **Date Ranges:** We need to check if all three datasets cover the same time period.
*   **Missing Data:** The missing-value counts above are zero for every column (including `pincode`) in all three datasets, so no column shows a high missing rate at this stage.

In [6]:
# Check Date Ranges
# For every loaded dataset that has a `date` column, report the earliest and
# latest date and the number of days between them.
for name, df in datasets.items():
    # Skip datasets that failed to load (None) or that carry no date column.
    if df is None or 'date' not in df.columns:
        continue

    # Compute each bound once; the original scanned the full column twice for
    # each of min() and max(), which is wasteful on frames with millions of rows.
    first_date = df['date'].min()
    last_date = df['date'].max()

    print(f"--- {name.capitalize()} Date Range ---")
    print(f"Min Date: {first_date}")
    print(f"Max Date: {last_date}")
    # Difference between the two bounds, i.e. the end date is not counted as
    # an additional day.
    print(f"Total Days: {(last_date - first_date).days}")
    print("-" * 30)

--- Enrolment Date Range ---
Min Date: 2025-03-02 00:00:00
Max Date: 2025-12-31 00:00:00
Total Days: 304
------------------------------
--- Biometric Date Range ---
Min Date: 2025-03-01 00:00:00
Max Date: 2025-12-29 00:00:00
Total Days: 303
------------------------------
--- Demographic Date Range ---
Min Date: 2025-03-01 00:00:00
Max Date: 2025-12-29 00:00:00
Total Days: 303
------------------------------
