In [2]:
import pandas as pd
import plotly.express as px

In [4]:
# Read the cleaned UIDAI demographic-update records into a DataFrame.
# NOTE(review): the path is absolute ("/content/...") — adjust it when the
# notebook is run outside the environment it was authored in.
df = pd.read_csv(
    filepath_or_buffer="/content/UIDAI_cleaned_demography_data.csv",
)

# Preview the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,date_formatted,state_clean
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529,20250301,Uttar Pradesh
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375,20250301,Andhra Pradesh
2,01-03-2025,Gujarat,Rajkot,360006,65,765,20250301,Gujarat
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314,20250301,Andhra Pradesh
4,01-03-2025,Rajasthan,Udaipur,313801,45,785,20250301,Rajasthan


# Univariate Analysis

In [35]:
# Univariate view of the combined update count.
# Each row of df is a single record (one date / state / district / pincode
# combination), so this histogram describes the spread of per-record totals.
# A long right tail would point to a small number of records with unusually
# high volumes compared with the bulk of the data.

# Combined updates per record = age 5–17 updates + age 17+ updates.
# This column is stored on df because later cells aggregate it.
df['total_updates'] = df['demo_age_5_17'].add(df['demo_age_17_'])

# Histogram settings: one variable on x, 60 bins to expose spread and spikes.
histogram_options = dict(
    x='total_updates',
    nbins=60,
    title='Distribution of Total Aadhaar Updates',
)
fig = px.histogram(df, **histogram_options)

# Render the figure in the notebook.
fig.show()


In [34]:
# Univariate box plot of daily Aadhaar update volume.
# The box shows the median and spread of the per-day totals; points beyond
# the whiskers are days whose volume is far above (or below) the typical level.
# Requires df['total_updates'], which is created in the histogram cell above.

# Collapse every state/district/pincode record into a single total per date.
daily = (
    df[['date', 'total_updates']]
    .groupby('date', as_index=False)
    .sum()
)

# Only y is supplied, so the plot summarises one variable: the daily total.
fig = px.box(
    data_frame=daily,
    y='total_updates',
    title='Daily Aadhaar Update Volume Distribution',
)

# Render the figure in the notebook.
fig.show()


# Bivariate Analysis


In [25]:
# Bivariate time-series: total Aadhaar demographic updates per day.
# Useful for spotting trends, recurring patterns, and sudden spikes over time.

# Combined updates per record (recomputed here so this cell does not depend
# on the univariate cells having been run first).
df['total_updates'] = df['demo_age_5_17'] + df['demo_age_17_']

# The 'date' column holds day-first strings (e.g. "01-03-2025" = 1 March 2025).
# Grouping on the raw strings sorts them lexicographically ("01-03", "01-04",
# "02-03", ...) and Plotly treats them as categories, so the line would jump
# between months instead of running chronologically. Parse to real datetimes
# and group on those; df itself is left untouched.
event_date = pd.to_datetime(df['date'], format='%d-%m-%Y')

# Aggregate the granular (state/district/pincode) rows into one total per day.
# groupby sorts its keys, so the result is in chronological order.
daily_trend = (
    df.groupby(event_date)['total_updates']
    .sum()
    .reset_index()
)

# Line chart of volume against a true date axis.
fig = px.line(
    daily_trend,
    x='date',                   # Time dimension (datetime64)
    y='total_updates',          # Update volume
    title='Daily Aadhaar Demographic Updates Trend'
)

# Display the time-series plot
fig.show()


In [36]:
# Bivariate time-series comparing daily update volumes of the two age groups
# (5–17 and 17+). Shows which group contributes most to any given spike.

# The 'date' column holds day-first strings (e.g. "01-03-2025" = 1 March 2025).
# Grouping on the raw strings orders them lexicographically and Plotly treats
# them as categories, which scrambles the timeline. Group on parsed datetimes
# instead; df itself is left untouched.
event_date = pd.to_datetime(df['date'], format='%d-%m-%Y')

# Combine all state/district-level records into daily totals per age group.
# groupby sorts its keys, so rows come out in chronological order.
age_trend = (
    df.groupby(event_date)[['demo_age_5_17', 'demo_age_17_']]
    .sum()
    .reset_index()
)

# One line per age-group column, plotted against a true date axis.
fig = px.line(
    age_trend,
    x='date',                         # Time dimension (datetime64)
    y=['demo_age_5_17', 'demo_age_17_'],  # Age-group-wise update counts
    title='Age-wise Aadhaar Update Trends',
    labels={
        'value': 'Number of Updates',
        'variable': 'Age Group'
    }
)

# Display the age-wise trend plot
fig.show()

In [10]:
# Count the distinct values in date_formatted, i.e. how many different dates
# the dataset covers (nunique ignores missing values by default).
df['date_formatted'].nunique()

95

# Trivariate Analysis

In [37]:
# Trivariate analysis: daily update volume per district over time.
# Variables involved:
# 1) Date (time)
# 2) District (geographic category)
# 3) Total Aadhaar updates (volume)
# Highlights district-specific spikes and lets districts be compared.
# Requires df['total_updates'], which is created in an earlier cell.

# The 'date' column holds day-first strings (e.g. "01-03-2025" = 1 March 2025).
# Left as strings, dates sort lexicographically and Plotly puts them on a
# categorical axis, so each district's line would zig-zag across months.
# Group on parsed datetimes instead; df itself is left untouched.
event_date = pd.to_datetime(df['date'], format='%d-%m-%Y')

# Daily totals for each district. groupby sorts by (district, date), so every
# district's rows are already in chronological order for the line trace.
# NOTE(review): grouping uses the district name only — if the same name occurs
# in more than one state those series are merged; confirm against the data.
district_trend = (
    df.groupby(['district', event_date])['total_updates']
    .sum()
    .reset_index()
)

# One coloured line per district against a true date axis.
fig = px.line(
    district_trend,
    x='date',              # Time dimension (datetime64)
    y='total_updates',     # Update volume
    color='district',      # District-wise comparison
    title='District-level Aadhaar Update Spikes'
)

# Display the district-level trend plot
fig.show()



In [31]:
# Trivariate time-series: adult (17+) update volume per state over time.
# Variables involved:
# 1) Date (time dimension)
# 2) State (geographic dimension)
# 3) Adult Aadhaar update volume (age 17+)
# Lets states be compared on when and how strongly their volumes rise.

# The 'date' column holds day-first strings (e.g. "01-03-2025" = 1 March 2025).
# Left as strings, dates sort lexicographically and Plotly puts them on a
# categorical axis, so the lines would not follow calendar order.
# Group on parsed datetimes instead; df itself is left untouched.
event_date = pd.to_datetime(df['date'], format='%d-%m-%Y')

# Daily totals per state for both age groups (only 17+ is plotted below).
# groupby sorts by (state, date), so each state's rows are chronological.
state_age_time = (
    df.groupby(['state_clean', event_date])[['demo_age_5_17', 'demo_age_17_']]
    .sum()
    .reset_index()
)

# One coloured line per state against a true date axis.
fig = px.line(
    state_age_time,
    x='date',                 # Time axis (datetime64)
    y='demo_age_17_',         # Adult (17+) Aadhaar updates
    color='state_clean',      # State-wise comparison
    title='Adult Aadhaar Updates Over Time by State'
)

# Display the state-wise adult update trend plot
fig.show()


In [32]:
# Trivariate heatmap: total Aadhaar updates per state per day.
# Variables involved:
# 1) Date (x-axis, time dimension)
# 2) State (y-axis, geographic dimension)
# 3) Total updates (z, represented by colour intensity)
# With the Viridis scale, dark purple cells are low volumes and bright yellow
# cells are the highest volumes, so peak periods stand out as bright columns.
# Requires df['total_updates'], which is created in an earlier cell.

# Daily update volume for each state.
state_heatmap = df.groupby(
    ['state_clean', 'date'],
    as_index=False
)['total_updates'].sum()

# The 'date' labels are day-first strings (e.g. "01-03-2025" = 1 March 2025),
# so the categorical x-axis would otherwise follow lexicographic order
# ("01-03", "01-04", "02-03", ...). Keep one column per date label but tell
# Plotly the chronological order explicitly.
date_order = sorted(
    state_heatmap['date'].unique(),
    key=lambda label: pd.to_datetime(label, format='%d-%m-%Y'),
)

fig = px.density_heatmap(
    state_heatmap,
    x='date',                 # Time axis (one category per date)
    y='state_clean',          # State-wise comparison
    z='total_updates',        # Update volume (colour intensity)
    category_orders={'date': date_order},  # Chronological x-axis
    color_continuous_scale='Viridis',
    title='State-wise Aadhaar Update Intensity Over Time'
)

fig.show()
