# 03. Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import plotly.express as px

# Load the processed master table from ../data/processed and parse the
# 'date' column into pandas datetimes in the same step.
processed_path = os.path.join("..", "data", "processed", "merged_master_table.csv")
df = pd.read_csv(processed_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
print(f"Data Loaded: {df.shape}")

# Descriptive statistics for the loaded frame
display(df.describe())

### 1. Correlation Heatmap
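Let's see how Enrolments correlate with Updates. Does high `age_0_5` enrolment lead to high `bio_age_5_17` updates? Note that this is a same-row correlation: it shows whether the counts move together within a record, not whether one follows the other after a time lag.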

In [None]:
# Columns included in the correlation matrix: the three enrolment age
# bands, then the 'bio_' and 'demo_' update columns (presumably biometric
# and demographic update counts -- confirm against the data dictionary).
corr_cols = [
    'age_0_5', 'age_5_17', 'age_18_greater',
    'bio_age_5_17', 'bio_age_17_',
    'demo_age_5_17', 'demo_age_17_'
]
corr_matrix = df[corr_cols].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
# "coolwarm" is a diverging colormap. Pinning the colour scale to the full
# correlation range [-1, 1] puts its neutral midpoint at zero, so red always
# means positive and blue always means negative correlation. Without these
# limits the midpoint is taken from the data's own min/max, and a pale cell
# would not correspond to "no correlation".
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",  # two decimals for every annotated cell
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    ax=corr_ax,
)
corr_ax.set_title("Enrollment vs Updates Correlation")
plt.show()

### 2. State-wise Activity
Which states have the highest update intensity?

In [None]:
# Sum enrolment and update counts per state, keeping 'state' as a column.
volume_cols = ['total_enrolment', 'total_updates']
state_summary = df.groupby('state', as_index=False)[volume_cols].sum()

# Grouped bars: one enrolment bar and one updates bar per state.
fig = px.bar(
    state_summary,
    x='state',
    y=volume_cols,
    barmode='group',
    title="State-wise Enrolment vs Updates",
)
fig.show()

### 3. Distribution Analysis
Understanding the spread of daily updates. Is the workload evenly distributed or highly skewed?
Skewed data often indicates operational bottlenecks or specific "hotspot" days.

In [None]:
# Histogram of total_updates (50 bins) with a KDE curve overlaid.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='total_updates', bins=50, kde=True, color='purple', ax=hist_ax)
hist_ax.set(
    title="Distribution of Daily Total Updates (Workload Skew)",
    xlabel="Number of Updates",
    ylabel="Frequency",
)
plt.show()

# Side-by-side boxplots of total_enrolment and total_updates to expose outliers.
box_fig, box_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df[['total_enrolment', 'total_updates']], ax=box_ax)
box_ax.set_title("Outlier Detection: Enrolment vs Updates")
plt.show()

### 4. District-Level Hotspots
Identify the Top 10 Districts contributing to the highest volume of updates. These are likely urban centers or migration hubs.

In [None]:
# Total updates per (state, district) pair.
district_totals = df.groupby(['state', 'district'])['total_updates'].sum()

# Keep the ten largest totals and build a "district, state" label for the axis.
top_districts = (
    district_totals
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
    .assign(location=lambda top: top['district'] + ", " + top['state'])
)

# Horizontal bars, coloured by volume.
fig = px.bar(
    top_districts,
    x='total_updates',
    y='location',
    orientation='h',
    title="Top 10 Busiest Districts (Total Updates)",
    color='total_updates',
    color_continuous_scale='Magma',
)
# Order the y-axis categories by their bar totals, ascending.
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

### 5. Update Composition Analysis
What type of updates dominate the ecosystem: Biometric (the age 5/15 updates) or Demographic (address/name changes)?
This helps in allocating the right type of kits (biometric capture devices vs. document scanners).

In [None]:
# Column-wise totals of the two update-type columns, reshaped into a
# two-column frame ('Update Type', 'Count') for the pie chart.
update_totals = df[['total_bio_updates', 'total_demo_updates']].sum()
composition = update_totals.rename_axis('Update Type').reset_index(name='Count')

fig = px.pie(
    composition,
    names='Update Type',
    values='Count',
    title='Overall Composition: Biometric vs Demographic Updates',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.show()

# Per-state totals of the two update-type columns, drawn as stacked bars.
update_type_cols = ['total_bio_updates', 'total_demo_updates']
state_comp = df.groupby('state')[update_type_cols].sum().reset_index()

stacked_bar_options = dict(
    x='state',
    y=update_type_cols,
    title="State-wise Update Type Breakdown",
    barmode='stack',
    # Replace plotly's default 'value'/'variable' labels with readable ones.
    labels={'value': 'Count', 'variable': 'Update Type'},
)
fig = px.bar(state_comp, **stacked_bar_options)
fig.show()

### 6. Age Demographics: 0-5 Enrolment Trends
Analyzing the foundation of the Aadhaar ecosystem. High 0-5 enrolments today predict a high Biometric Update load over the next 5 years, as these children reach the age-5 update.

In [None]:
# Mean of the age_0_5 column, marked on the histogram as a reference line.
mean_age_0_5 = df['age_0_5'].mean()

# Histogram of the age_0_5 column using up to 30 bins.
fig = px.histogram(
    df,
    x='age_0_5',
    nbins=30,
    title="Distribution of Age 0-5 Enrolments (Daily)",
    color_discrete_sequence=['green'],
)
fig.add_vline(
    x=mean_age_0_5,
    line_dash="dash",
    line_color="red",
    annotation_text="Mean",
)
fig.show()

# Scatter of age_0_5 against total_updates, one point per row of df,
# coloured by state, with the district shown on hover.
scatter_options = dict(
    x='age_0_5',
    y='total_updates',
    color='state',
    title="Scatter: New Child Enrolments vs Total System Load",
    hover_data=['district'],
)
fig = px.scatter(df, **scatter_options)
fig.show()