<a href="https://colab.research.google.com/github/Hemamalini-L/UIDAI_HACKATHON/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================
# UIDAI Hackathon 2026 Analysis
# ===============================

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Global plotting defaults for every figure drawn below:
# seaborn's "whitegrid" theme and a 10x5 default matplotlib figure size.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})


In [None]:
# Read the three CSV inputs from the current working directory into
# df_enrol, df_demo and df_bio (in that order). The file names are used
# exactly as given, including the doubled ".csv" suffix.
df_enrol, df_demo, df_bio = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Aadhar_Enrolment.csv.csv",
        "Aadhar_Demographic.csv.csv",
        "Aadhar_Biometric.csv.csv",
    )
)

# Show the first rows of the enrolment table as the cell output
df_enrol.head()


In [None]:
# Prepare the enrolment table: tidy column names, parse dates, and add a
# per-row total across the three age buckets.

# Rename the age-bucket columns. Reassigning (instead of inplace=True) keeps
# the cell re-runnable: rename() ignores keys that are no longer present.
df_enrol = df_enrol.rename(columns={
    'age_0_5': 'Age_0_5',
    'age_5_17': 'Age_5_17',
    'age_18_greater': 'Age_18_plus'
})

# Parse the date column; dayfirst=True makes ambiguous dates resolve as
# day/month/year.
df_enrol['date'] = pd.to_datetime(df_enrol['date'], dayfirst=True)

# Row-wise total over the age buckets. sum(axis=1) skips NaN by default, so a
# missing bucket no longer turns the whole row total into NaN (which `+` did);
# this matches how the demographic and biometric totals are computed.
df_enrol['Total_Enrolment'] = df_enrol[
    ['Age_0_5', 'Age_5_17', 'Age_18_plus']
].sum(axis=1)

df_enrol.head()


In [None]:
# Total enrolments per state, sorted from largest to smallest.
state_enrol = df_enrol.groupby('state')['Total_Enrolment'].sum().sort_values(ascending=False)

# Slice the ten largest states once and reuse the slice for both axes.
top_enrol_states = state_enrol.head(10)

# seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the same
# variable as x keeps the one-colour-per-bar look. dodge=False keeps bars
# full-width and centred on older seaborn releases as well.
ax = sns.barplot(
    x=top_enrol_states.index,
    y=top_enrol_states.values,
    hue=top_enrol_states.index,
    palette="viridis",
    dodge=False
)
# hue duplicates the x axis, so a legend carries no information; remove it
# if this seaborn version drew one.
bar_legend = ax.get_legend()
if bar_legend is not None:
    bar_legend.remove()

ax.set_xlabel("State")
ax.set_ylabel("Total Enrolments")
plt.title("Top 10 States by Aadhaar Enrolment")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Column-wise totals for the three age buckets across the whole table.
age_columns = ['Age_0_5', 'Age_5_17', 'Age_18_plus']
age_totals = df_enrol[age_columns].sum()

# Pie chart of each bucket's share; wedges are labelled with the column
# names and annotated with a one-decimal percentage.
fig, ax = plt.subplots()
ax.pie(
    age_totals,
    labels=age_totals.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title("Aadhaar Enrolment by Age Group")
plt.show()


In [None]:
# Tag every row with its calendar month (a monthly Period) and store it on
# the frame as 'month_year'.
enrol_month = df_enrol['date'].dt.to_period('M')
df_enrol['month_year'] = enrol_month

# Sum of Total_Enrolment for each month.
monthly_enrol = df_enrol['Total_Enrolment'].groupby(df_enrol['month_year']).sum()

# Line chart of the monthly totals with a marker at every month.
ax = monthly_enrol.plot(marker='o')
ax.set_title("Monthly Aadhaar Enrolment Trend")
ax.set_xlabel("Month-Year")
ax.set_ylabel("Total Enrolments")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Demographic-update table: parse dates, add a per-row total, and chart the
# ten states with the highest totals.

# Parse the date column; dayfirst=True makes ambiguous dates resolve as
# day/month/year.
df_demo['date'] = pd.to_datetime(df_demo['date'], dayfirst=True)

# Row-wise total of the two demo_age_* columns (NaN values are skipped).
df_demo['Total_Updates'] = df_demo[['demo_age_5_17','demo_age_17_']].sum(axis=1)

# Total updates per state, sorted from largest to smallest.
state_demo = df_demo.groupby('state')['Total_Updates'].sum().sort_values(ascending=False)

# Slice the ten largest states once and reuse the slice for both axes.
top_demo_states = state_demo.head(10)

# seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the same
# variable as x keeps the one-colour-per-bar look. dodge=False keeps bars
# full-width and centred on older seaborn releases as well.
ax = sns.barplot(
    x=top_demo_states.index,
    y=top_demo_states.values,
    hue=top_demo_states.index,
    palette="magma",
    dodge=False
)
# hue duplicates the x axis, so a legend carries no information; remove it
# if this seaborn version drew one.
bar_legend = ax.get_legend()
if bar_legend is not None:
    bar_legend.remove()

ax.set_xlabel("State")
ax.set_ylabel("Total Demographic Updates")
plt.title("Top 10 States by Demographic Updates")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Biometric-update table: parse dates, add a per-row total, and chart the
# ten states with the highest totals.

# Parse the date column; dayfirst=True makes ambiguous dates resolve as
# day/month/year.
df_bio['date'] = pd.to_datetime(df_bio['date'], dayfirst=True)

# Row-wise total of the two bio_age_* columns (NaN values are skipped).
df_bio['Total_Biometric_Updates'] = df_bio[['bio_age_5_17','bio_age_17_']].sum(axis=1)

# Total biometric updates per state, sorted from largest to smallest.
state_bio = df_bio.groupby('state')['Total_Biometric_Updates'].sum().sort_values(ascending=False)

# Slice the ten largest states once and reuse the slice for both axes.
top_bio_states = state_bio.head(10)

# seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the same
# variable as x keeps the one-colour-per-bar look. dodge=False keeps bars
# full-width and centred on older seaborn releases as well.
ax = sns.barplot(
    x=top_bio_states.index,
    y=top_bio_states.values,
    hue=top_bio_states.index,
    palette="coolwarm",
    dodge=False
)
# hue duplicates the x axis, so a legend carries no information; remove it
# if this seaborn version drew one.
bar_legend = ax.get_legend()
if bar_legend is not None:
    bar_legend.remove()

ax.set_xlabel("State")
ax.set_ylabel("Total Biometric Updates")
plt.title("Top 10 States by Biometric Updates")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Fit a linear regression of per-row Total_Enrolment on calendar year and
# report hold-out error metrics.

# The calendar year of each record is the only feature.
df_enrol['year'] = df_enrol['date'].dt.year

X, y = df_enrol[['year']], df_enrol['Total_Enrolment']

# 80/20 train/test split; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# fit() returns the estimator itself, so construction and fitting chain.
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
mae_value = mean_absolute_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)
print("MAE:", mae_value)
print("R2 Score:", r2_value)


In [None]:
# Build a one-column frame of years to forecast, then attach the fitted
# model's prediction for each year as 'Predicted_Enrolment'.
forecast_input = pd.DataFrame({'year': [2026, 2027, 2028]})
future_years = forecast_input.assign(
    Predicted_Enrolment=model.predict(forecast_input)
)
future_years


In [None]:
# Overlay the held-out targets and the model's predictions against year
# on a single set of axes.
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, label="Actual")
ax.scatter(X_test, y_pred, label="Predicted")
ax.set_xlabel("Year")
ax.set_ylabel("Total Enrolment")
ax.set_title("Actual vs Predicted Aadhaar Enrolment")
ax.legend()
plt.show()
