# 05. Predictive Analytics - Analysis

This notebook uses simple predictive models to forecast future Aadhaar service demand based on current regional patterns.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np
import os

# Shared plot styling: seaborn whitegrid theme and a 12x6 default figure size.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Make sure the figure output directory exists; a later cell saves a PNG into it.
os.makedirs('../../visualizations', exist_ok=True)
print("Libraries imported.")

## 1. Load Processed Data

In [None]:
# Read the predictive dataset from the processed_data folder into `data`,
# the frame every later cell in this notebook works from.
predictive_csv_path = '../../processed_data/predictive_data.csv'
data = pd.read_csv(predictive_csv_path)
print("Data loaded.")

## 2. Predicting Updates from Enrollments
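We fit a simple linear regression to see whether update demand (demographic plus biometric updates combined) can be predicted from historical enrollment volume. The model is fit and scored on the same rows, so the reported R² describes in-sample fit rather than accuracy on future data.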

In [None]:
# Feature matrix: one column of enrollment counts (double brackets keep it 2-D).
X = data[['total_enrollments']].values

# Target: demographic and biometric updates combined, one value per row.
total_updates = data['total_demo_updates'] + data['total_bio_updates']
y = total_updates.values

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X, y)

# Fitted values for the same rows the model was trained on.
y_pred = model.predict(X)

# Scatter of observed values with the fitted line drawn on the same axes.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue', alpha=0.3, label='Actual Data')
ax.plot(X, y_pred, color='red', linewidth=2, label='Regression Line')
ax.set_title('Predicting Update Demand from Enrollment Volume')
ax.set_xlabel('Total Enrollments')
ax.set_ylabel('Total Updates')
ax.legend()

# Write the figure to disk, then display it.
fig.savefig('../../visualizations/05_update_demand_prediction_regression.png')
plt.show()

# R^2 is computed on the training rows, so it is an in-sample fit measure.
r_squared = model.score(X, y)
print(f"Model R^2 score: {r_squared:.4f}")

## 3. High Prediction Residuals
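This section identifies the districts whose actual update volume deviates most from the model's prediction, listing the ten with the largest positive residuals (actual minus predicted updates).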

In [None]:
# Observed update volume per row: demographic plus biometric updates.
observed_updates = data['total_demo_updates'] + data['total_bio_updates']

# Store the fitted values alongside the observations, then compute
# residual = observed - predicted (positive means more updates than the model expects).
data['predicted_updates'] = y_pred
data['residual'] = observed_updates - data['predicted_updates']

# Order rows by residual, largest first, and keep the first ten.
top_deviations = (
    data
    .sort_values(by='residual', ascending=False)
    .head(10)
)

report_columns = ['state', 'district', 'total_enrollments', 'residual']
print("Top 10 Districts with Higher-than-Expected Update Demand:")
print(top_deviations[report_columns])