# 04. Anomaly Detection - Analysis

This notebook visualizes anomalies and outliers in Aadhaar enrollment and update data to identify potential systemic issues or reporting errors.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Notebook-wide plotting defaults. The seaborn theme is applied first because
# it resets matplotlib's rcParams; the figure size override must come after it.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Ensure the folder that the savefig calls in later cells write into exists.
figures_dir = '../../visualizations'
os.makedirs(figures_dir, exist_ok=True)
print("Libraries imported.")

## 1. Load Processed Data

In [None]:
# Read the pre-computed anomaly table; every later cell works off `data`.
anomaly_csv = '../../processed_data/anomaly_detection_data.csv'
data = pd.read_csv(anomaly_csv)
print("Data loaded.")

## 2. Visualizing Enrollment Outliers (Box Plot)

In [None]:
# One box per state showing the spread of `total_enrollments` across its rows.
fig, ax = plt.subplots()
sns.boxplot(x='state', y='total_enrollments', data=data, ax=ax)
# State names are long, so the tick labels are drawn vertically.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Distribution of Enrollments by State (Identifying Outlier Districts)')
# bbox_inches='tight' grows the saved canvas to include the rotated labels;
# without it the PNG is cropped to the figure box and the state names are cut
# off, even though the inline preview looks fine.
fig.savefig('../../visualizations/04_enrollment_outliers_by_state.png', bbox_inches='tight')
plt.show()

## 3. High Z-Score Anomalies
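Listing records flagged as enrollment or demographic-update anomalies, i.e. those expected to lie more than 3 standard deviations away from the state mean (the anomaly flags are precomputed in the processed data).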

In [None]:
# Keep every row carrying at least one of the two precomputed anomaly flags
# (enrollment OR demographic update).
flagged = data['is_enr_anomaly'] | data['is_demo_anomaly']
anomalies = data[flagged]
print(f"Found {len(anomalies)} anomalous records.")
if not anomalies.empty:
    # Show both z-scores: a row can be flagged on the demographic side alone,
    # in which case enr_z_score by itself would not explain why it is listed.
    preview_cols = [
        'state', 'district', 'pincode', 'total_enrollments',
        'enr_z_score', 'demo_z_score',
    ]
    print(anomalies[preview_cols].head(10))

## 4. Anomaly Scatter Plot (Enrollment vs. Demographic Update Z-Scores)

In [None]:
# Each point is one row of `data`, placed by its two z-scores. Colour encodes
# the enrollment-anomaly flag only (not the demographic one).
fig, ax = plt.subplots()
ax.scatter(
    data['enr_z_score'],
    data['demo_z_score'],
    c=data['is_enr_anomaly'],
    cmap='coolwarm',
    alpha=0.5,
)
ax.set_xlabel('Enrollment Z-Score')
ax.set_ylabel('Demographic Update Z-Score')
ax.set_title('Anomaly Clustering: Enrollment vs Demographic Updates')
fig.savefig('../../visualizations/04_anomaly_clustering_scatter.png')
plt.show()