# 04. Anomaly Detection - Preprocessing

This notebook prepares the data for anomaly detection. It calculates Z-scores and flags potential outliers in enrollment and update data.

In [None]:
import pandas as pd
import numpy as np
import sys
import os

# Make the directory two levels above the working directory importable so the
# `utils` package below can be found. The relative path is resolved against the
# current working directory — presumably the notebook's own folder; confirm
# when running from elsewhere.
sys.path.append(os.path.abspath('../../'))
from utils.data_loader import load_enrollment_data, load_demographic_data, load_biometric_data

print("Libraries imported successfully.")

## 1. Load and Merge Data

In [None]:
# Load the three raw datasets; each loader receives the same relative root.
enr_df = load_enrollment_data('../../')
demo_df = load_demographic_data('../../')
bio_df = load_biometric_data('../../')

# Every table is rolled up to one row per (state, district, pincode).
key_cols = ['state', 'district', 'pincode']
print("Aggregating data... (Optimized)")

# Enrollment: sum the numeric columns per location, then total the age bands.
enr_agg = (
    enr_df.groupby(key_cols)
    .sum(numeric_only=True)
    .reset_index()
    .assign(
        total_enrollments=lambda t: t['age_0_5'] + t['age_5_17'] + t['age_18_greater']
    )
)

# Demographic updates: same roll-up, totalled over the two age bands.
demo_agg = (
    demo_df.groupby(key_cols)
    .sum(numeric_only=True)
    .reset_index()
    .assign(total_demo_updates=lambda t: t['demo_age_5_17'] + t['demo_age_17_'])
)

# Biometric updates: same roll-up, totalled over the two age bands.
bio_agg = (
    bio_df.groupby(key_cols)
    .sum(numeric_only=True)
    .reset_index()
    .assign(total_bio_updates=lambda t: t['bio_age_5_17'] + t['bio_age_17_'])
)

# Outer joins keep every location that appears in any of the three tables;
# the gaps this leaves (a location missing from one table) are filled with 0.
anomaly_data = (
    enr_agg
    .merge(demo_agg, on=key_cols, how='outer')
    .merge(bio_agg, on=key_cols, how='outer')
    .fillna(0)
)

print("Data merged for anomaly detection.")

## 2. Statistical Anomaly Flagging
We use Z-scores (calculated via pandas), computed within each state, to identify locations (state/district/pincode rows) whose enrollment or update totals are unusually far from the state mean.

In [None]:
def calculate_zscore(series):
    """Return the Z-score of every value in *series*.

    The score is ``(value - mean) / std`` using pandas' default sample
    standard deviation.

    Args:
        series: A numeric pandas Series (for example one group handed over by
            ``groupby(...).transform``).

    Returns:
        A float Series aligned to ``series.index``. When the series has at
        most one element, or all of its values are identical (zero spread),
        every score is 0.0 instead of a division by zero / NaN.
    """
    # Degenerate inputs get a zero-filled Series rather than a bare scalar so
    # that callers always receive the same type, aligned to the input index.
    if len(series) <= 1:
        return pd.Series(0.0, index=series.index)

    std = series.std()
    if std == 0:
        return pd.Series(0.0, index=series.index)

    return (series - series.mean()) / std

print("Calculating Z-scores...")
# Each (state, district, pincode) row is scored against the other rows of the
# same state, so every state is judged on its own scale.
anomaly_data['enr_z_score'] = anomaly_data.groupby('state')['total_enrollments'].transform(calculate_zscore)
anomaly_data['demo_z_score'] = anomaly_data.groupby('state')['total_demo_updates'].transform(calculate_zscore)

# Two-sided test: a row is flagged when its total lies more than 3 standard
# deviations above OR below its state mean (|Z| > 3).
anomaly_data['is_enr_anomaly'] = anomaly_data['enr_z_score'].abs() > 3
anomaly_data['is_demo_anomaly'] = anomaly_data['demo_z_score'].abs() > 3

# Biometric update totals are merged into anomaly_data as well, so they are
# scored and flagged the same way. These columns are appended after the
# existing ones, leaving the original column order untouched.
anomaly_data['bio_z_score'] = anomaly_data.groupby('state')['total_bio_updates'].transform(calculate_zscore)
anomaly_data['is_bio_anomaly'] = anomaly_data['bio_z_score'].abs() > 3

print("Anomaly flags generated.")

## 3. Save Processed Data

In [None]:
# Create the output folder if it is missing, then write the flagged table as
# CSV without the DataFrame index column.
output_dir = '../../processed_data'
output_csv = '../../processed_data/anomaly_detection_data.csv'

os.makedirs(output_dir, exist_ok=True)
anomaly_data.to_csv(output_csv, index=False)

print("Processed data saved to processed_data/anomaly_detection_data.csv")