# 05. Anomaly Detection

In [1]:
import sys
import os
import pandas as pd
import warnings
# Install a blanket 'ignore' filter: every warning category is suppressed from
# this point on (until some later filter overrides it).
warnings.filterwarnings(action='ignore')

# The project root is taken to be the parent of the current working directory,
# so the value depends on where the kernel was started.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Append only when absent, so re-running this cell never stacks duplicate entries.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

print(f"Project Root: {project_root}")


Project Root: c:\Users\Kaustab das\Desktop\Aadhaar Pulse AI


In [2]:
# Load the processed master table written by the pipeline.
processed_path = os.path.join(project_root, 'data', 'processed', 'merged_master_table.csv')

# Fail fast when the file is missing: merely printing a message would leave `df`
# undefined (or stale from a previous kernel session), and the next cell would
# then break with an unrelated NameError or silently analyse old data.
if not os.path.exists(processed_path):
    raise FileNotFoundError(
        f"Processed data not found at {processed_path}. Please run run_pipeline.py first."
    )

df = pd.read_csv(processed_path)

# The 'date' column is optional; convert it to datetimes only when it is present.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])

print(f"Data Loaded: {df.shape}")


Data Loaded: (2947681, 15)


## 5.1 Running Isolation Forest

In [3]:
from src.models.anomaly_detection import detect_anomalies, specific_fraud_rules

# Run the project's detector on the full table with contamination=0.01, then
# pass its result through specific_fraud_rules; the final frame is df_anom.
df_anom = specific_fraud_rules(detect_anomalies(df, contamination=0.01))

# Keep only the rows whose 'is_anomaly' value compares equal to True.
anomalies = df_anom.loc[df_anom['is_anomaly'].eq(True)]
print(f"Total Anomalies Detected: {len(anomalies)}")

# Preview the first ten flagged rows, restricted to the key columns.
display(anomalies.head(10)[['date', 'state', 'district', 'total_updates', 'anomaly_score']])


Total Anomalies Detected: 29473


Unnamed: 0,date,state,district,total_updates,anomaly_score
0,2025-03-02,Meghalaya,East Khasi Hills,0.0,-1
1,2025-03-09,Karnataka,Bengaluru Urban,0.0,-1
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,0.0,-1
3,2025-03-09,Uttar Pradesh,Aligarh,0.0,-1
5,2025-03-09,Bihar,Sitamarhi,0.0,-1
6,2025-03-09,Bihar,Sitamarhi,0.0,-1
7,2025-03-09,Uttar Pradesh,Bahraich,0.0,-1
9,2025-03-09,Bihar,Purbi Champaran,0.0,-1
10,2025-03-09,Uttar Pradesh,Maharajganj,0.0,-1
11,2025-03-09,Bihar,Sitamarhi,0.0,-1


## 5.2 Visualizing Anomalies

In [4]:
import plotly.express as px

# Plot a random subset of at most 5000 rows (the whole frame when it is smaller);
# the fixed random_state makes the same rows come back on every run.
plot_data = df_anom.sample(n=min(len(df_anom), 5000), random_state=42)

# Scatter of enrolment against updates, coloured by the anomaly flag.
fig = px.scatter(
    plot_data,
    x="total_enrolment",
    y="total_updates",
    color="is_anomaly",
    title="Anomaly Detection: Enrolment vs Updates",
    color_discrete_map={True: 'red', False: 'blue'},
)
fig.show()
