# Aadhaar Fraud Detection - YOLOv8 Training

This notebook trains a YOLOv8 model for detecting fraud indicators in Aadhaar documents:
- Document tampering
- Photo overlays
- Tampered QR codes
- Altered fonts
- Subtle modifications

## Prerequisites
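1. Upload your dataset to Google Drive in YOLO layout: a dataset folder containing `train/images`, `train/labels`, `valid/images` and, optionally, `test/images`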
2. Enable GPU runtime (Runtime → Change runtime type → GPU)

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install ultralytics -q
!pip install roboflow -q

# Verify GPU
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Mount Google Drive at /content/drive.
# The dataset path and the model output directory configured in later cells
# both live under this mount point, so this cell must run before them.
from google.colab import drive
drive.mount('/content/drive')

## 2. Configure Dataset Path

Update the path below to match your Google Drive folder structure.

In [None]:
# ============================================
# UPDATE THIS PATH TO YOUR DATASET LOCATION
# ============================================
DATASET_PATH = "/content/drive/MyDrive/AadhaarAuth_Dataset"
# ============================================

import os

# File extensions that are counted as images when summarising a split
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.webp')


def count_images(directory):
    """Return how many image files *directory* contains.

    Only entries whose name ends with one of IMAGE_EXTENSIONS
    (case-insensitive) are counted, so stray files such as ``.DS_Store``
    do not inflate the totals.

    Returns:
        The number of image files, or ``None`` when *directory* does not
        exist (or is not a directory).
    """
    if not os.path.isdir(directory):
        return None
    return sum(
        1
        for entry in os.listdir(directory)
        if entry.lower().endswith(IMAGE_EXTENSIONS)
    )


# Verify dataset exists and summarise each split.
# Train and valid are required; the test split is optional.
if os.path.exists(DATASET_PATH):
    print("Dataset found!")
    for split_label, split_dir, required in (
        ("Train", "train/images", True),
        ("Valid", "valid/images", True),
        ("Test", "test/images", False),
    ):
        n_images = count_images(os.path.join(DATASET_PATH, split_dir))
        if n_images is not None:
            print(f"{split_label} images: {n_images}")
        elif required:
            # Report the missing folder instead of crashing inside os.listdir
            print(f"ERROR: Required folder '{split_dir}' not found in {DATASET_PATH}")
else:
    print(f"ERROR: Dataset not found at {DATASET_PATH}")
    print("Please update the DATASET_PATH variable above")

In [None]:
# Create data.yaml configuration
import os

# Class names in class-ID order (UPDATE THIS based on your labels).
# The position in this list is the class ID, i.e. the first number on each
# line of the label files; `nc` and the `names` mapping are derived from it
# so the two can never disagree.
CLASS_NAMES = [
    "aadhaar_number",
    "photo",
    "qr_code",
    "name_field",
    "address_field",
]

# The test split is optional: only reference it when the folder is present
has_test_split = os.path.isdir(os.path.join(DATASET_PATH, 'test/images'))

yaml_lines = [
    "# Aadhaar Fraud Detection Dataset",
    f"path: {DATASET_PATH}",
    "train: train/images",
    "val: valid/images",
]
if has_test_split:
    yaml_lines.append("test: test/images")
yaml_lines += [
    "",
    "# Number of classes (derived from CLASS_NAMES)",
    f"nc: {len(CLASS_NAMES)}",
    "",
    "# Class names",
    "names:",
]
yaml_lines += [f"  {class_id}: {name}" for class_id, name in enumerate(CLASS_NAMES)]
data_yaml_content = "\n".join(yaml_lines) + "\n"

# Save data.yaml (this is the file the training and validation cells point at)
with open('/content/data.yaml', 'w', encoding='utf-8') as f:
    f.write(data_yaml_content)

print("Created data.yaml:")
print(data_yaml_content)

## 3. Explore Dataset

In [None]:
import matplotlib.pyplot as plt
import cv2
import numpy as np
from PIL import Image
import random

# Get sample images.
# Only keep entries that look like images so that stray files in the folder
# (e.g. .DS_Store) cannot make Image.open fail below.
train_images_path = os.path.join(DATASET_PATH, 'train/images')
image_exts = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.webp')
train_image_names = [
    name for name in os.listdir(train_images_path)
    if name.lower().endswith(image_exts)
]
sample_images = random.sample(train_image_names, min(4, len(train_image_names)))

# Display samples in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for ax in axes.flat:
    # Hide every frame up front so unused cells stay blank when the folder
    # holds fewer than four images
    ax.axis('off')
for ax, img_name in zip(axes.flat, sample_images):
    img_path = os.path.join(train_images_path, img_name)
    with Image.open(img_path) as img:
        ax.imshow(img)
    # Truncate long file names so titles do not overlap
    ax.set_title(img_name[:30] + '...' if len(img_name) > 30 else img_name)
plt.tight_layout()
plt.show()

In [None]:
# Analyze label distribution
from collections import Counter

train_labels_path = os.path.join(DATASET_PATH, 'train/labels')
class_counts = Counter()

for label_file in os.listdir(train_labels_path):
    # Only read annotation files; skip anything else in the folder, including
    # a classes.txt name list, whose lines do not start with a class ID
    if not label_file.endswith('.txt') or label_file == 'classes.txt':
        continue
    with open(os.path.join(train_labels_path, label_file), 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. trailing newline)
                continue
            # The first field of each annotation line is the class ID
            class_counts[int(fields[0])] += 1

print("Class Distribution:")
for class_id, count in sorted(class_counts.items()):
    print(f"  Class {class_id}: {count} annotations")

# Plot distribution with one bar and one integer tick per class ID
class_ids = sorted(class_counts)
plt.figure(figsize=(10, 5))
plt.bar(class_ids, [class_counts[class_id] for class_id in class_ids])
plt.xticks(class_ids)
plt.xlabel('Class ID')
plt.ylabel('Number of Annotations')
plt.title('Class Distribution in Training Set')
plt.show()

## 4. Train YOLOv8 Model

In [None]:
from ultralytics import YOLO

# Checkpoint to start from. The letter before ".pt" picks the model size:
# 'n' (nano), 's' (small), 'm' (medium), 'l' (large).
# 'n' or 's' train faster; 'm' or 'l' trade speed for better accuracy.
PRETRAINED_WEIGHTS = 'yolov8m.pt'  # medium: a balance between the two

model = YOLO(PRETRAINED_WEIGHTS)

print("Model loaded successfully!")

In [None]:
# Hyperparameters consumed by the training cell below
EPOCHS = 100          # how many epochs to train for
BATCH_SIZE = 16       # lower this if training runs out of memory
IMG_SIZE = 640        # image size passed to the trainer
PATIENCE = 20         # early stopping patience

# Results are written to Drive so they persist beyond the current session
OUTPUT_DIR = "/content/drive/MyDrive/AadhaarAuth_Models"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the settings so they are recorded in the notebook output
print("Training Configuration:")
for setting_label, setting_value in (
    ("Epochs", EPOCHS),
    ("Batch Size", BATCH_SIZE),
    ("Image Size", IMG_SIZE),
    ("Output", OUTPUT_DIR),
):
    print(f"  {setting_label}: {setting_value}")

In [None]:
# Start training.
# The arguments are collected in one dict (grouped by purpose) and then
# unpacked into model.train; keys, order and values are what the trainer sees.
train_args = {
    # Dataset and schedule (values come from the configuration cells above)
    'data': '/content/data.yaml',
    'epochs': EPOCHS,
    'batch': BATCH_SIZE,
    'imgsz': IMG_SIZE,
    'patience': PATIENCE,
    # Output location: runs land in OUTPUT_DIR/aadhaar_fraud_detector, which
    # is the folder the evaluation cell loads best.pt from
    'project': OUTPUT_DIR,
    'name': 'aadhaar_fraud_detector',
    'exist_ok': True,
    # Optimizer settings
    'pretrained': True,
    'optimizer': 'AdamW',
    'lr0': 0.001,
    'lrf': 0.01,
    'momentum': 0.937,
    'weight_decay': 0.0005,
    'warmup_epochs': 3,
    'warmup_momentum': 0.8,
    # Loss component weights
    'box': 7.5,
    'cls': 0.5,
    'dfl': 1.5,
    # Data loading and runtime
    'augment': True,
    'cache': True,
    'device': 0,  # GPU index 0; the setup cell reports whether one is available
    'workers': 2,
    'verbose': True,
}
results = model.train(**train_args)

## 5. Evaluate Model

In [None]:
# Reload the best checkpoint written by the training run
best_model_path = f"{OUTPUT_DIR}/aadhaar_fraud_detector/weights/best.pt"
best_model = YOLO(best_model_path)

# Run validation against the dataset described by data.yaml
val_results = best_model.val(data='/content/data.yaml')

# Summarise the box metrics, one per line, to four decimal places
box_metrics = val_results.box
print("\nValidation Results:")
for metric_label, metric_value in (
    ("mAP50", box_metrics.map50),
    ("mAP50-95", box_metrics.map),
    ("Precision", box_metrics.mp),
    ("Recall", box_metrics.mr),
):
    print(f"  {metric_label}: {metric_value:.4f}")

In [None]:
# Visualise predictions on a random sample of images.
# Note: despite the variable names, the samples come from the validation
# split (valid/images), not from the test split.
test_images_path = os.path.join(DATASET_PATH, 'valid/images')
# Keep only image files so stray entries in the folder are never sent to the model
image_exts = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.webp')
val_image_names = [
    name for name in os.listdir(test_images_path)
    if name.lower().endswith(image_exts)
]
test_samples = random.sample(val_image_names, min(6, len(val_image_names)))

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
for ax in axes.flat:
    # Hide every frame up front so unused cells stay blank when fewer than
    # six images are available
    ax.axis('off')
for ax, img_name in zip(axes.flat, test_samples):
    img_path = os.path.join(test_images_path, img_name)

    # Run inference; a separate name keeps the training `results` object intact
    predictions = best_model.predict(img_path, conf=0.25, verbose=False)

    # Plot result (the annotated image is BGR, so convert before showing it)
    result_img = predictions[0].plot()
    ax.imshow(cv2.cvtColor(result_img, cv2.COLOR_BGR2RGB))
    ax.set_title(f"{len(predictions[0].boxes)} detections")

plt.tight_layout()
# Save the grid next to the trained models so it persists on Drive
plt.savefig(f"{OUTPUT_DIR}/sample_predictions.png")
plt.show()

## 6. Export Model

In [None]:
# Export to different formats
print("Exporting model...")

# Export at the image size the model was trained with (IMG_SIZE) rather than
# a separately hard-coded value, so the two cannot drift apart.
# export() returns the path of the file it wrote; report that path instead of
# guessing it with str.replace, which would rewrite every '.pt' in the path.

# Export to ONNX (for cross-platform deployment)
onnx_path = best_model.export(format='onnx', imgsz=IMG_SIZE, simplify=True)

# Export to TorchScript (for PyTorch deployment)
torchscript_path = best_model.export(format='torchscript', imgsz=IMG_SIZE)

print("\nExported models:")
print(f"  PyTorch: {best_model_path}")
print(f"  ONNX: {onnx_path}")
print(f"  TorchScript: {torchscript_path}")

In [None]:
# Put a copy of the best checkpoint at the top of OUTPUT_DIR so it is easy
# to find and download
import shutil

final_model_path = f"{OUTPUT_DIR}/aadhaar_fraud_detector_best.pt"
shutil.copy(best_model_path, final_model_path)

# Closing summary, printed one entry per line
closing_lines = [
    "\n✅ Training Complete!",
    f"\nBest model saved to: {final_model_path}",
    "\nNext steps:",
    "1. Download the model from Google Drive",
    "2. Place it in your project: backend/models/aadhaar_fraud_detector.pt",
    "3. The backend will automatically use it for fraud detection",
]
print(*closing_lines, sep="\n")

## 7. Quick Test (Optional)

In [None]:
# Try the trained model on images uploaded from your machine
from google.colab import files

print("Upload an Aadhaar image to test:")
uploaded = files.upload()

for filename in uploaded:
    # One prediction result per uploaded file
    results = best_model.predict(filename, conf=0.25)
    prediction = results[0]

    # Show the annotated image (converted from BGR to RGB for matplotlib)
    plt.figure(figsize=(12, 8))
    annotated = cv2.cvtColor(prediction.plot(), cv2.COLOR_BGR2RGB)
    plt.imshow(annotated)
    plt.title(f"Detections: {len(prediction.boxes)}")
    plt.axis('off')
    plt.show()

    # List each detection as class name and confidence
    print("\nDetection Details:")
    for box in prediction.boxes:
        cls_id = int(box.cls[0])
        conf = float(box.conf[0])
        print(f"  - {prediction.names[cls_id]}: {conf:.2%} confidence")