<a href="https://colab.research.google.com/github/JayaShreeGit/CS579/blob/master/Copy_of_train_imagenet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Randomized Smoothing Training - ImageNet Edition

This notebook trains image classifiers with randomized smoothing for provable adversarial robustness on **ImageNet** dataset.

**Note:** CIFAR-100 code is preserved as comments. You can switch between datasets by commenting/uncommenting the appropriate sections.

**Setup Instructions:**
1. Ensure ImageNet dataset is available in the specified directory (or uncomment CIFAR-100 sections)
2. Run all cells in order

3. Training will run on GPU (recommended)
4. ImageNet requires significant storage (~150GB) and compute resources

## Step 0: Check GPU and Install Dependencies

In [None]:
# Check GPU availability
!nvidia-smi

# Install required packages
!pip install -q torch torchvision
!pip install -q kaggle
!pip install -q pandas matplotlib

import torch
print(f"\nPyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Step 1: Setup ImageNet Dataset Path

Configure the path to your ImageNet dataset. The dataset should be organized in train/val folders.

In [None]:
# Configure Kaggle API credentials from an uploaded kaggle.json token file
import os
import json

# Option 1: Upload kaggle.json file (Recommended)
# Use the Colab upload widget to select your kaggle.json file
from google.colab import files

print("Please upload your kaggle.json file...")
uploaded = files.upload()

# Location where the token is stored on the Colab VM
kaggle_token_path = '/root/.kaggle/kaggle.json'

if 'kaggle.json' not in uploaded:
    print("‚ö†Ô∏è No kaggle.json file uploaded. Please upload it to continue.")
else:
    # Make sure the .kaggle directory exists, then persist the uploaded bytes as text
    os.makedirs(os.path.dirname(kaggle_token_path), exist_ok=True)
    with open(kaggle_token_path, 'w') as token_file:
        token_file.write(uploaded['kaggle.json'].decode('utf-8'))

    # Restrict the token file to owner read/write
    os.chmod(kaggle_token_path, 0o600)

    # Read the saved token back and export both fields as environment variables
    with open(kaggle_token_path, 'r') as token_file:
        kaggle_creds = json.load(token_file)
    os.environ['KAGGLE_USERNAME'] = kaggle_creds['username']
    os.environ['KAGGLE_KEY'] = kaggle_creds['key']

    print("‚úì Kaggle credentials configured successfully!")

# Option 2: Manual entry (Commented out - Uncomment if you prefer manual entry)
# from google.colab import userdata
# os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
# os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')
# print("✓ Kaggle credentials configured from Colab Secrets")

# Download ImageNet dataset from Kaggle
print("\nDownloading ImageNet dataset from Kaggle...")
print("Dataset: aryankaushik005/imagenet")
print("This may take some time depending on dataset size...")

!kaggle datasets download -d aryankaushik005/imagenet
!mkdir -p /content/imagenet
!unzip -q imagenet.zip -d /content/imagenet

print("‚úì Dataset downloaded and extracted")

# Set ImageNet path - check multiple possible structures
def find_imagenet_root(candidates, default):
    """Pick the directory that contains the ImageNet ``train``/``val`` folders.

    Args:
        candidates: Directory paths to probe, in priority order.
        default: Value returned when no candidate matches.

    Returns:
        The first candidate containing both a ``train`` and a ``val``
        sub-directory; failing that, the first candidate containing only a
        ``train`` sub-directory; failing that, ``default``.
    """
    def has_split(root, split):
        return os.path.isdir(os.path.join(root, split))

    # First pass: a complete dataset (train + val) always wins.
    for root in candidates:
        if has_split(root, 'train') and has_split(root, 'val'):
            return root
    # Second pass: accept a train-only layout rather than giving up.
    for root in candidates:
        if has_split(root, 'train'):
            return root
    return default


# Candidate *root* directories, i.e. directories expected to contain train/ and val/.
# The first four entries are the locations that were probed originally;
# '/content/imagenet/imagenet' covers archives that unpack into a nested
# imagenet/ folder, which the entries ending in '/train' cannot match because
# the check looks for <candidate>/train.
possible_paths = [
    '/content/imagenet/train',
    '/content/imagenet/ILSVRC/Data/CLS-LOC',
    '/content/imagenet/imagenet/train',
    '/content/imagenet',
    '/content/imagenet/imagenet',
]

# Find the correct path (falls back to the extraction directory itself)
imagenet_path = find_imagenet_root(possible_paths, '/content/imagenet')

print(f"\nImageNet dataset path: {imagenet_path}")
print(f"Train path: {os.path.join(imagenet_path, 'train')}")
print(f"Val path: {os.path.join(imagenet_path, 'val')}")

if os.path.exists(os.path.join(imagenet_path, 'train')):
    print("‚úì ImageNet path verified")
else:
    print("‚ö†Ô∏è Warning: ImageNet structure not found. Checking directory contents...")
    print("\nDirectory structure:")
    # Show what was actually extracted so the right root can be chosen by hand.
    if os.path.isdir('/content/imagenet'):
        for entry in sorted(os.listdir('/content/imagenet')):
            print(entry)
    else:
        print("/content/imagenet does not exist")

# ===== Alternative Options (Commented Out) =====

# Option 2: Use ImageNet-Mini (1000 classes, ~25GB)
# !kaggle datasets download -d ifigotin/imagenetmini-1000
# !mkdir -p /content/imagenet
# !unzip -q imagenetmini-1000.zip -d /content/imagenet
# imagenet_path = '/content/imagenet/imagenet-mini'

# Option 3: Use Tiny-ImageNet (200 classes, 64x64 images, ~500MB)
# !wget http://cs231n.stanford.edu/tiny-imagenet-200.zip
# !unzip -q tiny-imagenet-200.zip -d /content/
# imagenet_path = '/content/tiny-imagenet-200'

# Option 4: Use CIFAR-100 (100 classes, 32x32, ~600MB, auto-download)
# See commented sections below for CIFAR-100 configuration

# Option 5: Local Machine Path
# imagenet_path = r'c:\Users\jayas\Documents\PhD\random_smoothing\datasets\imagenet'

# ===== CIFAR-100 Configuration (Commented Out) =====
# print("‚úì CIFAR-100 will be downloaded automatically using torchvision")
# # CIFAR-100 will be downloaded automatically - no path needed

In [None]:
# ===== ImageNet Verification (Active) =====
print("Verifying ImageNet dataset structure...")


def _class_folders(split_dir):
    """Return the names of the immediate sub-directories of ``split_dir``."""
    return [entry for entry in os.listdir(split_dir)
            if os.path.isdir(os.path.join(split_dir, entry))]


train_path = os.path.join(imagenet_path, 'train')
val_path = os.path.join(imagenet_path, 'val')

# Report how many class folders each split contains (or that the split is missing)
if not os.path.exists(train_path):
    print("‚ö†Ô∏è Train directory not found")
else:
    train_dirs = _class_folders(train_path)
    print(f"‚úì Train directory found with {len(train_dirs)} class folders")

if not os.path.exists(val_path):
    print("‚ö†Ô∏è Val directory not found")
else:
    val_dirs = _class_folders(val_path)
    print(f"‚úì Val directory found with {len(val_dirs)} class folders")

print("\nNote: ImageNet should have 1000 class folders in both train and val directories")

# ===== CIFAR-100 Verification (Commented Out) =====
# # No verification needed - dataset downloads automatically
# print("CIFAR-100 will be downloaded automatically in the next cell")

## Step 2: Clone Repository and Setup Environment

In [None]:
# Clone the randomized smoothing repository
!git clone https://github.com/JayaShreeGit/random_smoothing.git /content/random_smoothing

# Navigate to code directory
import os
os.chdir('/content/random_smoothing/code')

print(f"‚úì Repository cloned")
print(f"Current directory: {os.getcwd()}")
print(f"\nCode files:")
!ls -lh *.py

In [None]:
# Create output directory for models
import os

# ===== ImageNet Output Directory (Active) =====
# A path on the Colab VM. The previous value was a Windows-local raw string,
# which on a Linux runtime becomes a single oddly named folder (backslashes
# included) inside the current working directory.
# NOTE(review): the folder is labelled noise_0.25 while Step 5 sets
# noise_sd = 0.0 -- confirm which value is intended.
outdir = '/content/models/imagenet/resnet18/noise_0.25'

# ===== CIFAR-100 Output Directory (Commented Out) =====
# outdir = '/content/models/cifar100/resnet/noise_0.25'

os.makedirs(outdir, exist_ok=True)
# This confirmation had been swallowed by the commented-out line above.
print(f"‚úì Model directory created: {outdir}")

## Step 3: Load ImageNet Dataset

Load the ImageNet dataset using PyTorch's ImageFolder. ImageNet images are 224×224 with 1000 classes.

In [None]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
# from torchvision.datasets import CIFAR100  # Uncomment for CIFAR-100

# ===== ImageNet Dataset Loading (Active) =====
print("Loading ImageNet dataset...")
print("This may take a moment due to the large number of images.")

# Per-channel mean / std passed to transforms.Normalize
imagenet_mean = (0.485, 0.456, 0.406)
imagenet_std = (0.229, 0.224, 0.225)


def _pipeline(*image_ops):
    """Compose the given image ops followed by ToTensor and ImageNet normalization."""
    return transforms.Compose([
        *image_ops,
        transforms.ToTensor(),
        transforms.Normalize(imagenet_mean, imagenet_std),
    ])


# Training pipeline: random crop + flip augmentation
train_transform = _pipeline(
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
)

# Validation pipeline: deterministic resize + center crop
val_transform = _pipeline(
    transforms.Resize(256),
    transforms.CenterCrop(224),
)

# One ImageFolder per split; the class label comes from the sub-folder name
train_dataset, test_dataset = (
    datasets.ImageFolder(root=os.path.join(imagenet_path, split), transform=pipeline)
    for split, pipeline in (('train', train_transform), ('val', val_transform))
)

print(f"\n‚úì ImageNet dataset loaded successfully!")
print(f"Training samples: {len(train_dataset):,}")
print(f"Validation samples: {len(test_dataset):,}")

print(f"Number of classes: {len(train_dataset.classes)}")
# CIFAR-100 variant: print(f"Image size: 32×32×3")

print(f"Image size: 224√ó224√ó3")
# CIFAR-100 variant: print(f"Number of classes: 100")

# ===== CIFAR-100 Dataset Loading (Commented Out) =====
# print("Loading CIFAR-100 dataset...")
# print("This will automatically download the dataset if not already present.")
#
# # CIFAR-100 normalization values
# cifar100_mean = (0.5071, 0.4867, 0.4408)
# cifar100_std = (0.2675, 0.2565, 0.2761)
#
# # Define transforms for training data (with augmentation)
# train_transform = transforms.Compose([
#     transforms.RandomCrop(32, padding=4),
#     transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize(cifar100_mean, cifar100_std)
# ])
#
# # Define transforms for test data (no augmentation)
# test_transform = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize(cifar100_mean, cifar100_std)
# ])
#
# # Download and create datasets
# train_dataset = CIFAR100(root='./data/cifar100', train=True,
#                         download=True, transform=train_transform)
# test_dataset = CIFAR100(root='./data/cifar100', train=False,
#                        download=True, transform=test_transform)
#
# print(f"\n✓ CIFAR-100 dataset loaded successfully!")
# print(f"Training samples: {len(train_dataset):,}")
# print(f"Test samples: {len(test_dataset):,}")

## Step 4: Import Training Libraries

In [None]:
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import SGD, Optimizer
from torch.optim.lr_scheduler import StepLR
from torchvision import models, transforms
import time
import datetime

# Confirm the imports succeeded and show the runtime's PyTorch / CUDA status
cuda_ready = torch.cuda.is_available()
for status_line in (
    f"‚úì Libraries imported",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {cuda_ready}",
):
    print(status_line)

## Step 5: Configure Training Parameters

Configure hyperparameters for ImageNet with randomized smoothing.

In [None]:
# ===== ImageNet Configuration (Active) =====
class Config:
    """Plain attribute bag holding every setting the training cells read.

    ``outdir`` is taken from the notebook-level variable of the same name, so
    the output-directory cell has to run before this class is instantiated.
    """

    def __init__(self):
        settings = dict(
            # Dataset parameters
            dataset='imagenet',
            num_classes=1000,
            image_size=224,
            # Model architecture (using ResNet-18 for ImageNet)
            arch='resnet18',
            # Output directory
            outdir=outdir,
            # Training hyperparameters (standard ImageNet settings)
            batch=256,  # Adjust based on GPU memory (use 128 or 64 if OOM)
            epochs=50,
            lr=0.1,
            lr_step_size=20,
            gamma=0.1,
            momentum=0.9,
            weight_decay=1e-4,
            # Randomized smoothing noise: standard deviation of the Gaussian noise.
            # NOTE(review): 0.0 adds no noise, while the output folder is
            # labelled noise_0.25 -- confirm the intended value.
            noise_sd=0.0,
            # System parameters
            workers=4,  # Adjust based on CPU cores
            print_freq=100,
        )
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)


args = Config()

# Echo the active configuration
for summary_line in (
    "‚úì Training configuration:",
    f"  Dataset: ImageNet (1000 classes)",
    f"  Architecture: {args.arch}",
    f"  Noise œÉ: {args.noise_sd}",
    f"  Batch size: {args.batch}",
    f"  Epochs: {args.epochs}",
    f"  Image size: {args.image_size}√ó{args.image_size}",
    f"  Output directory: {args.outdir}",
):
    print(summary_line)

# ===== CIFAR-100 Configuration (Commented Out) =====
# class Config:
#     def __init__(self):
#         # Dataset parameters
#         self.dataset = 'cifar100'
#         self.num_classes = 100
#         self.image_size = 32
#
#         # Model architecture (using ResNet-18 for CIFAR-100)
#         self.arch = 'resnet18'
#
#         # Output directory
#         self.outdir = outdir
#
#         # Training hyperparameters
#         self.batch = 128  # Smaller batch for CIFAR-100
#         self.epochs = 90
#         self.lr = 0.1
#         self.lr_step_size = 30
#         self.gamma = 0.1
#         self.momentum = 0.9
#         self.weight_decay = 1e-4
#
#         # Randomized smoothing noise
#         self.noise_sd = 0.25  # Standard deviation of Gaussian noise
#
#         # System parameters
#         self.workers = 2  # Fewer workers for smaller dataset
#         self.print_freq = 50
#
# args = Config()
#
# print("✓ Training configuration:")
# print(f"  Dataset: CIFAR-100 (100 classes)")
# print(f"  Architecture: {args.arch}")
# print(f"  Noise σ: {args.noise_sd}")
# print(f"  Batch size: {args.batch}")
# print(f"  Epochs: {args.epochs}")
# print(f"  Output directory: {args.outdir}")

## Step 6: Create Data Loaders

In [None]:
# Wrap the datasets built earlier in batched loaders.
# Both loaders share batch size, worker count and pinned memory; only the
# training loader shuffles.
shared_loader_options = dict(
    batch_size=args.batch,
    num_workers=args.workers,
    pin_memory=True,
)
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_options)

print(f"‚úì Data loaders created")
print(f"Training batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

## Step 7: Initialize Model and Training Components

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ===== ImageNet Model (Active) =====
# Build the architecture named in the config (ResNet-18 by default) with
# randomly initialised weights and an output layer sized from the config.
# `weights=None` is the current torchvision spelling of the deprecated
# `pretrained=False`.
model = getattr(models, args.arch)(weights=None, num_classes=args.num_classes)

model = model.to(device)

# ===== CIFAR-100 Model (Commented Out) =====
# # Initialize ResNet-18 for CIFAR-100
# model = models.resnet18(weights=None)
# # Modify first conv layer for 32x32 images
# model.conv1 = torch.nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
# model.maxpool = torch.nn.Identity()  # Remove maxpool for small images
# # Modify final layer for 100 classes
# model.fc = torch.nn.Linear(model.fc.in_features, args.num_classes)
#
# model = model.to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"‚úì Model initialized: {args.arch}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

print(f"Device: {device}")

# Loss function, optimizer, and scheduler
criterion = CrossEntropyLoss().to(device)

optimizer = SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                weight_decay=args.weight_decay)
# StepLR multiplies the learning rate by `gamma` every `lr_step_size` scheduler steps
scheduler = StepLR(optimizer, step_size=args.lr_step_size, gamma=args.gamma)

print(f"Optimizer: SGD (lr={args.lr}, momentum={args.momentum}, weight_decay={args.weight_decay})")
print(f"Scheduler: StepLR (step={args.lr_step_size}, gamma={args.gamma})")
print(f"\n‚úì Training components initialized")

## Step 8: Define Training and Testing Functions

In [None]:
def accuracy(output, target, topk=(1,)):
    """Top-k accuracy, in percent, for each k in ``topk``.

    Args:
        output: Score tensor with one row per sample and one column per class.
        target: 1-D tensor of true class indices, one per row of ``output``.
        topk: Iterable of k values to evaluate.

    Returns:
        A list with one single-element tensor per entry of ``topk``, holding
        the percentage of samples whose target is among the k highest scores.
    """
    with torch.no_grad():
        largest_k = max(topk)
        n_samples = target.size(0)

        # Class indices of the largest_k highest scores, best first, laid out as (k, batch)
        ranked = output.topk(largest_k, dim=1, largest=True, sorted=True).indices.t()
        hits = ranked.eq(target.view(1, -1).expand_as(ranked))

        # A sample counts for top-k when its target appears in any of the first k rows
        return [
            hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(100.0 / n_samples)
            for k in topk
        ]

def train_epoch(loader, model, criterion, optimizer, epoch, noise_sd):
    """Run one optimisation pass over ``loader`` with Gaussian-noised inputs.

    Relies on the notebook-level ``device`` (where batches are moved) and
    ``args.print_freq`` (progress is printed every that many batches).

    Args:
        loader: Iterable of ``(inputs, targets)`` batches; must support ``len``.
        model: Network to train; switched to train mode.
        criterion: Loss callable applied to ``(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the progress message.
        noise_sd: Standard deviation of the noise added to every batch. The
            noise is added to the tensors exactly as the loader yields them,
            i.e. after any transform the dataset applies.

    Returns:
        ``(mean per-batch loss, accuracy in percent)`` over the epoch.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for step, (images, labels) in enumerate(loader):
        images = images.to(device)
        labels = labels.to(device)

        # Gaussian perturbation used for randomized-smoothing training
        images = images + torch.randn_like(images) * noise_sd

        optimizer.zero_grad()
        logits = model(images)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        n_seen += labels.size(0)
        n_correct += logits.max(1)[1].eq(labels).sum().item()

        if step % args.print_freq == 0:
            print(f'Epoch: {epoch} [{step}/{len(loader)}] '
                  f'Loss: {running_loss/(step+1):.3f} | '
                  f'Acc: {100.*n_correct/n_seen:.2f}% ({n_correct}/{n_seen})')

    return running_loss/len(loader), 100.*n_correct/n_seen

def test_epoch(loader, model, criterion, noise_sd):
    """Evaluate ``model`` on ``loader`` with Gaussian-noised inputs.

    Relies on the notebook-level ``device`` for batch placement. No gradients
    are tracked and the model is put in eval mode.

    Args:
        loader: Iterable of ``(inputs, targets)`` batches; must support ``len``.
        model: Network to evaluate.
        criterion: Loss callable applied to ``(outputs, targets)``.
        noise_sd: Standard deviation of the noise added to every batch.

    Returns:
        ``(mean per-batch loss, accuracy in percent)``.
    """
    model.eval()
    running_loss, n_correct, n_seen = 0, 0, 0

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)

            # Same perturbation as during training
            images = images + torch.randn_like(images) * noise_sd

            logits = model(images)
            running_loss += criterion(logits, labels).item()

            n_seen += labels.size(0)
            n_correct += logits.max(1)[1].eq(labels).sum().item()

    return running_loss/len(loader), 100.*n_correct/n_seen

print("‚úì Training and testing functions defined")

## Step 9: Run Training Loop

Train the model for all epochs with randomized smoothing.

In [None]:
import os

# Create log file (tab-separated, one row per epoch; truncated on every run)
os.makedirs(args.outdir, exist_ok=True)
logfile = os.path.join(args.outdir, 'training_log.txt')

with open(logfile, 'w') as f:
    f.write('epoch\ttime\tlr\ttrain_loss\ttrain_acc\ttest_loss\ttest_acc\n')

print(f"Starting training for {args.epochs} epochs...")
print(f"Noise œÉ = {args.noise_sd}")
print(f"Log file: {logfile}\n")

# Training history (one entry per completed epoch; read by the plotting cell)
history = {'train_loss': [], 'train_acc': [], 'test_loss': [], 'test_acc': []}

for epoch in range(args.epochs):
    start_time = time.time()

    # Learning rate in effect for this epoch. It is read before
    # scheduler.step(), which moves the optimizer on to the next epoch's
    # rate; reading it afterwards would log the wrong value on every epoch
    # where the rate decays.
    epoch_lr = optimizer.param_groups[0]['lr']

    # Train
    train_loss, train_acc = train_epoch(train_loader, model, criterion,
                                       optimizer, epoch, args.noise_sd)

    # Test
    test_loss, test_acc = test_epoch(test_loader, model, criterion, args.noise_sd)

    # Update scheduler
    scheduler.step()

    epoch_time = time.time() - start_time

    # Save history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['test_loss'].append(test_loss)
    history['test_acc'].append(test_acc)

    # Log results
    with open(logfile, 'a') as f:
        f.write(f'{epoch}\t{epoch_time:.1f}\t{epoch_lr:.6f}\t'
                f'{train_loss:.4f}\t{train_acc:.2f}\t{test_loss:.4f}\t{test_acc:.2f}\n')

    print(f'\n=== Epoch {epoch}/{args.epochs} Summary ===')
    print(f'Time: {epoch_time:.1f}s | LR: {epoch_lr:.6f}')
    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%')
    print(f'Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.2f}%\n')

    # Save checkpoint (overwritten every epoch, so only the latest one is kept).
    # 'epoch' is the 0-based index of the epoch that just finished.
    # The scheduler state is stored too so an interrupted run can resume with
    # the correct learning-rate schedule; existing readers ignore the extra key.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_acc': train_acc,
        'test_acc': test_acc,
    }, os.path.join(args.outdir, 'checkpoint.pth'))

print("\n‚úì Training complete!")
print(f"Final Test Accuracy: {history['test_acc'][-1]:.2f}%")
print(f"Model saved to: {args.outdir}/checkpoint.pth")

## Step 10: Visualize Training Results

Plot training and testing curves.

In [None]:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

epochs = range(len(history['train_loss']))

# One panel per metric: (axis, history key suffix, legend word, y-axis label, title)
for panel, metric, legend_word, y_label, panel_title in (
    (ax1, 'loss', 'Loss', 'Loss', 'Training and Test Loss'),
    (ax2, 'acc', 'Accuracy', 'Accuracy (%)', 'Training and Test Accuracy'),
):
    panel.plot(epochs, history[f'train_{metric}'], 'b-', label=f'Train {legend_word}', linewidth=2)
    panel.plot(epochs, history[f'test_{metric}'], 'r-', label=f'Test {legend_word}', linewidth=2)
    panel.set_xlabel('Epoch', fontsize=12)
    panel.set_ylabel(y_label, fontsize=12)
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.legend(fontsize=11)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(args.outdir, 'training_curves.png'), dpi=150, bbox_inches='tight')
print(f"‚úì Training curves saved to {args.outdir}/training_curves.png")
plt.show()

# Print summary statistics (epoch numbers are 0-based positions in `history`)
best_train_acc = max(history['train_acc'])
best_test_acc = max(history['test_acc'])
print(f"\n=== Training Summary ===")
print(f"Best Train Accuracy: {best_train_acc:.2f}% (Epoch {history['train_acc'].index(best_train_acc)})")
print(f"Best Test Accuracy: {best_test_acc:.2f}% (Epoch {history['test_acc'].index(best_test_acc)})")
print(f"Final Train Accuracy: {history['train_acc'][-1]:.2f}%")
print(f"Final Test Accuracy: {history['test_acc'][-1]:.2f}%")

## Next Steps

Your ImageNet model with randomized smoothing is now trained!

**What you can do next:**

1. **Download the trained model:**
   ```python
   from google.colab import files
   files.download(os.path.join(args.outdir, 'checkpoint.pth'))
   ```

2. **Test on individual images:**
   - Load your model and test on custom ImageNet images
   - Visualize certified regions around test samples

3. **Experiment with different noise levels:**
   - Change `args.noise_sd` to 0.0, 0.12, 0.50, or 1.0
   - Compare robustness vs accuracy trade-offs

4. **Try different architectures:**
   - Replace ResNet-18 with ResNet-34, ResNet-50, or other architectures
   - Adjust for different robustness requirements

**Key Results:**
- Model trained on ImageNet (1000 classes)
- Randomized smoothing noise σ = `args.noise_sd` (set in Step 5)
- Provides provable adversarial robustness guarantees

## Step 11: Load Trained Model for Evaluation

Load the trained model checkpoint to evaluate its performance.

In [None]:
# Load the trained model checkpoint written by the training loop
checkpoint_path = os.path.join(args.outdir, 'checkpoint.pth')

if os.path.exists(checkpoint_path):
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"‚úì Model loaded from: {checkpoint_path}")
    # The stored 'epoch' is the 0-based index of the last finished epoch,
    # so the number of epochs trained is one more than that.
    print(f"  Trained for {checkpoint['epoch'] + 1} epochs")
    print(f"  Final train accuracy: {checkpoint['train_acc']:.2f}%")
    print(f"  Final test accuracy: {checkpoint['test_acc']:.2f}%")
else:
    print("‚ö†Ô∏è No checkpoint found. Please train the model first.")

## Step 12: Comprehensive Model Evaluation

Evaluate the model with detailed metrics including per-class accuracy, confusion matrix, and robustness analysis.

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_recall_fscore_support
import time

def evaluate_model_detailed(model, loader, noise_sd, device):
    """
    Comprehensive evaluation with detailed metrics.

    Every batch is perturbed with Gaussian noise of standard deviation
    ``noise_sd`` before the forward pass, and the time for noise generation
    plus the forward pass is recorded per batch.

    Args:
        model: Network to evaluate; switched to eval mode.
        loader: Iterable of ``(inputs, targets)`` batches; must support ``len``.
        noise_sd: Standard deviation of the noise added to the inputs.
        device: Device (``torch.device`` or string) the batches are moved to.

    Returns:
        Dictionary containing various performance metrics: overall accuracy
        (percent), correct/total counts, mean per-batch inference time (ms),
        the raw predictions / targets / softmax probabilities as NumPy
        arrays, per-class precision / recall / F1 / support, and their macro
        averages. The per-class arrays have one entry per model output, so
        position ``i`` always refers to class ``i``.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()

    # CUDA kernels run asynchronously; without an explicit synchronize the
    # wall-clock interval would mostly measure kernel launch time.
    target_device = torch.device(device)
    sync_cuda = target_device.type == 'cuda'

    prediction_batches = []
    target_batches = []
    prob_batches = []
    correct = 0
    total = 0
    inference_times = []
    num_outputs = None

    print("Evaluating model...")
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(loader):
            inputs, targets = inputs.to(device), targets.to(device)

            # Measure inference time (noise generation + forward pass)
            if sync_cuda:
                torch.cuda.synchronize(target_device)
            start_time = time.perf_counter()

            # Add Gaussian noise for randomized smoothing
            inputs = inputs + torch.randn_like(inputs) * noise_sd

            # Forward pass
            outputs = model(inputs)
            if sync_cuda:
                torch.cuda.synchronize(target_device)
            inference_times.append(time.perf_counter() - start_time)

            # Get predictions
            probs = torch.softmax(outputs, dim=1)
            _, predicted = outputs.max(1)
            num_outputs = outputs.shape[1]

            # Store results one array per batch; they are concatenated once at the end
            prediction_batches.append(predicted.cpu().numpy())
            target_batches.append(targets.cpu().numpy())
            prob_batches.append(probs.cpu().numpy())

            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            if batch_idx % 20 == 0:
                print(f"  Batch {batch_idx}/{len(loader)} processed...")

    if total == 0:
        raise ValueError("evaluate_model_detailed received an empty loader")

    # Calculate metrics
    all_predictions = np.concatenate(prediction_batches)
    all_targets = np.concatenate(target_batches)
    all_probs = np.concatenate(prob_batches)

    accuracy = 100. * correct / total
    avg_inference_time = np.mean(inference_times) * 1000  # Convert to ms

    # Per-class metrics. `labels` pins the result to one entry per model
    # output; otherwise scikit-learn only returns entries for labels that
    # occur in the data and positions stop matching class indices.
    precision, recall, f1, support = precision_recall_fscore_support(
        all_targets, all_predictions, labels=np.arange(num_outputs),
        average=None, zero_division=0
    )

    # Overall metrics (macro average over the labels that occur in the data)
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        all_targets, all_predictions, average='macro', zero_division=0
    )

    results = {
        'accuracy': accuracy,
        'correct': correct,
        'total': total,
        'avg_inference_time_ms': avg_inference_time,
        'predictions': all_predictions,
        'targets': all_targets,
        'probabilities': all_probs,
        'per_class_precision': precision,
        'per_class_recall': recall,
        'per_class_f1': f1,
        'per_class_support': support,
        'macro_precision': macro_precision,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1
    }

    return results

# Run the detailed evaluation on the validation loader and print a report
banner = '=' * 60

print("\n" + banner)
print("COMPREHENSIVE MODEL EVALUATION")
print(banner + "\n")

eval_results = evaluate_model_detailed(model, test_loader, args.noise_sd, device)

print("\n" + banner)
print("EVALUATION RESULTS")
print(banner)
print(f"\nüìä Overall Performance:")
print(f"  Test Accuracy: {eval_results['accuracy']:.2f}%")
print(f"  Correct Predictions: {eval_results['correct']}/{eval_results['total']}")
for metric_label, metric_key in (
    ('Macro Precision', 'macro_precision'),
    ('Macro Recall', 'macro_recall'),
    ('Macro F1-Score', 'macro_f1'),
):
    print(f"  {metric_label}: {eval_results[metric_key]:.4f}")
print(f"\n‚ö° Performance:")
print(f"  Average Inference Time: {eval_results['avg_inference_time_ms']:.2f} ms per batch")
# Throughput estimate: configured batch size divided by the mean per-batch time in seconds
print(f"  Throughput: {args.batch / (eval_results['avg_inference_time_ms']/1000):.1f} images/sec")
print(f"\nüîí Robustness:")
print(f"  Gaussian Noise œÉ: {args.noise_sd}")
print(f"  Certified Robustness: Model trained with randomized smoothing")
print(banner + "\n")

## Step 13: Visualize Confusion Matrix

Generate and visualize the confusion matrix to understand classification patterns.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate confusion matrix.
# `labels` pins the matrix to num_classes x num_classes so that row / column
# index equals class index even when some classes never occur in the
# evaluated data (without it the matrix only covers labels that appear).
num_classes = args.num_classes
cm = confusion_matrix(eval_results['targets'], eval_results['predictions'],
                      labels=list(range(num_classes)))

# Create figure for confusion matrix
fig, ax = plt.subplots(figsize=(12, 10))

# Plot the full matrix as a heatmap; tick labels are hidden because there are
# far too many classes to label individually.
sns.heatmap(cm, cmap='Blues', cbar=True, square=True, ax=ax,
            xticklabels=False, yticklabels=False)

ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
ax.set_title(f'Confusion Matrix - ImageNet (1000 classes)\nTest Accuracy: {eval_results["accuracy"]:.2f}%',
             fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(os.path.join(args.outdir, 'confusion_matrix.png'), dpi=150, bbox_inches='tight')
print(f"‚úì Confusion matrix saved to {args.outdir}/confusion_matrix.png")
plt.show()

# Print top 10 most confused class pairs
print("\nüìã Top 10 Most Confused Class Pairs:")
print(f"{'True Class':<12} {'Pred Class':<12} {'Count':<8} {'Error Rate'}")
print("-" * 60)

# Collect every non-zero off-diagonal cell directly from the matrix instead of
# looping over all class pairs in Python. `nonzero` walks the matrix row by
# row, so pairs with equal counts keep the same relative order as a nested
# (true, predicted) loop would give them.
true_idx, pred_idx = cm.nonzero()
off_diagonal = true_idx != pred_idx
row_totals = cm.sum(axis=1)

# Each entry: (true class, predicted class, count, share of the true class's samples in %)
confused_pairs = [
    (int(i), int(j), int(cm[i, j]), cm[i, j] / row_totals[i] * 100)
    for i, j in zip(true_idx[off_diagonal], pred_idx[off_diagonal])
]

confused_pairs.sort(key=lambda x: x[2], reverse=True)
for true_class, pred_class, count, error_rate in confused_pairs[:10]:
    print(f"{true_class:<12} {pred_class:<12} {count:<8} {error_rate:.2f}%")

## Step 14: Per-Class Performance Analysis

Analyze performance for each class to identify strengths and weaknesses.

In [None]:
# Calculate per-class accuracy (percent of each class's samples predicted correctly).
# The class count comes from the config instead of a hard-coded 1000, and the
# counts are gathered with two bincount calls rather than one masking pass
# per class.
num_classes = args.num_classes
eval_targets = np.asarray(eval_results['targets'])
eval_predictions = np.asarray(eval_results['predictions'])

# Samples per true class, and correctly predicted samples per true class
class_totals = np.bincount(eval_targets, minlength=num_classes)[:num_classes]
class_hits = np.bincount(eval_targets[eval_predictions == eval_targets],
                         minlength=num_classes)[:num_classes]

# Classes without any sample keep an accuracy of 0
per_class_accuracy = np.zeros(num_classes, dtype=float)
has_samples = class_totals > 0
per_class_accuracy[has_samples] = class_hits[has_samples] / class_totals[has_samples] * 100

# Print summary statistics of the per-class accuracy distribution
section_rule = '=' * 60
print("\n" + section_rule)
print("PER-CLASS PERFORMANCE STATISTICS")
print(section_rule + "\n")
print(f"Average per-class accuracy: {per_class_accuracy.mean():.2f}%")
print(f"Std deviation: {per_class_accuracy.std():.2f}%")
print(f"Min accuracy: {per_class_accuracy.min():.2f}% (Class {per_class_accuracy.argmin()})")
print(f"Max accuracy: {per_class_accuracy.max():.2f}% (Class {per_class_accuracy.argmax()})")
print(f"Median accuracy: {np.median(per_class_accuracy):.2f}%")


def _print_class_table(heading, class_ids):
    """Print ``heading`` followed by one accuracy/precision/recall/F1 row per class id."""
    print(heading)
    print(f"{'Class':<8} {'Accuracy':<12} {'Precision':<12} {'Recall':<12} {'F1-Score'}")
    print("-" * 60)
    for cls in class_ids:
        print(f"{cls:<8} {per_class_accuracy[cls]:<12.2f} "
              f"{eval_results['per_class_precision'][cls]:<12.4f} "
              f"{eval_results['per_class_recall'][cls]:<12.4f} "
              f"{eval_results['per_class_f1'][cls]:.4f}")


# Class indices ordered from lowest to highest accuracy
accuracy_order = np.argsort(per_class_accuracy)

# Best and worst performing classes
best_classes = accuracy_order[-10:][::-1]
_print_class_table(f"\nüèÜ Top 10 Best Performing Classes:", best_classes)

worst_classes = accuracy_order[:10]
_print_class_table(f"\n‚ö†Ô∏è Top 10 Worst Performing Classes:", worst_classes)

# Visualize per-class accuracy distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

mean_accuracy = per_class_accuracy.mean()
mean_label = f'Mean: {mean_accuracy:.2f}%'

# Histogram of per-class accuracies with the mean marked
ax1.hist(per_class_accuracy, bins=20, color='steelblue', edgecolor='black', alpha=0.7)
ax1.axvline(mean_accuracy, color='red', linestyle='--', linewidth=2, label=mean_label)
ax1.set_xlabel('Accuracy (%)', fontsize=12)
ax1.set_ylabel('Number of Classes', fontsize=12)
ax1.set_title('Distribution of Per-Class Accuracy', fontsize=14, fontweight='bold')
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)

# Sorted accuracy plot. The x positions are derived from the data length
# rather than a fixed 1000, so the plot works for any number of classes.
sorted_accuracy = np.sort(per_class_accuracy)
class_ranks = range(len(sorted_accuracy))
ax2.plot(class_ranks, sorted_accuracy, linewidth=2, color='steelblue')
ax2.axhline(mean_accuracy, color='red', linestyle='--', linewidth=2, label=mean_label)
ax2.fill_between(class_ranks, sorted_accuracy, alpha=0.3, color='steelblue')
ax2.set_xlabel('Class Rank', fontsize=12)
ax2.set_ylabel('Accuracy (%)', fontsize=12)
ax2.set_title('Sorted Per-Class Accuracy', fontsize=14, fontweight='bold')
ax2.legend(fontsize=11)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(args.outdir, 'per_class_accuracy.png'), dpi=150, bbox_inches='tight')
print(f"\n‚úì Per-class accuracy visualization saved to {args.outdir}/per_class_accuracy.png")
plt.show()

## Step 15: Robustness Analysis - Compare Different Noise Levels

Test the model's performance under different noise conditions to verify robustness.

In [None]:
def test_robustness_at_noise_level(model, loader, noise_sd, device):
    """Test model accuracy at a specific noise level."""
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Add noise
            if noise_sd > 0:
                inputs = inputs + torch.randn_like(inputs) * noise_sd

            outputs = model(inputs)
            _, predicted = outputs.max(1)

            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    accuracy = 100. * correct / total
    return accuracy

# Test at different noise levels
print(f"\n{'='*60}")
print("ROBUSTNESS ANALYSIS - VARYING NOISE LEVELS")
print(f"{'='*60}\n")
print(f"Model was trained with σ = {args.noise_sd}\n")

# 0.0 must stay first: accuracies[0] is used as the clean (no-noise) baseline.
noise_levels = [0.0, 0.12, 0.25, 0.50, 0.75, 1.0]
# Always evaluate at the training noise level too. Otherwise the summary
# below (and any later lookup via noise_levels.index(args.noise_sd)) raises
# ValueError whenever the model was trained with a sigma outside this list.
if args.noise_sd not in noise_levels:
    noise_levels = [noise_levels[0]] + sorted(noise_levels[1:] + [args.noise_sd])
train_noise_idx = noise_levels.index(args.noise_sd)

accuracies = []

print(f"{'Noise σ':<12} {'Accuracy':<12} {'Degradation'}")
print("-" * 50)

for noise_sd in noise_levels:
    acc = test_robustness_at_noise_level(model, test_loader, noise_sd, device)
    accuracies.append(acc)

    # Drop relative to the no-noise baseline (zero for the baseline itself)
    degradation = accuracies[0] - acc

    marker = "📍" if noise_sd == args.noise_sd else ""
    print(f"{noise_sd:<12.2f} {acc:<12.2f} {degradation:+.2f}%  {marker}")

train_noise_acc = accuracies[train_noise_idx]

# Visualize robustness curve
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(noise_levels, accuracies, marker='o', linewidth=2, markersize=8,
        color='steelblue', label='Model Performance')
ax.axvline(args.noise_sd, color='red', linestyle='--', linewidth=2,
           label=f'Training Noise σ={args.noise_sd}')

# Highlight training noise point
ax.scatter([args.noise_sd], [train_noise_acc],
           color='red', s=200, zorder=5, alpha=0.7)

ax.set_xlabel('Gaussian Noise Standard Deviation (σ)', fontsize=12)
ax.set_ylabel('Test Accuracy (%)', fontsize=12)
ax.set_title('Robustness Analysis: Performance vs Noise Level', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_ylim([0, max(accuracies) + 5])

plt.tight_layout()
plt.savefig(os.path.join(args.outdir, 'robustness_analysis.png'), dpi=150, bbox_inches='tight')
print(f"\n✓ Robustness analysis saved to {args.outdir}/robustness_analysis.png")
plt.show()

print(f"\n{'='*60}")
print("KEY INSIGHTS")
print(f"{'='*60}")
print(f"✓ Clean accuracy (σ=0): {accuracies[0]:.2f}%")
print(f"✓ Accuracy at training noise (σ={args.noise_sd}): {train_noise_acc:.2f}%")
print(f"✓ Accuracy degradation at σ=0.50: {accuracies[0] - accuracies[noise_levels.index(0.50)]:.2f}%")
# 40% is the notebook's heuristic threshold for calling the result "good".
print(f"✓ Model shows {'good' if train_noise_acc > 40 else 'limited'} robustness to noise")
print(f"{'='*60}\n")

## Step 16: Model Prediction Examples

Visualize some test samples with predictions and confidence scores.

In [None]:
# ===== ImageNet Class Names (Active) =====
# Lookup table from class index to class name, taken from the training dataset.
# NOTE(review): assumes the test loader uses the same index -> class ordering — confirm.
imagenet_classes = train_dataset.classes  # Presumably class folder names (e.g., 'n01440764'), not readable labels

# For human-readable names, you can create a mapping or use a pre-made dictionary.
# Here the raw entries of `imagenet_classes` are shown as-is in the visualization.
def get_class_name(class_idx):
    """Return the display name for a class index.

    Args:
        class_idx: Integer class index (e.g. a label or an arg-max prediction).

    Returns:
        str: ``imagenet_classes[class_idx]`` when the index is within the
        table, otherwise the placeholder ``"Class_<class_idx>"``.
    """
    # Require a non-negative index: a negative value would otherwise wrap
    # around and silently return a name from the end of the list.
    if 0 <= class_idx < len(imagenet_classes):
        return imagenet_classes[class_idx]
    return f"Class_{class_idx}"

# ===== CIFAR-100 Class Names (Commented Out) =====
# cifar100_fine_labels = [
#     'apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle',
#     'bicycle', 'bottle', 'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel',
#     'can', 'castle', 'caterpillar', 'cattle', 'chair', 'chimpanzee', 'clock',
#     'cloud', 'cockroach', 'couch', 'crab', 'crocodile', 'cup', 'dinosaur',
#     'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster',
#     'house', 'kangaroo', 'keyboard', 'lamp', 'lawn_mower', 'leopard', 'lion',
#     'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse',
#     'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear',
#     'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine',
#     'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose',
#     'sea', 'seal', 'shark', 'shrew', 'skunk', 'skyscraper', 'snail', 'snake',
#     'spider', 'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table',
#     'tank', 'telephone', 'television', 'tiger', 'tractor', 'train', 'trout',
#     'tulip', 'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'woman', 'worm'
# ]
#
# def get_class_name(class_idx):
#     """Get readable class name for CIFAR-100 class index."""
#     if class_idx < len(cifar100_fine_labels):
#         return cifar100_fine_labels[class_idx]
#     return f"Class_{class_idx}"

# Get some test samples
model.eval()
max_samples = 12  # the figure below is a fixed 3x4 grid

# Get a batch from test set
dataiter = iter(test_loader)
images, labels = next(dataiter)
# The first batch may hold fewer than `max_samples` images (small batch size
# or tiny dataset); clamp so the plotting loop and the statistics below never
# index past the batch or divide by the wrong count.
num_samples = min(max_samples, images.size(0))
images = images[:num_samples].to(device)
labels = labels[:num_samples]  # kept on CPU; compared against predictions.cpu() below

# Make predictions
with torch.no_grad():
    # Perturb with Gaussian noise at the training noise level before classifying
    noisy_images = images + torch.randn_like(images) * args.noise_sd
    outputs = model(noisy_images)
    probs = torch.softmax(outputs, dim=1)
    confidences, predictions = probs.max(1)

# ===== ImageNet Denormalization (Active) =====
# Denormalize the clean images for visualization.
# NOTE(review): assumes the test transform normalized with these per-channel
# ImageNet mean/std values — confirm against the dataset transforms.
mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1).to(device)
std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1).to(device)
images_denorm = images * std + mean
images_denorm = torch.clamp(images_denorm, 0, 1)

# ===== CIFAR-100 Denormalization (Commented Out) =====
# # Denormalize images for visualization
# mean = torch.tensor([0.5071, 0.4867, 0.4408]).view(3, 1, 1).to(device)
# std = torch.tensor([0.2675, 0.2565, 0.2761]).view(3, 1, 1).to(device)
# images_denorm = images * std + mean
# images_denorm = torch.clamp(images_denorm, 0, 1)

# Visualize predictions
fig, axes = plt.subplots(3, 4, figsize=(16, 12))
axes = axes.flatten()

for idx in range(num_samples):
    ax = axes[idx]

    # Convert to numpy and move channels last (C, H, W -> H, W, C) for imshow
    img = images_denorm[idx].cpu().permute(1, 2, 0).numpy()

    true_label = labels[idx].item()
    pred_label = predictions[idx].item()
    confidence = confidences[idx].item()

    # Plot image
    ax.imshow(img)
    ax.axis('off')

    # Set title with color based on correctness
    is_correct = true_label == pred_label
    color = 'green' if is_correct else 'red'
    marker = '✓' if is_correct else '✗'

    title = f"{marker} True: {get_class_name(true_label)}\n"
    title += f"Pred: {get_class_name(pred_label)}\n"
    title += f"Conf: {confidence:.2%}"

    ax.set_title(title, fontsize=10, color=color, fontweight='bold')

# Hide grid cells that received no sample so the figure has no empty frames
for unused_ax in axes[num_samples:]:
    unused_ax.axis('off')

plt.suptitle('Sample Predictions with Randomized Smoothing', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(os.path.join(args.outdir, 'sample_predictions.png'), dpi=150, bbox_inches='tight')
print(f"✓ Sample predictions saved to {args.outdir}/sample_predictions.png")
plt.show()

# Print summary
correct_in_sample = (predictions.cpu() == labels).sum().item()
print(f"\n📊 Sample Batch Statistics:")
print(f"  Correct: {correct_in_sample}/{num_samples} ({100*correct_in_sample/num_samples:.1f}%)")
print(f"  Average confidence: {confidences.mean().item():.2%}")
print(f"  Min confidence: {confidences.min().item():.2%}")
print(f"  Max confidence: {confidences.max().item():.2%}")

## Step 17: Generate Comprehensive Evaluation Report

Create a summary report of all evaluation metrics.

In [None]:
# Generate comprehensive evaluation report
report_path = os.path.join(args.outdir, 'evaluation_report.txt')

# The report contains non-ASCII characters (σ, ×); write it as UTF-8 explicitly
# so the result does not depend on the platform's default encoding.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    # Title matches the dataset actually evaluated (see "Dataset:" line below).
    f.write("IMAGENET RANDOMIZED SMOOTHING CLASSIFIER - EVALUATION REPORT\n")
    f.write("="*80 + "\n\n")

    f.write("MODEL CONFIGURATION\n")
    f.write("-"*80 + "\n")
    f.write("Architecture: ResNet-18 (standard ImageNet)\n")
    f.write("Dataset: ImageNet (1000 classes, 224×224 images)\n")
    f.write(f"Training epochs: {checkpoint['epoch']}\n")
    f.write(f"Batch size: {args.batch}\n")
    f.write(f"Randomized smoothing noise (σ): {args.noise_sd}\n\n")

    f.write("OVERALL PERFORMANCE METRICS\n")
    f.write("-"*80 + "\n")
    f.write(f"Test Accuracy: {eval_results['accuracy']:.2f}%\n")
    f.write(f"Correct Predictions: {eval_results['correct']}/{eval_results['total']}\n")
    f.write(f"Macro Precision: {eval_results['macro_precision']:.4f}\n")
    f.write(f"Macro Recall: {eval_results['macro_recall']:.4f}\n")
    f.write(f"Macro F1-Score: {eval_results['macro_f1']:.4f}\n\n")

    f.write("PERFORMANCE CHARACTERISTICS\n")
    f.write("-"*80 + "\n")
    f.write(f"Average Inference Time: {eval_results['avg_inference_time_ms']:.2f} ms per batch\n")
    # images/sec = batch size / seconds per batch
    f.write(f"Throughput: {args.batch / (eval_results['avg_inference_time_ms']/1000):.1f} images/sec\n")
    f.write(f"Per-class accuracy (mean): {per_class_accuracy.mean():.2f}%\n")
    f.write(f"Per-class accuracy (std): {per_class_accuracy.std():.2f}%\n")
    f.write(f"Best class accuracy: {per_class_accuracy.max():.2f}% (Class {per_class_accuracy.argmax()})\n")
    f.write(f"Worst class accuracy: {per_class_accuracy.min():.2f}% (Class {per_class_accuracy.argmin()})\n\n")

    f.write("ROBUSTNESS ANALYSIS\n")
    f.write("-"*80 + "\n")
    f.write(f"Model trained with Gaussian noise σ = {args.noise_sd}\n")
    f.write("Performance at different noise levels:\n")
    for noise_sd, acc in zip(noise_levels, accuracies):
        marker = " (training level)" if noise_sd == args.noise_sd else ""
        f.write(f"  σ = {noise_sd:.2f}: {acc:.2f}%{marker}\n")
    f.write(f"\nClean accuracy (σ=0): {accuracies[0]:.2f}%\n")
    f.write(f"Accuracy degradation at σ=0.50: {accuracies[0] - accuracies[noise_levels.index(0.50)]:.2f}%\n\n")

    f.write("TOP 5 BEST PERFORMING CLASSES\n")
    f.write("-"*80 + "\n")
    for i, cls in enumerate(best_classes[:5], 1):
        f.write(f"{i}. Class {cls} ({get_class_name(cls)}): {per_class_accuracy[cls]:.2f}%\n")

    f.write("\nTOP 5 WORST PERFORMING CLASSES\n")
    f.write("-"*80 + "\n")
    for i, cls in enumerate(worst_classes[:5], 1):
        f.write(f"{i}. Class {cls} ({get_class_name(cls)}): {per_class_accuracy[cls]:.2f}%\n")

    f.write("\n" + "="*80 + "\n")
    f.write("CERTIFICATION CAPABILITY\n")
    f.write("="*80 + "\n")
    f.write("This model has been trained with randomized smoothing, which provides\n")
    f.write("PROVABLE adversarial robustness guarantees. The model can certify predictions\n")
    f.write("within a certified radius around input samples.\n\n")
    f.write("For certification, use the certify.py script with this trained model.\n")
    f.write("="*80 + "\n")

print(f"\n{'='*80}")
print("EVALUATION COMPLETE")
print(f"{'='*80}\n")
print(f"✓ Comprehensive evaluation report saved to: {report_path}")
print(f"✓ Confusion matrix saved to: {os.path.join(args.outdir, 'confusion_matrix.png')}")
print(f"✓ Per-class accuracy analysis saved to: {os.path.join(args.outdir, 'per_class_accuracy.png')}")
print(f"✓ Robustness analysis saved to: {os.path.join(args.outdir, 'robustness_analysis.png')}")
print(f"✓ Sample predictions saved to: {os.path.join(args.outdir, 'sample_predictions.png')}")
print(f"\n📊 Summary:")
print(f"   Test Accuracy: {eval_results['accuracy']:.2f}%")
print(f"   Macro F1-Score: {eval_results['macro_f1']:.4f}")
# list.index raises ValueError for a missing value, so only look up the
# training noise level when it was actually part of the evaluated sweep.
if args.noise_sd in noise_levels:
    train_noise_acc = accuracies[noise_levels.index(args.noise_sd)]
    print(f"   Robustness: Model maintains {train_noise_acc:.2f}% accuracy at training noise level")
else:
    print(f"   Robustness: training noise level σ={args.noise_sd} was not among the evaluated noise levels")
print(f"\n{'='*80}\n")