In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

"""Import dependencies"""
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet18
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [3]:
"""Define Dataset Class for Vision Transformer with Debugging"""
class MyDatasetViT(Dataset):
    def __init__(self, data_dir, transform=None):
        self.data = []
        self.labels = []
        self.class_names = ['axion', 'cdm', 'no_sub']
        self.transform = transform

        print(f"Loading dataset from: {data_dir}")
        print(f"Looking for classes: {self.class_names}")

        for idx, class_name in enumerate(self.class_names):
            class_dir = os.path.join(data_dir, class_name)
            print(f"--- Processing class: {class_name} ---")

            if not os.path.exists(class_dir):
                print(f"[ERROR] Directory not found: {class_dir}")
                continue

            files = os.listdir(class_dir)

            for file_name in files:
                if file_name.endswith('.npy'):
                    file_path = os.path.join(class_dir, file_name)
                    loaded_data = np.load(file_path, allow_pickle=True)

                    if class_name == 'axion':
                        image = loaded_data[0]
                    else:
                        image = loaded_data

                    # [DEBUG] Print the shape of the raw numpy array
                    print(f"  [DEBUG] Loaded '{file_name}'. Raw numpy shape: {image.shape}")

                    # Ensure the image is a 2D array (H, W) before adding channel dimension.
                    if image.ndim != 2:
                        image = np.squeeze(image)

                    # Convert to a float tensor and add a channel dimension -> [1, H, W]
                    image_tensor = torch.tensor(image, dtype=torch.float32).unsqueeze(0)

                    # [DEBUG] Print the shape of the final tensor being stored in the dataset
                    print(f"  [DEBUG] Storing tensor with final shape: {image_tensor.shape}\n")

                    self.data.append(image_tensor)
                    self.labels.append(idx)

        print("\n--- Dataset Loading Complete ---")
        print(f"Total images loaded: {len(self.data)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        This method is called by the DataLoader to get one item from the dataset.
        The debug prints here are CRITICAL for finding the error.
        """
        #print(f"--- Getting item index: {idx} ---")

        # Retrieve the pre-loaded tensor and its label
        image = self.data[idx]
        label = self.labels[idx]

        # [DEBUG] Print shape BEFORE the transform is applied
        #print(f"  [DEBUG] Shape of tensor BEFORE transform: {image.shape}")

        # Apply transformations (e.g., resizing) if they are provided
        if self.transform:
            image = self.transform(image)
            # [DEBUG] Print shape AFTER the transform is applied
            #print(f"  [DEBUG] Shape of tensor AFTER transform: {image.shape}")
        else:
            #print("  [DEBUG] No transform was applied.")
            pass

        return image, label

In [4]:

# Import the transforms module
from torchvision import transforms
# Hyperparameters
batch_size = 32
learning_rate = 0.001
num_epochs = 100

# Data Directories
train_dir = '/content/drive/MyDrive/Model_V/Model_V'
#val_dir = '../dataset/dataset/val'

print(f"Training Directory: {train_dir}")
#print(f"Validation Directory: {val_dir}")

vit_transforms = transforms.Compose([
    # transforms.ToTensor(), # Removed ToTensor()
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(degrees=90), # Can be any angle range
])

# Create Datasets and Dataloaders
#dataset = MyDatasetViT(train_dir, vit_transforms)
#train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [0.75, 0.15, 0.1])

#train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
#val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=4, pin_memory=True)

#print(f"Batch Size: {batch_size}")
#print(f"Number of Training Batches: {len(train_loader)}")
#print(f"Number of Validation Batches: {len(val_loader)}")

#Save the dataloader so that we don't have to bear with this pain again
#torch.save(train_loader, '/content/drive/MyDrive/Model_III_dataset/train_loader.pth')
#torch.save(val_loader, '/content/drive/MyDrive/Model_III_dataset/val_loader.pth')

Training Directory: /content/drive/MyDrive/Model_V/Model_V


In [5]:
#import data loaders from file
train_loader = torch.load('/content/drive/MyDrive/train_loader.pth', weights_only=False)
val_loader = torch.load('/content/drive/MyDrive/val_loader.pth', weights_only=False)

In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PatchEmbedding(nn.Module):
    def __init__(self, image_size, patch_size, in_channels, embed_dim):
        super().__init__()
        self.image_size = image_size
        self.patch_size = patch_size
        # Calculate num_patches based on the actual image and patch sizes
        self.num_patches = (image_size // patch_size) * (image_size // patch_size)

        self.projection = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size
        )

        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        # Initialize positional embedding with the correct size
        self.positional_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim))

    def forward(self, x):
        #print(f"[DEBUG] Input shape to PatchEmbedding: {x.shape}")
        x = self.projection(x)
        #print(f"[DEBUG] Shape after projection: {x.shape}")
        x = x.flatten(2)
        #print(f"[DEBUG] Shape after flatten: {x.shape}")
        x = x.transpose(1, 2)
        #print(f"[DEBUG] Shape after transpose: {x.shape}")


        batch_size = x.shape[0]
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        #print(f"[DEBUG] Shape of cls_tokens: {cls_tokens.shape}")

        x = torch.cat((cls_tokens, x), dim=1)

        # Add debug prints here
        #print(f"[DEBUG] Shape of x before adding positional embedding: {x.shape}")
        #print(f"[DEBUG] Shape of positional_embedding: {self.positional_embedding.shape}")

        x = x + self.positional_embedding

        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads."

        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.attn_dropout = nn.Dropout(dropout)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.proj_dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, N, C = x.shape

        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)

        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_dropout(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)

        x = self.proj(x)
        x = self.proj_dropout(x)

        return x

class MLP(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x

class TransformerEncoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, mlp_ratio=4.0, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadAttention(embed_dim, num_heads, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        mlp_hidden_dim = int(embed_dim * mlp_ratio)
        self.mlp = MLP(in_features=embed_dim, hidden_features=mlp_hidden_dim, out_features=embed_dim, dropout=dropout)


    def forward(self, x):
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x

class VisionTransformer(nn.Module):
    def __init__(self, image_size=224, patch_size=16, in_channels=1, num_classes=3,
                 embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0, dropout=0.1):
        super().__init__()

        self.patch_embed = PatchEmbedding(image_size, patch_size, in_channels, embed_dim)

        self.encoder_blocks = nn.ModuleList([
            TransformerEncoderBlock(
                embed_dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                dropout=dropout,
            ) for i in range(depth)])

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.patch_embed(x)

        for block in self.encoder_blocks:
            x = block(x)

        x = self.norm(x)

        cls_token_final = x[:, 0]
        output = self.head(cls_token_final)

        return output

In [None]:
"""
Args:
        image_size (int): Size of the input image (e.g., 64).
        patch_size (int): Size of each patch (e.g., 8).
        in_channels (int): Number of input channels (e.g., 1 for your task).
        num_classes (int): Number of output classes (e.g., 3 for your task).
        embed_dim (int): The main embedding dimension (e.g., 768 for ViT-Base).
        depth (int): Number of Transformer Encoder blocks (e.g., 12 for ViT-Base).
        num_heads (int): Number of attention heads (e.g., 12 for ViT-Base).
        mlp_ratio (float): Ratio to determine MLP hidden dimension (e.g., 4.0).
        dropout (float): Dropout probability.
"""
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR, SequentialLR
batch_size = 32
learning_rate = 1e-4
weight_decay = 0.05
num_epochs = 2000
warmup_epochs = 10
model = VisionTransformer(
        image_size=64, patch_size=8, in_channels=1, num_classes=3,
                 embed_dim=48, depth=2, num_heads=12, mlp_ratio=4.0, dropout=0.0
    )


criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)


#scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
def warmup_lambda(current_epoch):
    if current_epoch < warmup_epochs:
        return float(current_epoch) / float(max(1, warmup_epochs))
    return 1.0
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup_lambda)
main_scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs - warmup_epochs, eta_min=1e-6)
scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, main_scheduler], milestones=[warmup_epochs])


print("Optimizer: Adam")
print(f"Learning Rate: {learning_rate}")

# Train Model
model, all_probs, all_labels = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs)

Optimizer: Adam
Learning Rate: 0.0001
Training on device: cuda
===== Epoch 1/2000 =====




New best model saved with Val ROC AUC: 0.4926, Train Loss: 0.9685, Train Accuracy: 0.5069, Val Loss: 0.9619, Val Accuracy: 0.5186
===== Epoch 2/2000 =====
New best model saved with Val ROC AUC: 0.4946, Train Loss: 0.9644, Train Accuracy: 0.5069, Val Loss: 0.9614, Val Accuracy: 0.5186
===== Epoch 3/2000 =====
New best model saved with Val ROC AUC: 0.4992, Train Loss: 0.9644, Train Accuracy: 0.5069, Val Loss: 0.9612, Val Accuracy: 0.5186
===== Epoch 4/2000 =====
No improvement in Val ROC AUC for 1 epoch(s). Best is 0.4992. Train Loss: 0.9650, Train Accuracy: 0.5069, Val Loss: 0.9650, Val Accuracy: 0.5186
===== Epoch 5/2000 =====
No improvement in Val ROC AUC for 2 epoch(s). Best is 0.4992. Train Loss: 0.9647, Train Accuracy: 0.5069, Val Loss: 0.9612, Val Accuracy: 0.5186
===== Epoch 6/2000 =====
No improvement in Val ROC AUC for 3 epoch(s). Best is 0.4992. Train Loss: 0.9641, Train Accuracy: 0.5069, Val Loss: 0.9611, Val Accuracy: 0.5186
===== Epoch 7/2000 =====
New best model saved with

In [32]:


#!pip install torch_xla[tpu]

In [33]:
#import torch_xla

In [41]:
"""
Args:
        image_size (int): Size of the input image (e.g., 224).
        patch_size (int): Size of each patch (e.g., 16).
        in_channels (int): Number of input channels (e.g., 1 for your task).
        num_classes (int): Number of output classes (e.g., 3 for your task).
        embed_dim (int): The main embedding dimension (e.g., 768 for ViT-Base).
        depth (int): Number of Transformer Encoder blocks (e.g., 12 for ViT-Base).
        num_heads (int): Number of attention heads (e.g., 12 for ViT-Base).
        mlp_ratio (float): Ratio to determine MLP hidden dimension (e.g., 4.0).
        dropout (float): Dropout probability.
"""
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR, SequentialLR
batch_size = 32
learning_rate = 1e-4
weight_decay = 0.05
num_epochs = 2000
warmup_epochs = 10
model = VisionTransformer(
        image_size=150, patch_size=10, in_channels=1, num_classes=3,
                 embed_dim=48, depth=2, num_heads=12, mlp_ratio=4.0, dropout=0.0
    )


criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)


#scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
def warmup_lambda(current_epoch):
    if current_epoch < warmup_epochs:
        return float(current_epoch) / float(max(1, warmup_epochs))
    return 1.0
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup_lambda)
main_scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs - warmup_epochs, eta_min=1e-6)
scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, main_scheduler], milestones=[warmup_epochs])


print("Optimizer: Adam")
print(f"Learning Rate: {learning_rate}")

# Train Model
model, all_probs, all_labels = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs)

Optimizer: Adam
Learning Rate: 0.0001
Training on device: cuda
===== Epoch 1/2000 =====


RuntimeError: The size of tensor a (37) must match the size of tensor b (226) at non-singleton dimension 1

In [35]:
"""
Args:
        image_size (int): Size of the input image (e.g., 224).
        patch_size (int): Size of each patch (e.g., 16).
        in_channels (int): Number of input channels (e.g., 1 for your task).
        num_classes (int): Number of output classes (e.g., 3 for your task).
        embed_dim (int): The main embedding dimension (e.g., 768 for ViT-Base).
        depth (int): Number of Transformer Encoder blocks (e.g., 12 for ViT-Base).
        num_heads (int): Number of attention heads (e.g., 12 for ViT-Base).
        mlp_ratio (float): Ratio to determine MLP hidden dimension (e.g., 4.0).
        dropout (float): Dropout probability.
"""
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR, SequentialLR
batch_size = 32
learning_rate = 1e-4
weight_decay = 0.05
num_epochs = 2000
warmup_epochs = 10
model = VisionTransformer(
        image_size=150, patch_size=10, in_channels=1, num_classes=3,
                 embed_dim=48, depth=2, num_heads=12, mlp_ratio=4.0, dropout=0.0
    )


criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)


#scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
def warmup_lambda(current_epoch):
    if current_epoch < warmup_epochs:
        return float(current_epoch) / float(max(1, warmup_epochs))
    return 1.0
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup_lambda)
main_scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs - warmup_epochs, eta_min=1e-6)
scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, main_scheduler], milestones=[warmup_epochs])


print("Optimizer: Adam")
print(f"Learning Rate: {learning_rate}")

# Train Model
model, all_probs, all_labels = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs)

Optimizer: Adam
Learning Rate: 0.0001
Training on device: cuda
===== Epoch 1/2000 =====
[DEBUG] Shape of x before adding positional embedding: torch.Size([32, 37, 48])
[DEBUG] Shape of positional_embedding: torch.Size([1, 226, 48])


RuntimeError: The size of tensor a (37) must match the size of tensor b (226) at non-singleton dimension 1

In [24]:
"""
Args:
        image_size (int): Size of the input image (e.g., 224).
        patch_size (int): Size of each patch (e.g., 16).
        in_channels (int): Number of input channels (e.g., 1 for your task).
        num_classes (int): Number of output classes (e.g., 3 for your task).
        embed_dim (int): The main embedding dimension (e.g., 768 for ViT-Base).
        depth (int): Number of Transformer Encoder blocks (e.g., 12 for ViT-Base).
        num_heads (int): Number of attention heads (e.g., 12 for ViT-Base).
        mlp_ratio (float): Ratio to determine MLP hidden dimension (e.g., 4.0).
        dropout (float): Dropout probability.
"""
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR, SequentialLR
batch_size = 32
learning_rate = 1e-4
weight_decay = 0.05
num_epochs = 2000
warmup_epochs = 10
model = VisionTransformer(
        image_size=150, patch_size=10, in_channels=1, num_classes=3,
                 embed_dim=48, depth=2, num_heads=12, mlp_ratio=4.0, dropout=0.0
    )


criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)


#scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
def warmup_lambda(current_epoch):
    if current_epoch < warmup_epochs:
        return float(current_epoch) / float(max(1, warmup_epochs))
    return 1.0
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup_lambda)
main_scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs - warmup_epochs, eta_min=1e-6)
scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, main_scheduler], milestones=[warmup_epochs])


print("Optimizer: Adam")
print(f"Learning Rate: {learning_rate}")

# Train Model
model, all_probs, all_labels = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs)

Optimizer: Adam
Learning Rate: 0.0001
Training on device: cuda
===== Epoch 1/2000 =====


RuntimeError: The size of tensor a (37) must match the size of tensor b (226) at non-singleton dimension 1

In [30]:
"""
Args:
        image_size (int): Size of the input image (e.g., 224).
        patch_size (int): Size of each patch (e.g., 16).
        in_channels (int): Number of input channels (e.g., 1 for your task).
        num_classes (int): Number of output classes (e.g., 3 for your task).
        embed_dim (int): The main embedding dimension (e.g., 768 for ViT-Base).
        depth (int): Number of Transformer Encoder blocks (e.g., 12 for ViT-Base).
        num_heads (int): Number of attention heads (e.g., 12 for ViT-Base).
        mlp_ratio (float): Ratio to determine MLP hidden dimension (e.g., 4.0).
        dropout (float): Dropout probability.
"""
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR, SequentialLR
batch_size = 32
learning_rate = 1e-4
weight_decay = 0.05
num_epochs = 2000
warmup_epochs = 10
model = VisionTransformer(
        image_size=150, patch_size=10, in_channels=1, num_classes=3,
                 embed_dim=48, depth=2, num_heads=12, mlp_ratio=4.0, dropout=0.0
    )


criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)


#scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
def warmup_lambda(current_epoch):
    if current_epoch < warmup_epochs:
        return float(current_epoch) / float(max(1, warmup_epochs))
    return 1.0
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup_lambda)
main_scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs - warmup_epochs, eta_min=1e-6)
scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, main_scheduler], milestones=[warmup_epochs])


print("Optimizer: Adam")
print(f"Learning Rate: {learning_rate}")

# Train Model
model, all_probs, all_labels = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs)

Optimizer: Adam
Learning Rate: 0.0001
Training on device: cuda
===== Epoch 1/2000 =====
[DEBUG] Shape of x before adding positional embedding: torch.Size([32, 37, 48])
[DEBUG] Shape of positional_embedding: torch.Size([1, 226, 48])


RuntimeError: The size of tensor a (37) must match the size of tensor b (226) at non-singleton dimension 1

In [9]:
import torch
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
import copy
import matplotlib.pyplot as plt
from itertools import cycle

def plot_roc_curve(all_labels, all_probs, class_names):
    """
    Plots the ROC curve for each class and the micro/macro averages.
    """
    # Binarize the labels for multi-class ROC analysis
    all_labels_bin = label_binarize(all_labels, classes=range(len(class_names)))
    all_probs = np.array(all_probs)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(class_names)):
        fpr[i], tpr[i], _ = roc_curve(all_labels_bin[:, i], all_probs[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(all_labels_bin.ravel(), all_probs.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(len(class_names))]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(len(class_names)):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    # Finally average it and compute AUC
    mean_tpr /= len(class_names)
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure(figsize=(10, 8))

    plt.plot(fpr["micro"], tpr["micro"],
             label=f'micro-average ROC curve (area = {roc_auc["micro"]:.2f})',
             color='deeppink', linestyle=':', linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label=f'macro-average ROC curve (area = {roc_auc["macro"]:.2f})',
             color='navy', linestyle=':', linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(len(class_names)), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label=f'ROC curve of class {class_names[i]} (area = {roc_auc[i]:.2f})')

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multi-class Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.savefig("/content/drive/MyDrive/Model_III_dataset/roc_curve_heal_swin_pinn.png")
    print("\nROC curve plot saved as roc_curve.png")

"""Training and Evaluation with Early Stopping"""
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=50, patience=2000):
    """
    Trains the model with early stopping based on validation ROC AUC score.

    Args:
        model (torch.nn.Module): The neural network model to train.
        train_loader (torch.utils.data.DataLoader): DataLoader for the training set.
        val_loader (torch.utils.data.DataLoader): DataLoader for the validation set.
        criterion: The loss function.
        optimizer: The optimization algorithm.
        scheduler: The learning rate scheduler.
        num_epochs (int): The maximum number of epochs to train for.
        patience (int): Number of epochs to wait for improvement before stopping.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training on device: {device}")

    model.to(device)

    best_roc_auc = 0.0
    epochs_no_improve = 0
    best_model_wts = copy.deepcopy(model.state_dict())
    best_probs = []
    best_labels = []

    class_names = ['axion', 'cdm', 'no_sub']

    for epoch in range(num_epochs):
        print(f"===== Epoch {epoch+1}/{num_epochs} =====")

        # --- Training Phase ---
        model.train()
        train_loss = 0.0
        train_correct = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs.data, 1)
            train_correct += (predicted == labels).sum().item()

        # --- Validation Phase ---
        model.eval()
        val_loss = 0.0
        val_correct = 0
        all_probs = []
        all_labels = []

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * images.size(0)
                _, predicted = torch.max(outputs.data, 1)
                val_correct += (predicted == labels).sum().item()

                probs = torch.softmax(outputs, dim=1)
                all_probs.extend(probs.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        # --- Calculate Metrics ---
        train_loss = train_loss / len(train_loader.dataset)
        val_loss = val_loss / len(val_loader.dataset)
        train_accuracy = train_correct / len(train_loader.dataset)
        val_accuracy = val_correct / len(val_loader.dataset)

        # Calculate multi-class ROC AUC score
        all_labels_np = np.array(all_labels)
        all_probs_np = np.array(all_probs)
        try:
            val_roc_auc = roc_auc_score(all_labels_np, all_probs_np, multi_class='ovr', average='macro')
        except ValueError as e:
            print(f"Could not calculate ROC AUC: {e}")
            val_roc_auc = 0.0

        # Epoch-level summary
        #print(f'\n[SUMMARY] Epoch {epoch+1}/{num_epochs}:')
        #print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')
        #print(f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val ROC AUC: {val_roc_auc:.4f}')

        if val_roc_auc > best_roc_auc:
            best_roc_auc = val_roc_auc
            epochs_no_improve = 0
            best_model_wts = copy.deepcopy(model.state_dict())
            best_probs = all_probs
            best_labels = all_labels
            #torch.save(model.state_dict(), '/content/drive/MyDrive/Model_III_dataset/lens_classifier_model_vision_transformer.pth')
            print(f"New best model saved with Val ROC AUC: {best_roc_auc:.4f}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
        else:
            epochs_no_improve += 1
            print(f"No improvement in Val ROC AUC for {epochs_no_improve} epoch(s). Best is {best_roc_auc:.4f}. Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        if epochs_no_improve >= patience:
            print(f"\nEarly stopping triggered after {patience} epochs without improvement.")
            model.load_state_dict(best_model_wts)
            break

    print("\nTraining Complete!")
    model.load_state_dict(best_model_wts)

    # After the training loop, plot the ROC curve for the best model
    if best_probs and best_labels:
        plot_roc_curve(best_labels, best_probs, class_names)

    return model, best_probs, best_labels

In [10]:
#torch.save(model.state_dict(), '/content/drive/MyDrive/Model_III_dataset/model_weights.pth')

In [11]:
"""
Args:
        image_size (int): Size of the input image (e.g., 224).
        patch_size (int): Size of each patch (e.g., 16).
        in_channels (int): Number of input channels (e.g., 1 for your task).
        num_classes (int): Number of output classes (e.g., 3 for your task).
        embed_dim (int): The main embedding dimension (e.g., 768 for ViT-Base).
        depth (int): Number of Transformer Encoder blocks (e.g., 12 for ViT-Base).
        num_heads (int): Number of attention heads (e.g., 12 for ViT-Base).
        mlp_ratio (float): Ratio to determine MLP hidden dimension (e.g., 4.0).
        dropout (float): Dropout probability.
"""
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR, SequentialLR
batch_size = 32
learning_rate = 1e-4
weight_decay = 0.05
num_epochs = 2000
warmup_epochs = 10
model = VisionTransformer(
        image_size=150, patch_size=10, in_channels=1, num_classes=3,
                 embed_dim=48, depth=2, num_heads=12, mlp_ratio=4.0, dropout=0.0
    )


criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)


#scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
def warmup_lambda(current_epoch):
    if current_epoch < warmup_epochs:
        return float(current_epoch) / float(max(1, warmup_epochs))
    return 1.0
warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup_lambda)
main_scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs - warmup_epochs, eta_min=1e-6)
scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, main_scheduler], milestones=[warmup_epochs])


print("Optimizer: Adam")
print(f"Learning Rate: {learning_rate}")

# Train Model
model, all_probs, all_labels = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs)

Optimizer: Adam
Learning Rate: 0.0001
Training on device: cuda
===== Epoch 1/2000 =====


RuntimeError: The size of tensor a (37) must match the size of tensor b (226) at non-singleton dimension 1

In [None]:
""" ROC Curve Plotting Function"""
def plot_roc_curve(all_preds, all_labels):
    print("Generating ROC Curve")

    # Convert predictions and labels to numpy arrays
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    n_classes = 3

    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve((all_labels == i).astype(int), all_preds[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

        print(f"Class {i} ROC AUC: {roc_auc[i]:.4f}")

    # Plot ROC curves
    plt.figure(figsize=(10, 8))
    colors = ['blue', 'red', 'green']
    class_names = ['Axion', 'CDM', 'No Substructure']

    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color,
                 label=f'{class_names[i]} (AUC = {roc_auc[i]:.2f})')

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.savefig('/content/drive/MyDrive/Model_III_dataset/roc_curve_vit.png')
    plt.close()

    print("ROC Curve saved as roc_curve.png")


plot_roc_curve(all_probs, all_labels)

print("Training and Evaluation Complete!")