In [2]:
# Step 1: Import Required Libraries
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader


In [23]:
import torch
import torch.nn as nn

class LightNet(nn.Module):
    def __init__(self, num_classes=20):  # Adjust for your specific class count
        super(LightNet, self).__init__()
        
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        
        # Fully connected layers for bounding box regression and classification
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 32 * 32, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes + 4)  # num_classes + 4 for bounding box
        )
        
    def forward(self, x):
        x = self.backbone(x)
        x = self.fc(x)
        return x  # Output shape: (batch_size, num_classes + 4)


# Create an instance of the model
model = LightNet(num_classes=20)


In [24]:
# Step 1: Define a custom collate function
def custom_collate_fn(batch):
    images = []
    labels = []

    for item in batch:
        img, target = item  # img is the image, target is the dictionary
        images.append(img)

        # Create an empty label array for the image
        label_array = torch.zeros(20, dtype=torch.float32)  # Assuming 20 classes
        if 'object' in target['annotation']:
            for obj in target['annotation']['object']:
                class_name = obj['name']
                if class_name in class_mapping:
                    class_id = class_mapping[class_name]
                    label_array[class_id] = 1  # Mark this class as present

        labels.append(label_array)

    return torch.stack(images), torch.stack(labels)  # Stack images and labels to form batches


In [25]:
# Step 3: Define transformations for the dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to 224x224
    transforms.ToTensor(),  # Convert to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])


In [26]:
# Step 4: Load the Pascal VOC dataset
root_path = 'voc-data'  # Define the root path for downloading the dataset

# Load the training and validation sets
train_set = torchvision.datasets.VOCDetection(root=root_path, year='2012', image_set='train', transform=transform, download=True)
val_set = torchvision.datasets.VOCDetection(root=root_path, year='2012', image_set='val', transform=transform, download=True)

# Create DataLoaders for the training and validation sets
# train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
# val_loader = DataLoader(val_set, batch_size=16, shuffle=False)

# Step 2: Update DataLoader to use the custom collate function
train_loader = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_set, batch_size=16, shuffle=False, collate_fn=custom_collate_fn)



Using downloaded and verified file: voc-data\VOCtrainval_11-May-2012.tar
Extracting voc-data\VOCtrainval_11-May-2012.tar to voc-data
Using downloaded and verified file: voc-data\VOCtrainval_11-May-2012.tar
Extracting voc-data\VOCtrainval_11-May-2012.tar to voc-data


In [27]:
# Step 5: Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer
criterion = nn.CrossEntropyLoss()  # Cross-entropy loss for classification


In [28]:
# Step 6: Define the class mapping
class_mapping = {
    'aeroplane': 0,
    'bicycle': 1,
    'bird': 2,
    'boat': 3,
    'bottle': 4,
    'bus': 5,
    'car': 6,
    'cat': 7,
    'chair': 8,
    'cow': 9,
    'diningtable': 10,
    'dog': 11,
    'horse': 12,
    'motorbike': 13,
    'person': 14,
    'pottedplant': 15,
    'sheep': 16,
    'sofa': 17,
    'train': 18,
    'tvmonitor': 19
}


In [29]:
# Step 7: Set device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


LightNet(
  (backbone): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=65536, out_features=256, bias=True)
    (2): ReLU()
    (3): Linear(in_features=256, out_features=24, bias=True)
  )
)

## Model Train and Testing

In [30]:
# Validation function
def validate(model, val_loader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient calculation for validation
        for images, targets in val_loader:
            images = images.to(device)
            targets = targets.to(device)

            outputs = model(images)  # Forward pass
            loss = criterion(outputs, targets)  # Calculate loss

            val_loss += loss.item() * images.size(0)  # Accumulate loss

            # Calculate predictions and compare with targets
            _, predicted = torch.max(outputs, 1)
            _, labels = torch.max(targets, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss /= len(val_loader.dataset)
    accuracy = 100 * correct / total  # Calculate accuracy

    return val_loss, accuracy

# Step 8: Run validation after training epochs




In [31]:
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for images, targets in train_loader:
        images = images.to(device)
        targets = targets.to(device)  # Move targets to device

        optimizer.zero_grad()  # Zero gradients

        outputs = model(images)  # Forward pass

        # Compute loss (assuming outputs are [batch_size, num_classes])
        loss = criterion(outputs, targets)  # Calculate loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

        train_loss += loss.item() * images.size(0)

    train_loss /= len(train_loader.dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}')
    # Training code here

    
    # Run validation after each epoch
    val_loss, accuracy = validate(model, val_loader, criterion, device)
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}, Validation Accuracy: {accuracy:.2f}%')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x50176 and 65536x256)

## implement KD using log-loss

## load pre-train model

In [32]:
import torch

# Load a pre-trained YOLOv5s model from ultralytics repository
# teacher_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
teacher_model = torch.hub.load('ultralytics/yolov5', 'custom', path='./yolov5/runs/train/exp15/weights/best.pt')
teacher_model.to(device)
# Set teacher to evaluation mode
teacher_model.eval()

# Initialize your custom lightweight student model
student_model = LightNet(num_classes=20)  # Assuming 20 classes for Pascal VOC
student_model.train()  # Set student model to training mode


Using cache found in C:\Users\Lenovo/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-10-25 Python-3.12.4 torch-2.3.1+cu118 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

Fusing layers... 
Model summary: 157 layers, 7064065 parameters, 0 gradients, 15.9 GFLOPs
Adding AutoShape... 


LightNet(
  (backbone): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=65536, out_features=256, bias=True)
    (2): ReLU()
    (3): Linear(in_features=256, out_features=24, bias=True)
  )
)

In [33]:
# Set device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the student model to the appropriate device
student_model.to(device)
device

device(type='cuda')

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim

# Custom function for knowledge distillation training
def train_knowledge_distillation(teacher, student, train_loader, epochs, learning_rate, T, soft_target_loss_weight, ce_loss_weight, device):
    ce_loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)

    teacher.eval()  # Teacher set to evaluation mode
    student.train() # Student set to training mode

    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # Forward pass with the teacher model - no gradients for teacher
            with torch.no_grad():
                teacher_logits = teacher(inputs)

            # Forward pass with the student model
            student_logits = student(inputs)

            # Calculate soft targets from teacher and student logits
            soft_targets = nn.functional.softmax(teacher_logits / T, dim=-1)
            soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

            # Soft target loss (distillation loss) - scaled by T**2
            soft_targets_loss = torch.sum(soft_targets * (soft_targets.log() - soft_prob)) / soft_prob.size(0) * (T**2)

            # True label loss
            label_loss = ce_loss(student_logits, labels)

            # Total loss (weighted sum of soft target loss and true label loss)
            loss = soft_target_loss_weight * soft_targets_loss + ce_loss_weight * label_loss

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader)}")


# KD Training Hyperparameters
T = 2  # Temperature for soft targets
soft_target_loss_weight = 0.25
ce_loss_weight = 0.75
learning_rate = 0.001
epochs = 10

# Execute the knowledge distillation training
train_knowledge_distillation(
    teacher=teacher_model,
    student=student_model,
    train_loader=train_loader,
    epochs=epochs,
    learning_rate=learning_rate,
    T=T,
    soft_target_loss_weight=soft_target_loss_weight,
    ce_loss_weight=ce_loss_weight,
    device=device
)

print("Knowledge distillation training completed.")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x50176 and 65536x256)

In [21]:
# Check the output size from the teacher model
with torch.no_grad():
    sample_input = torch.randn(1, 3, 224, 224).to(device)  # Adjust input size according to your model
    teacher_output = teacher_model(sample_input)
    print(f'Teacher output shape: {teacher_output.shape}')  # Check the shape

# Check the output size from the student model
student_output = student_model(sample_input)
print(f'Student output shape: {student_output.shape}')  # Check the shape

# Ensure both models predict the same number of classes
num_classes = 20  # Define the number of classes for both models
teacher_model.num_classes = num_classes  # If applicable, update your teacher model's output layer
student_model.num_classes = num_classes  # Confirm student model is already set to this

# During the KD training loss calculation, ensure soft targets have the same size:
# soft_targets_loss = torch.sum(soft_targets * (soft_targets.log() - soft_prob), dim=1).mean() * (T**2)


Teacher output shape: torch.Size([1, 3087, 25])
Student output shape: torch.Size([1, 20])


In [32]:
# Training parameters
epochs = 10
learning_rate = 0.001
T = 2  # Temperature for KD
soft_target_loss_weight = 0.25
ce_loss_weight = 0.75

# Run the training loop
train_knowledge_distillation(
    teacher=teacher_model,
    student=student_model,
    train_loader=train_loader,
    epochs=epochs,
    learning_rate=learning_rate,
    T=T,
    soft_target_loss_weight=soft_target_loss_weight,
    ce_loss_weight=ce_loss_weight,
    device=device
)


RuntimeError: The size of tensor a (3087) must match the size of tensor b (20) at non-singleton dimension 1

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Step 1: Load YOLOv5 as the Teacher Model
teacher_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True).to(device)
teacher_model.eval()

Using cache found in C:\Users\Lenovo/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-10-25 Python-3.12.4 torch-2.3.1+cu118 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


AutoShape(
  (model): DetectMultiBackend(
    (model): DetectionModel(
      (model): Sequential(
        (0): Conv(
          (conv): Conv2d(3, 32, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
          (act): SiLU(inplace=True)
        )
        (1): Conv(
          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (act): SiLU(inplace=True)
        )
        (2): C3(
          (cv1): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv3): Conv(
            (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (m): Sequential(
            (0): Bottleneck(
              (cv1): Conv(
                (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
  

In [35]:
# Step 2: Define the Student Model (Lightweight CNN Model)
class StudentNet(nn.Module):
    def __init__(self, num_classes=20):
        super(StudentNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 56 * 56, 128)  # Adjust as per your image dimensions
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # Final logits

student_model = StudentNet(num_classes=20).to(device)
student_model.train()

StudentNet(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=200704, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=20, bias=True)
)

In [36]:
# Step 3: Define Extraction Function for Teacher Model Outputs
def extract_teacher_logits(teacher_output, num_classes=20):
    # Teacher outputs: tensor of shape [batch_size, num_detections, num_classes + 5]
    logits = teacher_output[0][:, :, 5:]  # class logits (skip first 5 for bounding box predictions)
    avg_logits = logits.mean(dim=1)  # Average across detections
    return avg_logits[:, :num_classes]  # Return only the required classes

In [37]:
# Step 4: Knowledge Distillation Loss Function
def knowledge_distillation_loss(student_logits, teacher_logits, labels, T, alpha):
    soft_teacher_probs = F.softmax(teacher_logits / T, dim=1)
    soft_student_probs = F.log_softmax(student_logits / T, dim=1)
    distillation_loss = F.kl_div(soft_student_probs, soft_teacher_probs, reduction='batchmean') * (T**2)

    ce_loss = F.cross_entropy(student_logits, labels)  # Hard target loss
    return alpha * distillation_loss + (1 - alpha) * ce_loss

In [38]:
# Step 5: Training Loop
def train_kd(teacher, student, dataloader, optimizer, epochs, T, alpha):
    for epoch in range(epochs):
        total_loss = 0
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            # Teacher logits (disable grad)
            with torch.no_grad():
                teacher_output = teacher(images)
                teacher_logits = extract_teacher_logits(teacher_output)

            # Student logits
            student_logits = student(images)

            # Calculate KD loss
            loss = knowledge_distillation_loss(student_logits, teacher_logits, labels, T, alpha)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(dataloader):.4f}")

In [40]:
# Set up training parameters
epochs = 10
learning_rate = 0.001
T = 2.0  # Temperature
alpha = 0.5  # Distillation loss weight
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)

# Load dataset and train the KD model
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
train_set = datasets.VOCDetection(root='voc-data', year='2012', image_set='train', download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)

train_kd(teacher_model, student_model, train_loader, optimizer, epochs, T, alpha)

Using downloaded and verified file: voc-data\VOCtrainval_11-May-2012.tar
Extracting voc-data\VOCtrainval_11-May-2012.tar to voc-data


IndexError: too many indices for tensor of dimension 2

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Step 1: Load YOLOv5 as the Teacher Model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
teacher_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True).to(device)
teacher_model.eval()

# Step 2: Define the Student Model (Lightweight CNN Model)
class StudentNet(nn.Module):
    def __init__(self, num_classes=20):
        super(StudentNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 56 * 56, 128)  # Adjust as per your image dimensions
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # Final logits

student_model = StudentNet(num_classes=20).to(device)
student_model.train()

# Step 3: Define Extraction Function for Teacher Model Outputs
def extract_teacher_logits(teacher_output, num_classes=20):
    logits_list = []
    for output in teacher_output:
        # Check output shape
        if output.shape[0] == 0:  # No detections for this image
            logits_list.append(torch.zeros(1, num_classes).to(device))  # Handle empty detection
            continue
        class_logits = output[:, 5:]  # Skip first 5 elements for bounding boxes
        avg_logits = class_logits.mean(dim=0)  # Average across detections
        logits_list.append(avg_logits[:num_classes])  # Keep only the required classes
    
    # Stack logits to create a tensor [batch_size, num_classes]
    return torch.stack(logits_list)

# Step 4: Knowledge Distillation Loss Function
def knowledge_distillation_loss(student_logits, teacher_logits, labels, T, alpha):
    soft_teacher_probs = F.softmax(teacher_logits / T, dim=1)
    soft_student_probs = F.log_softmax(student_logits / T, dim=1)
    distillation_loss = F.kl_div(soft_student_probs, soft_teacher_probs, reduction='batchmean') * (T**2)

    ce_loss = F.cross_entropy(student_logits, labels)  # Hard target loss
    return alpha * distillation_loss + (1 - alpha) * ce_loss

# Step 5: Training Loop
def train_kd(teacher, student, dataloader, optimizer, epochs, T, alpha):
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0
        
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            # Teacher logits (disable grad)
            with torch.no_grad():
                teacher_output = teacher(images)
                teacher_logits = extract_teacher_logits(teacher_output)

            # Student logits
            student_logits = student(images)

            # Calculate KD loss
            loss = knowledge_distillation_loss(student_logits, teacher_logits, labels, T, alpha)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(student_logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Print epoch results
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(dataloader):.4f}, Accuracy: {100 * correct / total:.2f}%")

# Set up training parameters
epochs = 10
learning_rate = 0.001
T = 2.0  # Temperature
alpha = 0.5  # Distillation loss weight
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)

# Load dataset and train the KD model
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
train_set = datasets.VOCDetection(root='voc-data', year='2012', image_set='train', download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)

train_kd(teacher_model, student_model, train_loader, optimizer, epochs, T, alpha)


Using cache found in C:\Users\Lenovo/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-10-25 Python-3.12.4 torch-2.3.1+cu118 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


Using downloaded and verified file: voc-data\VOCtrainval_11-May-2012.tar
Extracting voc-data\VOCtrainval_11-May-2012.tar to voc-data


RuntimeError: The size of tensor a (16) must match the size of tensor b (20) at non-singleton dimension 1

In [48]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import numpy as np

# Step 1: Load YOLOv5 as the Teacher Model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
teacher_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True).to(device)
teacher_model.eval()

# Step 2: Define the Student Model (Lightweight CNN Model)
class StudentNet(nn.Module):
    def __init__(self, num_classes=20):
        super(StudentNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 56 * 56, 128)  # Adjust as per your image dimensions
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # Final logits

student_model = StudentNet(num_classes=20).to(device)
student_model.train()

# Step 3: Define Extraction Function for Teacher Model Outputs
def extract_teacher_logits(teacher_output, num_classes=20):
    logits_list = []
    for output in teacher_output:
        if output.shape[0] == 0:  # No detections for this image
            logits_list.append(torch.zeros(1, num_classes).to(device))  # Handle empty detection
            continue
        class_logits = output[:, 5:]  # Skip first 5 elements for bounding boxes
        avg_logits = class_logits.mean(dim=0)  # Average across detections
        logits_list.append(avg_logits[:num_classes])  # Keep only the required classes
    
    return torch.stack(logits_list)

# Step 4: Knowledge Distillation Loss Function
import torch
import torch.nn.functional as F

# Modify the knowledge distillation loss function
def knowledge_distillation_loss(student_logits, teacher_logits, labels, T, alpha):
    # Teacher softmax
    soft_teacher_probs = F.softmax(teacher_logits / T, dim=1)

    # Student softmax
    soft_student_probs = F.log_softmax(student_logits / T, dim=1)

    # Distillation loss
    distillation_loss = F.kl_div(soft_student_probs, soft_teacher_probs, reduction='batchmean') * (T**2)

    # Ensure labels are Long Tensor (for CrossEntropy)
    if labels.dim() == 1:
        ce_loss = F.cross_entropy(student_logits, labels)  # Hard target loss
    else:
        raise ValueError("Labels should be a 1D tensor of class indices")

    return alpha * distillation_loss + (1 - alpha) * ce_loss

def train_kd(teacher, student, dataloader, optimizer, epochs, T, alpha):
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0
        
        for images, targets in dataloader:
            images = images.to(device)
            optimizer.zero_grad()

            # Extract labels from targets
            labels = []
            for target in targets:
                if isinstance(target, dict) and 'annotation' in target:
                    if 'object' in target['annotation'] and len(target['annotation']['object']) > 0:
                        # Collect labels for detected objects
                        label_indices = [obj['name'] for obj in target['annotation']['object']]
                        class_indices = [class_mapping[name] for name in label_indices if name in class_mapping]
                        # Assuming a single label per image for simplicity, if multiple, adjust accordingly
                        if class_indices:
                            labels.append(class_indices[0])  # Take the first label
                        else:
                            labels.append(-1)  # No valid labels
                    else:
                        labels.append(-1)  # No detections
                else:
                    labels.append(-1)  # Invalid target
                
            # Convert labels to tensor and filter out invalid labels
            labels = torch.tensor([label for label in labels if label != -1]).to(device)  # Only keep valid indices
            labels = labels.long()  # Ensure labels are Long Tensor

            # Pad labels to match batch size if necessary
            if labels.size(0) < images.size(0):
                # Pad with a default value, e.g., -1 (not a valid class index)
                padding_size = images.size(0) - labels.size(0)
                labels = F.pad(labels, (0, padding_size), value=-1)  # Pad on the right

            # Teacher logits (disable grad)
            with torch.no_grad():
                teacher_output = teacher(images)
                teacher_logits = extract_teacher_logits(teacher_output)

            # Student logits
            student_logits = student(images)

            # Calculate KD loss
            loss = knowledge_distillation_loss(student_logits, teacher_logits, labels, T, alpha)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Calculate accuracy (only consider valid labels)
            _, predicted = torch.max(student_logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Print epoch results
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(dataloader):.4f}, Accuracy: {100 * correct / total:.2f}%")




# Set up training parameters
epochs = 10
learning_rate = 0.001
T = 2.0  # Temperature
alpha = 0.5  # Distillation loss weight
optimizer = optim.Adam(student_model.parameters(), lr=learning_rate)

# Load dataset and train the KD model
transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
train_set = datasets.VOCDetection(root='voc-data', year='2012', image_set='train', download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)

train_kd(teacher_model, student_model, train_loader, optimizer, epochs, T, alpha)


Using cache found in C:\Users\Lenovo/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-10-25 Python-3.12.4 torch-2.3.1+cu118 CUDA:0 (NVIDIA GeForce RTX 4050 Laptop GPU, 6140MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


Using downloaded and verified file: voc-data\VOCtrainval_11-May-2012.tar
Extracting voc-data\VOCtrainval_11-May-2012.tar to voc-data


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [49]:
print("Student logits shape:", student_logits.shape)
print("Teacher logits shape:", teacher_logits.shape)
print("Labels shape:", labels.shape)


NameError: name 'student_logits' is not defined