In [1]:
# Step 1: Import Required Libraries
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader


In [3]:
# Step 2: Define a lightweight CNN model
class LightNet(nn.Module):
    def __init__(self, num_classes=20):
        super(LightNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * 56 * 56, 128)  # Adjust according to your input image size
        self.fc2 = nn.Linear(128, num_classes)
        
    def forward(self, x):
        x = nn.functional.relu(self.conv1(x))
        x = nn.functional.max_pool2d(x, 2)
        x = nn.functional.relu(self.conv2(x))
        x = nn.functional.max_pool2d(x, 2)
        x = x.view(x.size(0), -1)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Create an instance of the model
model = LightNet(num_classes=20)


In [4]:
# Step 1: Define a custom collate function
def custom_collate_fn(batch):
    images = []
    labels = []

    for item in batch:
        img, target = item  # img is the image, target is the dictionary
        images.append(img)

        # Create an empty label array for the image
        label_array = torch.zeros(20, dtype=torch.float32)  # Assuming 20 classes
        if 'object' in target['annotation']:
            for obj in target['annotation']['object']:
                class_name = obj['name']
                if class_name in class_mapping:
                    class_id = class_mapping[class_name]
                    label_array[class_id] = 1  # Mark this class as present

        labels.append(label_array)

    return torch.stack(images), torch.stack(labels)  # Stack images and labels to form batches


In [5]:
# Step 3: Define transformations for the dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to 224x224
    transforms.ToTensor(),  # Convert to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])


In [6]:
# Step 4: Load the Pascal VOC dataset
root_path = 'voc-data'  # Define the root path for downloading the dataset

# Load the training and validation sets
train_set = torchvision.datasets.VOCDetection(root=root_path, year='2012', image_set='train', transform=transform, download=True)
val_set = torchvision.datasets.VOCDetection(root=root_path, year='2012', image_set='val', transform=transform, download=True)

# Create DataLoaders for the training and validation sets
# train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
# val_loader = DataLoader(val_set, batch_size=16, shuffle=False)

# Step 2: Update DataLoader to use the custom collate function
train_loader = DataLoader(train_set, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_set, batch_size=16, shuffle=False, collate_fn=custom_collate_fn)



Using downloaded and verified file: voc-data\VOCtrainval_11-May-2012.tar
Extracting voc-data\VOCtrainval_11-May-2012.tar to voc-data
Using downloaded and verified file: voc-data\VOCtrainval_11-May-2012.tar
Extracting voc-data\VOCtrainval_11-May-2012.tar to voc-data


In [7]:
# Step 5: Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer
criterion = nn.CrossEntropyLoss()  # Cross-entropy loss for classification


In [8]:
# Step 6: Define the class mapping
class_mapping = {
    'aeroplane': 0,
    'bicycle': 1,
    'bird': 2,
    'boat': 3,
    'bottle': 4,
    'bus': 5,
    'car': 6,
    'cat': 7,
    'chair': 8,
    'cow': 9,
    'diningtable': 10,
    'dog': 11,
    'horse': 12,
    'motorbike': 13,
    'person': 14,
    'pottedplant': 15,
    'sheep': 16,
    'sofa': 17,
    'train': 18,
    'tvmonitor': 19
}


In [9]:
# Step 7: Set device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


LightNet(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=200704, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=20, bias=True)
)

## Model Train and Testing

In [12]:
# Validation function
def validate(model, val_loader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient calculation for validation
        for images, targets in val_loader:
            images = images.to(device)
            targets = targets.to(device)

            outputs = model(images)  # Forward pass
            loss = criterion(outputs, targets)  # Calculate loss

            val_loss += loss.item() * images.size(0)  # Accumulate loss

            # Calculate predictions and compare with targets
            _, predicted = torch.max(outputs, 1)
            _, labels = torch.max(targets, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss /= len(val_loader.dataset)
    accuracy = 100 * correct / total  # Calculate accuracy

    return val_loss, accuracy

# Step 8: Run validation after training epochs




In [14]:
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for images, targets in train_loader:
        images = images.to(device)
        targets = targets.to(device)  # Move targets to device

        optimizer.zero_grad()  # Zero gradients

        outputs = model(images)  # Forward pass

        # Compute loss (assuming outputs are [batch_size, num_classes])
        loss = criterion(outputs, targets)  # Calculate loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

        train_loss += loss.item() * images.size(0)

    train_loss /= len(train_loader.dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}')
    # Training code here

    
    # Run validation after each epoch
    val_loss, accuracy = validate(model, val_loader, criterion, device)
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}, Validation Accuracy: {accuracy:.2f}%')

Epoch [1/10], Loss: 3.4214
Epoch [1/10], Validation Loss: 3.7693, Validation Accuracy: 20.80%
Epoch [2/10], Loss: 2.6927
Epoch [2/10], Validation Loss: 4.0838, Validation Accuracy: 20.57%
Epoch [3/10], Loss: 2.0619
Epoch [3/10], Validation Loss: 4.5623, Validation Accuracy: 20.59%
Epoch [4/10], Loss: 1.7542
Epoch [4/10], Validation Loss: 4.5669, Validation Accuracy: 19.54%
Epoch [5/10], Loss: 1.5765
Epoch [5/10], Validation Loss: 5.0700, Validation Accuracy: 19.23%
Epoch [6/10], Loss: 1.4667
Epoch [6/10], Validation Loss: 4.8686, Validation Accuracy: 18.62%
Epoch [7/10], Loss: 1.3611
Epoch [7/10], Validation Loss: 4.7417, Validation Accuracy: 19.94%
Epoch [8/10], Loss: 1.3345
Epoch [8/10], Validation Loss: 5.1274, Validation Accuracy: 19.32%
Epoch [9/10], Loss: 1.2849
Epoch [9/10], Validation Loss: 5.0594, Validation Accuracy: 18.91%
Epoch [10/10], Loss: 1.2660
Epoch [10/10], Validation Loss: 4.8336, Validation Accuracy: 19.58%
