In [7]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import cv2
import numpy as np


# Dataset Class for WIDERFACE
class WiderFaceDataset(Dataset):
    def __init__(self, images_dir, annotations_dir, transform=None, img_size=(208, 208)):
        self.images_dir = images_dir
        self.annotations_dir = annotations_dir
        self.transform = transform
        self.img_size = img_size  # Default is 416x416
        self.image_paths = []
        self.annotations = []

        # List all image files in the images directory
        image_files = [f for f in os.listdir(images_dir) if f.endswith('.jpg') or f.endswith('.png')]

        for image_file in image_files:
            image_path = os.path.join(images_dir, image_file)
            label_file = os.path.splitext(image_file)[0] + '.txt'  # Match the image name with label file
            label_path = os.path.join(annotations_dir, label_file)

            if os.path.exists(label_path):
                with open(label_path, 'r') as f:
                    lines = f.readlines()

                # Process each annotation in the file
                boxes = []
                for line in lines:
                    parts = line.split()
                    num_faces = int(parts[0])

                    for i in range(num_faces):
                        x_center = float(parts[2 + 4*i])  # normalized x-center
                        y_center = float(parts[3 + 4*i])  # normalized y-center
                        width = float(parts[4 + 4*i])     # normalized width
                        height = float(parts[5 + 4*i])    # normalized height
                        boxes.append([x_center, y_center, width, height])

                self.image_paths.append(image_path)
                self.annotations.append(boxes)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = cv2.imread(self.image_paths[idx])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Resize image to 416x416 for YOLOv1 input
        image = cv2.resize(image, self.img_size)

        # Convert to float32 and normalize to range [0, 1]
        image = image.astype(np.float32) / 255.0

        # Convert image to a tensor
        image = torch.tensor(image).permute(2, 0, 1)  # Convert to CxHxW format

        boxes = self.annotations[idx]

        # Normalize boxes to grid size (YOLOv1 uses 13x13 grid for 416x416 input)
        grid_size = 13
        target = np.zeros((grid_size, grid_size, 5))  # 5: 4 for box + 1 for confidence

        for box in boxes:
            x_center, y_center, width, height = box
            grid_x = int(x_center * grid_size)
            grid_y = int(y_center * grid_size)

            target[grid_y, grid_x, 0] = x_center  # Box center X
            target[grid_y, grid_x, 1] = y_center  # Box center Y
            target[grid_y, grid_x, 2] = width     # Width
            target[grid_y, grid_x, 3] = height    # Height
            target[grid_y, grid_x, 4] = 1         # Confidence (since we have faces)

        # Convert target to tensor with correct dtype
        target = torch.tensor(target, dtype=torch.float32)

        return image, target


# YOLOv1 Architecture
class YOLOv1(nn.Module):
    def __init__(self, img_size=416, grid_size=13):
        super(YOLOv1, self).__init__()

        self.img_size = img_size
        self.grid_size = grid_size  # Grid size (e.g., 13x13)

        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 64, 7, stride=2, padding=3)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(64, 192, 3, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(192, 128, 1, stride=1, padding=0)
        self.conv4 = nn.Conv2d(128, 256, 3, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(2, 2)
        self.conv5 = nn.Conv2d(256, 512, 3, stride=1, padding=1)

        # Calculate the size of the output after all convolutional and pooling layers
        self._initialize_conv_output_size()

        # Fully connected layers
        self.fc1 = nn.Linear(self.flattened_size, 4096)  # Dynamic flattened size
        self.fc2 = nn.Linear(4096, self.grid_size * self.grid_size * 5)  # Output for 13x13 grid with 5 values (box + confidence)

    def _initialize_conv_output_size(self):
        # Create a dummy input to calculate the output size after convolutional layers
        with torch.no_grad():
            dummy_input = torch.zeros(1, 3, self.img_size, self.img_size)  # Batch size of 1, RGB image
            dummy_output = self._forward_conv(dummy_input)
            self.flattened_size = dummy_output.numel()

    def _forward_conv(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool3(x)
        x = F.relu(self.conv5(x))
        return x

    def forward(self, x):
        # Pass through convolutional layers
        x = self._forward_conv(x)

        # Flatten the output for the fully connected layers
        x = x.view(x.size(0), -1)  # Flatten
        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        # Reshape to match the grid size (batch_size, 13, 13, 5)
        x = x.view(-1, self.grid_size, self.grid_size, 5)

        return x


# Training Setup (CPU Only)
device_cpu = torch.device('cpu')  # Use only CPU

# Hyperparameters
batch_size = 2  # Reduced batch size
epochs = 10
learning_rate = 1e-4

# Dataset Paths
train_images_dir = '/home/cse/Documents/MdSourav/DNN/Assignment7/WIDER Face Dataset/split_data/train/images'
train_annotations_dir = '/home/cse/Documents/MdSourav/DNN/Assignment7/WIDER Face Dataset/split_data/train/labels'
val_images_dir = '/home/cse/Documents/MdSourav/DNN/Assignment7/WIDER Face Dataset/split_data/val/images'
val_annotations_dir = '/home/cse/Documents/MdSourav/DNN/Assignment7/WIDER Face Dataset/split_data/val/labels'

# Create Datasets and DataLoaders
train_dataset = WiderFaceDataset(
    images_dir=train_images_dir, 
    annotations_dir=train_annotations_dir, 
    img_size=(208, 208)  # Resize to 208x208
)
val_dataset = WiderFaceDataset(
    images_dir=val_images_dir, 
    annotations_dir=val_annotations_dir, 
    img_size=(208, 208)
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model, loss function, and optimizer
model = YOLOv1()
model = model.to(device_cpu)  # Store model on CPU

criterion = nn.MSELoss()  # Using MSELoss for simplicity (you can implement a custom YOLO loss function)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for images, targets in train_loader:
        images = images.to(device_cpu)  # Keep images on CPU for training
        targets = targets.to(device_cpu)  # Keep targets on CPU for training

        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")

# Save the trained model
torch.save(model.state_dict(), 'yolov1_face_detector_cpu.pth')
print("Model saved!")

# Evaluate the model on validation set (Optional)
model.eval()
with torch.no_grad():
    val_loss = 0.0
    for images, targets in val_loader:
        images = images.to(device_cpu)  # Keep images on CPU for evaluation
        targets = targets.to(device_cpu)  # Keep targets on CPU for evaluation

        outputs = model(images)
        loss = criterion(outputs, targets)
        val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x86528 and 346112x4096)