<a href="https://colab.research.google.com/github/MdSourav76046/DNN_4.2/blob/main/2010776146_assignment7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
pip install ultralytics opencv-python



In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mksaad/wider-face-a-face-detection-benchmark")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/wider-face-a-face-detection-benchmark


In [6]:
# === YOLOv1 Face Detector Full Implementation ===

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import numpy as np
import cv2
import glob
from tqdm import tqdm
# === Config ===
S = 7  # grid size
B = 2  # number of bounding boxes
C = 1  # number of classes (face)
IMG_SIZE = 256
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# === Paths ===
ANNOTATION_DIR = "/root/.cache/kagglehub/datasets/mksaad/wider-face-a-face-detection-benchmark/versions/4/wider_face_split/wider_face_split"
TRAIN_IMG_DIR = "/root/.cache/kagglehub/datasets/mksaad/wider-face-a-face-detection-benchmark/versions/4/WIDER_train/WIDER_train/images"
VAL_IMG_DIR = "/root/.cache/kagglehub/datasets/mksaad/wider-face-a-face-detection-benchmark/versions/4/WIDER_val/WIDER_val/images"

TRAIN_ANNOTATION_FILE = os.path.join(ANNOTATION_DIR, "wider_face_train_bbx_gt.txt")
VAL_ANNOTATION_FILE = os.path.join(ANNOTATION_DIR, "wider_face_val_bbx_gt.txt")

# === Dataset storage ===
YOLOv1_DATASET = {
    "train": [],
    "val": []
}

def convert_for_yolov1(txt_path, image_root, split):
    with open(txt_path, 'r') as f:
        lines = f.readlines()

    idx = 0
    total_images = 0

    while idx < len(lines):
        rel_image_path = lines[idx].strip()
        face_count = int(lines[idx + 1].strip())
        full_image_path = os.path.join(image_root, rel_image_path)

        if not os.path.exists(full_image_path):
            idx += 2 + face_count
            continue

        img = cv2.imread(full_image_path)
        if img is None:
            idx += 2 + face_count
            continue

        h, w = img.shape[:2]
        label_matrix = torch.zeros((S, S, 5 + C))

        for i in range(face_count):
            x, y, bw, bh = map(int, lines[idx + 2 + i].strip().split()[:4])

            xc = (x + bw / 2) / w
            yc = (y + bh / 2) / h
            nw = bw / w
            nh = bh / h

            i_cell, j_cell = int(yc * S), int(xc * S)
            if i_cell >= S or j_cell >= S:
                continue

            x_cell = xc * S - j_cell
            y_cell = yc * S - i_cell

            if label_matrix[i_cell, j_cell, 0] == 0:
                label_matrix[i_cell, j_cell, 0:5] = torch.tensor([1, x_cell, y_cell, nw, nh])
                label_matrix[i_cell, j_cell, 5] = 1  # class one-hot (face)

        YOLOv1_DATASET[split].append((full_image_path, label_matrix))
        total_images += 1
        idx += 2 + face_count

    print(f" Converted {total_images} images for split '{split}'.")

# === Run conversion ===
convert_for_yolov1(TRAIN_ANNOTATION_FILE, TRAIN_IMG_DIR, "train")
convert_for_yolov1(VAL_ANNOTATION_FILE, VAL_IMG_DIR, "val")

# === Dataset Class ===
class YOLOv1Dataset(Dataset):
    def __init__(self, dataset_list, transform=None):
        self.data = dataset_list
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, label_matrix = self.data[idx]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))

        if self.transform:
            img = self.transform(img)
        else:
            img = torch.from_numpy(img.transpose(2, 0, 1)).float() / 255.0

        return img, label_matrix

# === Example: Create Dataloaders ===
train_dataset = YOLOv1Dataset(YOLOv1_DATASET['train'])
val_dataset = YOLOv1Dataset(YOLOv1_DATASET['val'])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

class YOLOv1(nn.Module):
    def __init__(self, S=7, B=2, C=1):  # C=1 for face class
        super(YOLOv1, self).__init__()
        self.S = S  # grid size
        self.B = B  # number of bounding boxes per grid
        self.C = C  # number of classes (1 for face)

        self.features = nn.Sequential(
            # Block 1
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),

            # Block 2
            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),

            # Block 3
            nn.Conv2d(192, 128, kernel_size=1),
            nn.LeakyReLU(0.1),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.LeakyReLU(0.1),
            nn.Conv2d(256, 256, kernel_size=1),
            nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),

            # 4 conv blocks (all maintaining 512 channels)
            *[nn.Sequential(
                nn.Conv2d(512, 256, kernel_size=1),
                nn.LeakyReLU(0.1),
                nn.Conv2d(256, 512, kernel_size=3, padding=1),
                nn.LeakyReLU(0.1)
            ) for _ in range(4)],

            # Transition to 1024 channels
            nn.Conv2d(512, 1024, kernel_size=3, padding=1),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2),

            # Final blocks (all 1024 channels)
            nn.Conv2d(1024, 512, kernel_size=1),
            nn.LeakyReLU(0.1),
            nn.Conv2d(512, 1024, kernel_size=3, padding=1),
            nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
            nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.1),

            # Last two conv layers
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
            nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
            nn.LeakyReLU(0.1),
            nn.AdaptiveAvgPool2d((self.S, self.S))
        )

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 4096),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.5),
            nn.Linear(4096, S * S * (C + B * 5))  # final output
        )

    def forward(self, x):
        x = self.features(x)
        x = self.fc(x)
        return x.view(-1, self.S, self.S, self.C + self.B * 5)


# === Loss Function (Simplified YOLO Loss) ===
class YoloLoss(nn.Module):
    def __init__(self, S=7, B=2, C=1):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction='sum')
        self.S, self.B, self.C = S, B, C
        self.lambda_coord = 5
        self.lambda_noobj = 0.5

    def forward(self, preds, target):
        obj = target[..., 4].unsqueeze(-1)

        coord_loss = self.lambda_coord * self.mse(preds[..., 0:2] * obj, target[..., 0:2] * obj)
        size_loss = self.lambda_coord * self.mse(preds[..., 2:4] * obj, target[..., 2:4] * obj)
        conf_loss = self.mse(preds[..., 4:5] * obj, target[..., 4:5] * obj) + \
                    self.lambda_noobj * self.mse(preds[..., 4:5] * (1 - obj), target[..., 4:5] * (1 - obj))
        class_loss = self.mse(preds[..., 5:] * obj, target[..., 5:] * obj)

        return coord_loss + size_loss + conf_loss + class_loss

# === Training Loop ===
def train(model, dataloader, optimizer, criterion, epochs=30):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(device), labels.to(device)
            preds = model(imgs)
            loss = criterion(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(dataloader):.4f}")

# === Run Training ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = YOLOv1().to(device)

criterion = YoloLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    model.train()
    loop = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{10}]")
    for imgs, targets in loop:
        imgs, targets = imgs.to(DEVICE), targets.to(DEVICE)

        preds = model(imgs)
        loss = criterion(preds, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())

# === Save Model ===
torch.save(model.state_dict(), "yolov1_face.pt")
print("✅ Model saved as 'yolov1_face.pt'")


 Converted 12880 images for split 'train'.
 Converted 3226 images for split 'val'.


Epoch [1/10]: 100%|██████████| 805/805 [03:53<00:00,  3.45it/s, loss=6.18e+13]
Epoch [2/10]: 100%|██████████| 805/805 [03:52<00:00,  3.45it/s, loss=2.26e+13]
Epoch [3/10]: 100%|██████████| 805/805 [03:51<00:00,  3.47it/s, loss=9.39e+12]
Epoch [4/10]: 100%|██████████| 805/805 [03:51<00:00,  3.47it/s, loss=4.76e+12]
Epoch [5/10]: 100%|██████████| 805/805 [03:52<00:00,  3.46it/s, loss=2.8e+12]
Epoch [6/10]: 100%|██████████| 805/805 [03:51<00:00,  3.48it/s, loss=1.29e+12]
Epoch [7/10]: 100%|██████████| 805/805 [03:51<00:00,  3.48it/s, loss=9.81e+11]
Epoch [8/10]: 100%|██████████| 805/805 [03:52<00:00,  3.47it/s, loss=4.61e+11]
Epoch [9/10]: 100%|██████████| 805/805 [03:53<00:00,  3.45it/s, loss=2.95e+11]
Epoch [10/10]: 100%|██████████| 805/805 [03:52<00:00,  3.47it/s, loss=1.66e+11]


✅ Model saved as 'yolov1_face.pt'
