In [None]:
import os
import torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from PIL import Image, ImageDraw
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import random

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device used ", DEVICE)
CLASSES = ['__background__', 'pothole']
NUM_CLASSES = len(CLASSES)  # 2

# 1 Model Initialization

In [None]:
class PotholeDataset(Dataset):
    def __init__(self, csv_file, image_dir, limit=None):
        self.df = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.image_files = self.df['filename'].unique()
        if limit:
            self.image_files = self.image_files[:limit]
        self.transform = transforms.Compose([
            transforms.Resize((640, 640)),
            transforms.ToTensor()
        ])

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        fn = self.image_files[idx]
        path = os.path.join(self.image_dir, fn)
        img = Image.open(path).convert("RGB")
        img = img.resize((640, 640))
        w, h = img.size
        
        img_tensor = transforms.ToTensor()(img)

        rec = self.df[self.df['filename'] == fn]
        boxes = rec[['xmin','ymin','xmax','ymax']].values.astype(np.float32)

        boxes[:, [0,2]] /= w   # x coords
        boxes[:, [1,3]] /= h   # y coords

        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.ones(len(rec), dtype=torch.int64)

        return img_tensor, {'boxes': boxes, 'labels': labels}


In [128]:
def get_loaders(batch_size=4, train_limit=None, valid_limit=100, test_limit=100):
    datasets = {
        'train': PotholeDataset("dataset/train/_annotations.csv", "dataset/train/images", limit=train_limit),
        'valid': PotholeDataset("dataset/valid/_annotations.csv", "dataset/valid/images", limit=valid_limit),
        'test':  PotholeDataset("dataset/test/_annotations.csv", "dataset/test/images", limit=test_limit),
    }

    loaders = {
        split: DataLoader(datasets[split], batch_size=batch_size, shuffle=(split == 'train' or split == 'valid' or split == 'test'),
                          collate_fn=lambda x: tuple(zip(*x)))
        for split in datasets
    } 
    return loaders

In [None]:
class SimpleObjectDetector(nn.Module):
    def __init__(self, num_preds=20):
        super().__init__()
        self.num_preds = num_preds
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),   # 640→320
            nn.Conv2d(32,64,3, padding=1), nn.ReLU(), nn.MaxPool2d(2),    # 320→160
            nn.Conv2d(64,128,3,padding=1), nn.ReLU(), nn.MaxPool2d(2),    # 160→80
            nn.Conv2d(128,256,3,padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d((1,1))                                 
        )
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256, 512), nn.ReLU(),
            nn.Linear(512, num_preds * 5) 
        )

    def forward(self, x):
        x = self.backbone(x)               
        x = self.head(x)                  
        return x.view(-1, self.num_preds, 5)

In [None]:
transform = transforms.Compose([
        transforms.Resize((640,640)),
        transforms.ToTensor()
    ])

base = "dataset"
train_csv = os.path.join(base, "train", "_annotations.csv")
val_csv   = os.path.join(base, "valid",   "_annotations.csv")
test_csv  = os.path.join(base, "test",  "_annotations.csv")
train_dir = os.path.join(base, "train", "images")
val_dir   = os.path.join(base, "valid",   "images")
test_dir  = os.path.join(base, "test",  "images")

train_ds = PotholeDataset("dataset/train/_annotations.csv", "dataset/train/images", limit=None),
val_ds   = PotholeDataset("dataset/valid/_annotations.csv", "dataset/valid/images", limit=None)
test_ds  = PotholeDataset("dataset/test/_annotations.csv", "dataset/test/images", limit=None)

# 2 Model Training

In [None]:
def detection_loss(preds, targets, λ_box=5.0, λ_conf=1.0):
    B, P, _ = preds.shape
    pred_boxes = preds[..., :4]
    pred_conf_logits = preds[..., 4]

    padded_boxes, padded_conf = [], []
    for t in targets:
        gt = t['boxes'].to(preds.device)
        M = gt.size(0)
        if M < P:
            pad_b = torch.zeros((P-M,4), device=gt.device)
            boxes = torch.cat([gt, pad_b], dim=0)
            conf  = torch.cat([torch.ones(M,device=gt.device),
                               torch.zeros(P-M,device=gt.device)])
        else:
            boxes = gt[:P]
            conf  = torch.ones(P, device=gt.device)

        padded_boxes.append(boxes)
        padded_conf.append(conf)
        
    true_boxes = torch.stack(padded_boxes)
    true_conf  = torch.stack(padded_conf)
    loss_box  = F.smooth_l1_loss(pred_boxes, true_boxes)
    loss_conf = F.binary_cross_entropy_with_logits(pred_conf_logits, true_conf)
    return λ_box * loss_box + λ_conf * loss_conf


def filter_predictions(pred, conf_thresh=0.3):
    boxes  = pred[:, :4]
    scores = torch.sigmoid(pred[:, 4])
    mask   = scores > conf_thresh

    return boxes[mask], scores[mask]

In [None]:
def train_custom(model, loader, optimizer, device, epochs=10):
    train_losses = []
    model.to(device).train()
    for ep in range(epochs):
        running = 0.0
        for imgs, targets in loader:
            imgs = torch.stack([img.to(device) for img in imgs])
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            preds = model(imgs)
            loss = detection_loss(preds, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running += loss.item()
        avg = running / len(loader)
        train_losses.append(avg)
        print(f"[Custom] Epoch {ep+1}/{epochs} loss: {avg:.4f}")
    return train_losses


In [None]:
loaders   = get_loaders(batch_size=4, train_limit=500)
model     = SimpleObjectDetector(num_preds=20).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
epochs = 5
cnn_losses = train_custom(model, loaders['train'], optimizer, DEVICE, epochs=epochs)

In [None]:

import cv2
import matplotlib.pyplot as plt
def visualizeFasterCustomCNNredictions(
    model, dataloader, device, num_images=10, score_threshold=0.85,
    output_pdf_path="predicted_vs_gt_custom_cnn.pdf"
):
    if os.path.exists(output_pdf_path):
        os.remove(output_pdf_path)
        print("File deleted.")

    model.eval()

    images_visualized = 0
    with PdfPages(output_pdf_path) as pdf:
        with torch.no_grad():
            for imgs, targets in dataloader:
                imgs = torch.stack([img.to(DEVICE) for img in imgs])
                targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
                preds = model(imgs)
                imgs = list(img.to(device) for img in imgs)
                for i in range(len(preds)):

                    pb, ps = filter_predictions(preds[i], conf_thresh=0.3)
                    pb[:, [0,2]] *= 640
                    pb[:, [1,3]] *= 640
                    img_np = imgs[i].permute(1, 2, 0).cpu().numpy()
                    fig, axes = plt.subplots(1, 2, figsize=(14, 7))

                    axes[0].imshow(img_np)
                    axes[0].set_title("Ground Truth")
                    for box in targets[i]['boxes']:
                        x1, y1, x2, y2 = [coord.item()*640 for coord in box]
                        axes[0].add_patch(plt.Rectangle(
                            (x1, y1), x2 - x1, y2 - y1,
                            edgecolor='green', fill=False, linewidth=0.5
                        ))
                    axes[0].axis('off')

                    # prediction
                    axes[1].imshow(img_np)
                    axes[1].set_title("Predictions")
                    for box in pb:
                        print("---box:", box)
                        x1, y1, x2, y2 = map(float, box)
                        axes[1].add_patch(plt.Rectangle(
                            (x1, y1), x2 - x1, y2 - y1,
                            edgecolor='red', fill=False, linewidth=0.5
                        ))
                    axes[1].axis('off')

                    pdf.savefig(fig)
                    plt.close(fig)

                images_visualized += 1
                if images_visualized >= num_images:
                    plt.figure(figsize=(8, 5))
                    plt.plot([ep+1 for ep in range(epochs)], cnn_losses, label='Box Loss', color='blue')
                    plt.xlabel("Epoch")
                    plt.ylabel("Loss")
                    plt.title("Training Box Loss")
                    plt.legend()
                    plt.grid(True)
                    pdf.savefig()
                    plt.close()

                    plt.figure(figsize=(8, 5))
                    plt.axis('off')
                    plt.title("Evaluation Metrics", fontsize=14)
                    plt.text(0, 0.8, metrics_text, fontsize=12, verticalalignment='top')
                    pdf.savefig()
                    plt.close()
                    print(f"PDF report saved as {output_pdf_path}")
                    return


In [135]:

visualizeFasterCustomCNNredictions(model, loaders['valid'], DEVICE, output_pdf_path="predicted_vs_gt_custom_cnn.pdf", num_images=10)

File deleted.
__________________________________
---box: tensor([304.7130, 360.6023, 386.9023, 422.9527], device='cuda:0')
---box: tensor([173.5348, 226.7359, 190.5000, 235.4550], device='cuda:0')
__________________________________
---box: tensor([294.8817, 347.2666, 374.5795, 407.1741], device='cuda:0')
---box: tensor([166.9530, 217.5149, 184.4905, 226.9887], device='cuda:0')
__________________________________
---box: tensor([302.0647, 356.9124, 383.3731, 418.3499], device='cuda:0')
---box: tensor([171.6808, 224.0974, 188.7456, 233.0272], device='cuda:0')
__________________________________
---box: tensor([307.3283, 364.1050, 390.0472, 426.9258], device='cuda:0')
---box: tensor([175.2378, 229.1114, 191.9358, 237.6913], device='cuda:0')
---images_visualized: 1
__________________________________
---box: tensor([298.2651, 351.8326, 378.7741, 412.5595], device='cuda:0')
---box: tensor([169.2688, 220.6312, 186.4254, 229.9137], device='cuda:0')
__________________________________
---box: tens