In [145]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [146]:
img_size=[656,208]
device = torch.device('cuda')

In [147]:
anchors=torch.tensor([[0.026687,  0.048969 ],
 [0.106309,  0.146625 ],
 [0.194006 , 0.330219 ],
 [0.057043  ,0.0817975],
 [0.009562,  0.243062 ],
 [0.041746 , 0.255281 ]]).to(device)


In [148]:
anchors.shape


torch.Size([6, 2])

In [149]:
import timm
darknet53 = timm.create_model('darknet53', pretrained=False, features_only=True)

In [150]:
darknet53.load_state_dict(torch.load("darknet53_feat.pth"))

  darknet53.load_state_dict(torch.load("darknet53_feat.pth"))


<All keys matched successfully>

In [151]:
class YOLO(nn.Module):
    def __init__(self,num_classes=9,anchor=6,freeze_backbone=True):
        super(YOLO, self).__init__()
        self.prediction=anchor*(num_classes+5)
        self.backbone=darknet53    
        
        if freeze_backbone:
            for param in self.backbone.parameters():
                param.requires_grad = False
        self.ml_pred=nn.Conv2d(256, self.prediction, kernel_size=1, stride=1)
        self.dark_conv=nn.Conv2d(512, 256, kernel_size=1, stride=1)

    def forward(self, x):
        
        out=self.backbone.forward(x)
        out=(self.dark_conv(out[4]))
        return torch.sigmoid(self.ml_pred(out))
    

In [152]:
x=torch.randn(1, 3, 656, 208)
model=YOLO(num_classes=9)
print(model(x).shape)

torch.Size([1, 84, 41, 13])


In [153]:
def anchor_iou(anchors,targets,img_size=[656,208]):
    W, H = img_size

    # Convert normalized sizes to pixel space
    target_w = (targets[0] * W)
    target_h = (targets[1] * H)

    anchor_w = (anchors[:, 0] * W).unsqueeze(1)
    anchor_h = (anchors[:, 1] * H).unsqueeze(1)
    
    # print(target_w.shape,anchor_w.shape)
    inter_w = torch.min(target_w, anchor_w)
    inter_h = torch.min(target_h, anchor_h)
   
    inter_area = inter_w * inter_h
    
    union_area = (target_w * target_h) + (anchor_w * anchor_h)- inter_area        

    iou = inter_area / (union_area + 1e-6)
    best_anchor = torch.argmax(iou, dim=0)

    return best_anchor

In [154]:
ta=torch.tensor([5,10])
ta.shape

torch.Size([2])

In [155]:
def process_labels(target,batch_size,num_prediction=9):
    target = torch.tensor(target.to(device), dtype=torch.float32)  # shape: (N, 6)
    b, cls, x1, y1, x2, y2 = target.T

    # Initialize once
    batch_target = torch.zeros((batch_size, 6,14, 41,13), dtype=torch.float32)

    # Vectorize computation of coordinates
    W, H = img_size
    cell_w,cell_h=[656/41,208/13]
    cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
    grid_x, grid_y = (cx / cell_w), (cy / cell_h)
    tx, ty = (cx / cell_w)-grid_x, ((cy / cell_h))-grid_y
    w, h = (x2 - x1) / W, (y2 - y1) / H

    best_anchor = anchor_iou(anchors, torch.tensor([[w,h]]).to(device))
    
    # Assign everything in a batched manner
    # # print(b)

    
    tw = torch.log((w / (anchors[best_anchor].permute(1,0)[0])) + 1e-6)
    th = torch.log((h / (anchors[best_anchor].permute(1,0)[1])) + 1e-6)

    batch_target[b-1, best_anchor, 0, grid_x, grid_y] = 1
    batch_target[b-1, best_anchor,1, grid_x, grid_y] = tx
    batch_target[b-1, best_anchor,2, grid_x, grid_y] = ty
    batch_target[b-1, best_anchor,3, grid_x, grid_y] = tw
    batch_target[b-1, best_anchor,4, grid_x, grid_y] = th

    # One-hot for class predictions
    for i in range(len(cls)):
        batch_target[int(b[i]),best_anchor, 5 + int(cls[i]), grid_x[i], grid_y[i]] = 1
    # print(batch_target.shape)
    return batch_target

In [156]:
def bbox_iou(box1, box2,obj_idx,anchors, eps=1e-7):
    
    tpx, tpy, tpw, tph = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
    tx, ty, tw, th = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
    W,H=img_size
    pw = anchors[obj_idx[:, 1],0]  # assuming obj_idx[:,2] gives anchor index
    ph = anchors[obj_idx[:, 1],1]
    w = pw * torch.exp(tpw)
    h = ph * torch.exp(tph)
    w*=W
    h*=H

    cpx=torch.sigmoid(tpx)*16 + obj_idx[:,2]*16
    cpy=torch.sigmoid(tpy)*16 + obj_idx[:,3]*16
    

    target_w = pw * torch.exp(tw)*W
    target_h = ph * torch.exp(th)*H
    target_cx = (tx + obj_idx[:,2]) * 16
    target_cy = (ty + obj_idx[:,3]) * 16

    box1 =  [cpx-w/2, cpy-h/2, cpx+w/2, cpy+h/2]
    box2 =  [target_cx-target_w/2, target_cy-target_h/2, target_cx+target_w/2, target_cy+target_h/2]

    xA = torch.max(box1[0], box2[0])
    yA = torch.max(box1[1], box2[1])

    xB = torch.min(box1[2], box2[2])
    yB = torch.min(box1[3], box2[3])
    
    # union
    area_intersection = torch.max(xB-xA, torch.zeros(xB.shape, dtype=xB.dtype, device=xB.device)) * torch.max(yB-yA, torch.zeros(yB.shape, dtype=yB.dtype, device=yB.device))

    area_union = (box1[2]-box1[0]) * (box1[3]-box1[1]) + (box2[2]-box2[0]) * (box2[3]-box2[1]) - area_intersection +eps

    iou = torch.clamp(area_intersection / (area_union ), 0, 1)
    return iou

In [157]:
def yolo_loss(output, target,anchors, num_prediction=9):
    batch_size=output.shape[0]
    new_target=torch.tensor(process_labels(target,batch_size)).to(device) #-->(B,6,5+num_pred,41,13)
    # new_target=torch.randn([1,6,14,41,13])
    print(new_target.shape)
    object_idx=new_target[:,:,0,:,:].nonzero()
    print(object_idx.shape)
    no_object_idx=(new_target[:,:,0,:,:]==0).nonzero()
    new_output=output.view(batch_size,6,14,41,13)
    
    

    target_obj=new_target[object_idx[:,0],object_idx[:,1],:,object_idx[:,2],object_idx[:,3]]
    predicted_object = new_output[object_idx[:,0],object_idx[:,1],:,object_idx[:,2],object_idx[:,3]]
    
    print(predicted_object.shape)
    iou=bbox_iou(predicted_object[:,1:5],target_obj[:,1:5],object_idx,anchors)

    predicted_no_object = new_output[no_object_idx[:,0],no_object_idx[:,1],:,no_object_idx[:,2],no_object_idx[:,3]]

    localization_loss = torch.sum((target_obj[:,1]-predicted_object[:,1])**2 +
                                  (target_obj[:,2]-predicted_object[:,2])**2)

    bbox_loss         = torch.sum((torch.sqrt(target_obj[:,3])-torch.sqrt(predicted_object[:,3]))**2 +
                                  (torch.sqrt(target_obj[:,4])-torch.sqrt(predicted_object[:,4]))**2)
    
    pc_loss = torch.sum((target_obj[:,5:]-predicted_object[:,5:])**2)
    
    ### IOU 
    iou=bbox_iou(predicted_object[:,1:5],target_obj[:,1:5],object_idx)

    obj_conf_loss=torch.sum((iou.detach()-predicted_object[:,0])**2)
    nobj_confidence_loss= torch.sum((0 - predicted_no_object[:, 0])**2)

    loss = 5.0*(localization_loss + bbox_loss) + obj_conf_loss + 0.5*nobj_confidence_loss + pc_loss
    # print(localization_loss,obj_conf_loss)
    return loss



In [158]:
def collate_fn(batch):
    imgs = []
    targets = []
    for i, (img, labels) in enumerate(batch):
        imgs.append(img)

        if labels.numel() > 0:
            # add batch index column
            batch_idx = torch.full((labels.shape[0], 1), i) 
            labels = torch.cat((batch_idx, labels), dim=1)
            targets.append(labels)

    imgs = torch.stack(imgs)
    if len(targets):
        targets = torch.cat(targets, dim=0)
    else:
        targets = torch.zeros((0, 6))
    return imgs, targets

In [159]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch
import os
from torchvision import transforms

class YOLODataset(Dataset):
    def __init__(self, img_dir, label_dir, img_size=(640,192), transform=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.img_files = [f for f in os.listdir(img_dir) if f.endswith(".jpg")]
        self.img_size = img_size
        self.transform = transforms.ToTensor()

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        img_name = self.img_files[idx]
        img_path = os.path.join(self.img_dir, img_name)
        label_path = os.path.join(self.label_dir, img_name.replace(".jpg", ".txt"))

        # --- Load and normalize image ---
        image = Image.open(img_path).convert("RGB")
        image = self.transform(image)  # shape: [3, H, W], range [0,1]
        image=image.permute(0,2,1)
        # --- Load YOLO labels ---
        boxes = []
        if os.path.exists(label_path):
            with open(label_path, "r") as f:
                for line in f:
                    parts = line.strip().split()
                    cls, x1, y1, x2, y2 = map(float, parts)
                    boxes.append([cls, x1, y1, x2, y2])

        if len(boxes) == 0:
            boxes = torch.zeros((0, 5), dtype=torch.float32)
        else:
            boxes = torch.tensor(boxes, dtype=torch.float32)

        return image, boxes
    
dataset = YOLODataset("../data/656x208/images", "../data/656x208/label")
loader = DataLoader(dataset, batch_size=32, shuffle=True,collate_fn=collate_fn)

In [160]:

import torch
import matplotlib.pyplot as plt
from torch.amp import autocast, GradScaler

# =========================
# TRAINING CONFIG
# =========================
num_epochs = 10

# --- Model / Loss / Optimizer ---
model = YOLO(num_classes=9).to('cuda')

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

model.train()

log_interval = 100  # print every N batches
scaler = GradScaler(device)

train_losses = []

print(f"Starting training on {device}")

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for batch_idx, (imgs, targets) in enumerate(loader):
        # -----------------------------
        # Move to GPU
        # -----------------------------
        imgs = imgs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)  # [batch_id, cls, x, y, w, h]

        optimizer.zero_grad(set_to_none=True)

        # -----------------------------
        # Forward + Loss (with AMP)
        # -----------------------------
        with autocast("cuda"):
            out = model(imgs)
            # print(imgs.shape)
            # print(targets.shape) #([N, 6]) -> (B, cls,cx,cy,w,h)
            # print(out.shape)
            
        loss=yolo_loss(out,targets,anchors)
        # # -----------------------------
        # # Backward + Step
        # # -----------------------------
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # # -----------------------------
        # # Logging
        # # -----------------------------
        epoch_loss += loss.item()

        if batch_idx % log_interval == 0:
            print(
                f"[Epoch {epoch+1}/{num_epochs}] "
                f"[Batch {batch_idx}/{len(loader)}] "
                f"Loss: {loss.item():.4f}"
            )

        # # cleanup
        # del imgs, targets, out
        torch.cuda.empty_cache()

    # -----------------------------
    # Epoch summary
    # -----------------------------
    avg_loss = epoch_loss / len(loader)
    train_losses.append(avg_loss)
    print(f"Epoch [{epoch+1}/{num_epochs}] Avg Loss: {avg_loss:.4f}")

# =========================
# PLOT LOSS CURVE
# =========================
plt.figure(figsize=(8,5))
plt.plot(range(1, num_epochs + 1), train_losses, marker='o', linewidth=2)
plt.title("Training Loss Curve")
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.grid(True)
plt.show()


Starting training on cuda


  target = torch.tensor(target.to(device), dtype=torch.float32)  # shape: (N, 6)


ValueError: only one element tensors can be converted to Python scalars

In [None]:
# import torch
# from PIL import Image, ImageDraw, ImageFont
# import matplotlib.pyplot as plt
# import os
# import random

# # ======= CONFIG =======
# IMG_PATH = "./data/640x192/images"   # change this

# NUM_CLASSES = 9
# S = 20  # your grid size

# IMG_SIZE = (640, 192)

# # ======= LOAD MODEL =======
# device = "cuda" if torch.cuda.is_available() else "cpu"

# model.eval()

# # ======= PREPROCESS IMAGE =======


# IMG_DIR = "../data/640x192/images"
# device = "cuda" if torch.cuda.is_available() else "cpu"

# # pick a random image file
# img_name = random.choice([f for f in os.listdir(IMG_DIR) if f.endswith((".jpg", ".png"))])
# # img_name='000002.jpg'
# img_path = os.path.join(IMG_DIR, img_name)

# # open and preprocess image
# img = Image.open(img_path).convert("RGB")
# img_tensor = (
#     torch.from_numpy(np.array(img)).permute(2, 0, 1).float() / 255.0
# ).unsqueeze(0).to(device) 

# img_tensor=img_tensor.permute(0,1,3,2).to('cpu')
# # ======= INFERENCE =======
# with torch.no_grad():
#     preds = model(img_tensor)[0]  # (C, S, S)

# conf = preds[0]        # confidence map
# x = preds[1]
# y = preds[2]
# w = preds[3]
# h = preds[4]
# cls_scores = torch.softmax(preds[5:], dim=0)

# # ======= DECODE BOXES =======
# boxes = []
# for i in range(40):
#     for j in range(12):
#         if conf[i, j] > 0.02:
#             cx = (i + x[i, j]) * (640/40)
#             cy = (j + y[i, j]) * (192/12)
#             bw = w[i, j] * IMG_SIZE[0]
#             bh = h[i, j] * IMG_SIZE[1]

#             x1 = cx - bw / 2
#             y1 = cy - bh / 2
#             x2 = cx + bw / 2
#             y2 = cy + bh / 2

#             cls_idx = torch.argmax(cls_scores[:, i, j]).item()
#             boxes.append((x1, y1, x2, y2, cls_idx, conf[i, j].item()))

# # # ======= VISUALIZE =======
# # draw = ImageDraw.Draw(img)
# # for (x1, y1, x2, y2, cls, conf) in boxes:
# #     draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
# #     draw.text((x1, y1 - 10), f"{cls}:{conf:.6f}", fill="yellow")

# plt.figure(figsize=(20, 6))
# plt.imshow(img)
# plt.axis("off")
# plt.show()

# import matplotlib.pyplot as plt

# print(preds.shape)

# plt.figure(figsize=(10, 6))
# plt.imshow(preds[0,:].permute(1,0).cpu(), cmap='inferno', interpolation='nearest')
# plt.colorbar(label='Confidence')
# plt.title("Confidence Map (40Ã—12 grid)")
# plt.show()
