In [2]:
!pip install ultralytics --quiet
from tqdm import tqdm
import os
import timm, torch
import torchvision
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.functional import cross_entropy
from torch.utils import data
from torch.utils.data import DataLoader
from torchvision.ops import batched_nms
import numpy
import math
import cv2
import random
from PIL import Image
import copy
from time import time
import yaml
from pathlib import Path
from dataset import *
from util import *

In [None]:
class Conv(nn.Module):
    """Basic building block: bias-free Conv2d, then BatchNorm2d, then SiLU.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the convolution.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Zero padding applied by the convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super().__init__()
        # No bias on the convolution: BatchNorm directly after it has its own shift term.
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(num_features=out_channels, eps=0.001, momentum=0.03)
        self.act = nn.SiLU(inplace=True)

    def forward(self, x):
        """Apply convolution, normalisation and activation in that order."""
        features = self.conv(x)
        normalized = self.bn(features)
        return self.act(normalized)
    
class Bottleneck(nn.Module):
    """Two stacked 3x3 ``Conv`` blocks with an optional residual connection.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by both convolutions.
        shortcut: Request a residual connection around the two convolutions.
            It is only applied when ``in_channels == out_channels``; otherwise
            the element-wise add would fail on mismatched shapes.
    """

    def __init__(self, in_channels, out_channels, shortcut=True):
        super().__init__()
        self.cv1 = Conv(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.cv2 = Conv(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        # The residual add needs matching channel counts.
        self.add = bool(shortcut) and in_channels == out_channels

    def forward(self, x):
        """Return ``cv2(cv1(x))``, plus ``x`` when the shortcut is active."""
        out = self.cv2(self.cv1(x))
        # Out-of-place add keeps the activation output untouched for autograd.
        return x + out if self.add else out
        

class C2f(nn.Module):
    """Split-and-fuse block: 1x1 conv, channel split, chained bottlenecks, 1x1 fuse.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels of the returned feature map.
        num_bottlenecks: Number of ``Bottleneck`` modules chained on the first half.
        shortcut: Forwarded to every ``Bottleneck``.
    """

    def __init__(self, in_channels, out_channels, num_bottlenecks, shortcut=True):
        super().__init__()
        self.mid_channels = out_channels // 2
        self.num_bottlenecks = num_bottlenecks
        self.cv1 = Conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        # The fuse conv sees both halves plus one half-width map per bottleneck.
        fused_channels = (num_bottlenecks + 2) * out_channels // 2
        self.cv2 = Conv(fused_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.m = nn.ModuleList(
            [Bottleneck(self.mid_channels, self.mid_channels, shortcut) for _ in range(num_bottlenecks)]
        )
        self.add = shortcut

    def forward(self, x):
        """Run the block and return a map with ``out_channels`` channels."""
        x = self.cv1(x)
        half = x.shape[1] // 2
        first, second = x[:, :half, :, :], x[:, half:, :, :]

        # NOTE(review): the first half feeds the bottleneck chain and each result is
        # placed in FRONT of the earlier ones. The reference Ultralytics C2f appears to
        # chain on the second half and append instead — confirm this ordering is
        # intended when pretrained weights are loaded into this module.
        chained = []
        current = first
        for idx in range(self.num_bottlenecks):
            current = self.m[idx](current)
            chained.append(current)

        # Newest bottleneck output first, then the two untouched halves.
        stacked = torch.cat(chained[::-1] + [first, second], dim=1)
        return self.cv2(stacked)

    
class SPPF(nn.Module):
    """Spatial pyramid pooling (fast variant): one max-pool applied three times in a chain.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels of the returned feature map.
        kernel_size: Window size of the max-pool.
    """

    def __init__(self, in_channels, out_channels, kernel_size=5):
        super().__init__()
        hidden_channels = in_channels // 2
        # 1x1 conv halves the channels before pooling, so the four-way concat
        # below ends up with 4 * hidden_channels channels.
        self.cv1 = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, padding=0)
        # stride=1 with padding=kernel_size//2 keeps the spatial size unchanged (for an
        # odd kernel), which is what allows the pooled maps to be concatenated.
        self.pool = nn.MaxPool2d(
            kernel_size=kernel_size,
            stride=1,
            padding=kernel_size // 2,
            dilation=1,
            ceil_mode=False,
        )
        self.cv2 = Conv(4 * hidden_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        """Concatenate the reduced map with three successively pooled copies, then fuse."""
        pyramid = [self.cv1(x)]
        # Each pass pools the previous result, so the effective window grows every step.
        for _ in range(3):
            pyramid.append(self.pool(pyramid[-1]))
        return self.cv2(torch.cat(pyramid, dim=1))


class Concat(nn.Module):
    """Module wrapper around ``torch.cat`` along a fixed dimension."""

    def __init__(self, dim=1):
        super().__init__()
        self.dim = dim

    def forward(self, xs):
        """Concatenate ``xs`` (a tuple or list of tensors) along ``self.dim``."""
        joined = torch.cat(xs, dim=self.dim)
        return joined



class DFL(nn.Module):
    """Turns per-side bin logits into an expected bin index.

    A frozen 1x1 convolution whose weights are ``0 .. ch-1`` computes the
    softmax-weighted mean of the bin indices for each of the four box sides.

    Args:
        ch: Number of bins per box side.
    """

    def __init__(self, ch=16):
        super().__init__()
        self.ch = ch
        projection = nn.Conv2d(in_channels=ch, out_channels=1, kernel_size=1, stride=1, padding=0, bias=False)
        projection.requires_grad_(False)  # fixed weights, never trained
        self.conv = projection

        bin_index = torch.arange(self.ch, dtype=torch.float)
        self.conv.weight.data.copy_(bin_index.view(1, self.ch, 1, 1))

    def forward(self, x):
        """Map ``[B, 4*ch, A]`` logits to ``[B, 4, A]`` expected bin indices."""
        batch, _, anchors = x.shape
        # [B, 4, ch, A] -> [B, ch, 4, A] so the bins sit on the conv's channel axis.
        per_side = x.view(batch, 4, self.ch, anchors).transpose(1, 2)
        probabilities = per_side.softmax(1)
        expected = self.conv(probabilities)  # [B, 1, 4, A]
        return expected.view(batch, 4, anchors)


class Detect(nn.Module):
    """Detection head: one box branch and one class branch per feature level.

    Args:
        ch: Number of distribution bins per box side (used by the box branch and DFL).
        nc: Number of classes.

    Attributes:
        box_ch: Box-branch output channels (``4 * ch``).
        no: Outputs per location (``box_ch + nc``).
        stride: Stride assigned to each of the three feature levels.
    """

    def __init__(self, ch=16, nc=4):
        super().__init__()
        self.ch = ch                         # dfl channels (bins per box side)
        self.box_ch = self.ch * 4            # 4 sides x ch bins
        self.nc = nc                         # number of classes
        self.no = self.box_ch + self.nc      # outputs per location
        self.stride = torch.tensor([8., 16., 32])
        w, r = 1 / 4, 2.0                    # same width / ratio multipliers the YOLO body uses

        # Channel counts of the three maps handed over by the neck.
        level_channels = (int(256 * w), int(512 * w), int(512 * w * r))

        # Box regression branches.
        self.cv2 = nn.ModuleList([self._make_branch(c, self.box_ch) for c in level_channels])
        # Classification branches.
        self.cv3 = nn.ModuleList([self._make_branch(c, self.nc) for c in level_channels])

        # Built with the configured bin count so it always matches box_ch.
        self.dfl = DFL(self.ch)

    @staticmethod
    def _make_branch(in_channels, out_channels):
        """Build one head branch: two 3x3 ``Conv`` blocks followed by a plain 1x1 conv."""
        return nn.Sequential(
            Conv(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            Conv(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.Conv2d(out_channels, out_channels, kernel_size=1, stride=1, padding=0),
        )

    def forward(self, x):
        """Return the raw head maps for the three feature levels.

        Args:
            x: Sequence of three feature maps, each ``[B, C_i, H_i, W_i]``.

        Returns:
            List of three tensors, each ``[B, 4*ch + nc, H_i, W_i]`` (box logits then
            class logits). No decoding happens here; ``YOLO._decode_for_nms`` turns
            these maps into boxes and confidences.
        """
        outs = []
        for i in range(len(self.cv2)):
            box = self.cv2[i](x[i])     # [B, 4*ch, H, W]
            cls = self.cv3[i](x[i])     # [B, nc, H, W]
            outs.append(torch.cat((box, cls), 1))
        return outs

    def make_anchors(self, x, strides, offset=0.5):
        """Build cell-centre anchor points and a matching per-point stride column.

        Args:
            x: Sequence of feature maps ``[B, C, H, W]``; only H, W, dtype and device are read.
            strides: One stride per feature map.
            offset: Added to every grid index (0.5 puts the point at the cell centre).

        Returns:
            ``(anchors, stride_tensor)`` of shapes ``[N, 2]`` (x, y in grid units) and
            ``[N, 1]``, where ``N`` is the sum of ``H*W`` over all levels, ordered
            row-major within each level.
        """
        assert x is not None
        anchor_tensor, stride_tensor = [], []
        dtype, device = x[0].dtype, x[0].device
        for i, stride in enumerate(strides):
            _, _, h, w = x[i].shape
            sx = torch.arange(end=w, device=device, dtype=dtype) + offset
            sy = torch.arange(end=h, device=device, dtype=dtype) + offset
            # Explicit "ij" indexing: same result as the legacy default, without the warning.
            sy, sx = torch.meshgrid(sy, sx, indexing="ij")
            anchor_tensor.append(torch.stack((sx, sy), -1).view(-1, 2))
            stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device))
        return torch.cat(anchor_tensor), torch.cat(stride_tensor)
    


class YOLO(nn.Module):
    def __init__(self, task=None, verbose=False, in_channels=3):
        super().__init__()
        d, w, r = (1/3,1/4,2.0)
        self.predictor = None  # reuse predictor
        self.model = None  # model object
        self.trainer = None  # trainer object
        self.ckpt = {}  # if loaded from *.pt
        self.cfg = None  # if loaded from *.yaml
        self.ckpt_path = None
        self.overrides = {}  # overrides for trainer object
        self.metrics = None  # validation/training metrics
        self.session = None  # HUB session
        self.task = task  # task type
        self.model_name = None  # model name
        self.model = nn.ModuleList([
            # backbone
            Conv(in_channels, int(64*w), kernel_size=3, stride=2, padding=1),              #0
            Conv(int(64*w), int(128*w), kernel_size=3, stride=2, padding=1),               #1
            C2f(int(128*w), int(128*w), num_bottlenecks=int(3*d), shortcut=True),          #2
            Conv(int(128*w), int(256*w), kernel_size=3, stride=2, padding=1),              #3
            C2f(int(256*w), int(256*w), num_bottlenecks=int(6*d), shortcut=True),          #4
            Conv(int(256*w), int(512*w), kernel_size=3, stride=2, padding=1),              #5
            C2f(int(512*w), int(512*w), num_bottlenecks=int(6*d), shortcut=True),          #6
            Conv(int(512*w), int(512*w*r), kernel_size=3, stride=2, padding=1),            #7
            C2f(int(512*w*r), int(512*w*r), num_bottlenecks=int(3*d), shortcut=True),      #8
            SPPF(int(512*w*r), int(512*w*r)),                                              #9

            # neck
            nn.Upsample(scale_factor=2, mode='nearest'),                                   #10
            Concat(),                                                                      #11
            C2f(int(512*w*(1+r)), int(512*w), num_bottlenecks=int(3*d), shortcut=False),   #12
            nn.Upsample(scale_factor=2, mode='nearest'),                                   #13
            Concat(),                                                                      #14
            C2f(int(768*w), int(256*w), num_bottlenecks=int(3*d), shortcut=False),         #15
            Conv(int(256*w), int(256*w), kernel_size=3, stride=2, padding=1),              #16
            Concat(),                                                                      #17
            C2f(int(768*w), int(512*w), num_bottlenecks=int(3*d), shortcut=False),         #18
            Conv(int(512*w), int(512*w), kernel_size=3, stride=2, padding=1),              #19
            Concat(),                                                                      #20
            C2f(int(512*w*(1+r)), int(512*w*r), num_bottlenecks=int(3*d), shortcut=False), #21

            # head
            Detect(),                                                                      #22
        ])
        # Delete super().training for accessing self.model.training
        del self.training

    def forward(self, x):
        # backbone forward
        x = self.model[0](x)
        x = self.model[1](x)
        x = self.model[2](x)
        x = self.model[3](x)
        out1 = self.model[4](x) # for concat
        x = self.model[5](out1)
        out2 = self.model[6](x) # for concat
        x = self.model[7](out2)
        x = self.model[8](x)
        out3 = self.model[9](x)

        # neck forward
        res_1 = out3 # for residual connection
        x = self.model[10](out3)
        x = self.model[11]((x, out2))
        res_2 = self.model[12](x) # for concat
        x = self.model[13](res_2)
        x = self.model[14]((x, out1))
        x1 = self.model[15](x) # for detect
        x = self.model[16](x1)
        x = self.model[17]((x, res_2))
        x2 = self.model[18](x) # for detect
        x = self.model[19](x2)
        x = self.model[20]((x, res_1))
        x3 = self.model[21](x) # for detect

        return self.model[22]([x1,x2,x3])
    
    def _decode_for_nms(self, outputs):
        detect_module = self.model[22]          # your Detect head
        B = outputs[0].shape[0]
        no = detect_module.no                   # box_ch + nc

        # (B, no, N) by flattening each head map safely
        x = torch.cat([o.reshape(B, -1, no).permute(0, 2, 1) for o in outputs], dim=2)

        # split l,t,r,b distribution and class logits
        boxes_dist, scores = x.split((detect_module.box_ch, detect_module.nc), dim=1)

        # anchors:(N,2), strides:(N,) → make broadcastable
        anchors, strides = detect_module.make_anchors(outputs, detect_module.stride)
        anchors = anchors.transpose(0, 1).unsqueeze(0)   # (1,2,N)
        strides = strides.view(1, 1, -1)                 # (1,1,N)

        # DFL → (B,4,N) of l,t,r,b (grid/anchor units)
        dist = detect_module.dfl(boxes_dist)
        lt, rb = dist.split(2, dim=1)                    # (B,2,N), (B,2,N)

        # XYXY in pixels of the network input (after resize/letterbox)
        xyxy = torch.cat((anchors - lt, anchors + rb), dim=1) * strides  # (B,4,N)

        # confidences (if your head has objectness, multiply it in here)
        conf = scores.sigmoid()                          # (B,nc,N)

        return torch.cat((xyxy, conf), dim=1)            # (B, 4+nc, N)

    def train(self, mode: bool | None = None, **ultra_kwargs):
        # 1) Pure PyTorch toggle if mode is given and no kwargs
        if (mode is not None) and (not ultra_kwargs):
            return super().train(mode)

        # 2) Ultralytics-style API when kwargs are provided
        if ultra_kwargs:
            return self._ultra_train(**ultra_kwargs)

        # 3) Default: behave like model.train(True)
        return super().train(True)

    # ---- the high-level trainer ----
    def _ultra_train(
        self,
        data: str,
        epochs: int = 100,
        imgsz: int = 640,
        batch: int = 16,
        name: str = "exp",
        project: str = "runs/train",
        device: int | str = 0,
        patience: int = 50,
        cos_lr: bool = True,
        lr: float = 5e-4,
        weight_decay: float = 5e-4,
        workers: int = 8,
        amp: bool = True,
        grad_clip: float | None = 10.0,
    ):
        if isinstance(device, (int, str)) and str(device).isdigit() and torch.cuda.is_available():
            device = torch.device(f"cuda:{device}")
        else:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)

        run_dir = os.path.join(project, name)
        os.makedirs(run_dir, exist_ok=True)
        best_path, last_path = os.path.join(run_dir, "best.pt"), os.path.join(run_dir, "last.pt")

        with open(data, "r") as f:
            cfg = yaml.safe_load(f)
        nc = cfg.get("nc", len(cfg["names"]))
        names = cfg.get("names")

        base = Path(data)
        p = Path(cfg['train'])
        newP = (base / p).resolve()

        train_ds = YoloDataset((base / Path(cfg["train"])).resolve(), imgsz=imgsz, names=names)           # <-- your class
        val_path = cfg.get("val")
        val_ds   = YoloDataset((base / Path(val_path)).resolve(), imgsz=imgsz, names=names) if val_path else None

        train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True,
                                  num_workers=workers, pin_memory=True,
                                  collate_fn=train_ds.collate_fn)
        val_loader = None
        if val_ds:
            val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False,
                                    num_workers=max(1, workers//2), pin_memory=True,
                                    collate_fn=val_ds.collate_fn, drop_last=False)

        def _gpu_mem_str(device):
            if device.type == "cuda":
                try:
                    m = torch.cuda.memory_reserved(device.index if device.index is not None else 0)
                except Exception:
                    m = torch.cuda.max_memory_allocated()
                return f"{m / (1024**3):>7.2f}G"
            return f"{0.0:>7.2f}G"
        
        class _EMAval:
            def __init__(self, beta=0.9): self.b=beta; self.v=None
            def upd(self,x): x=float(x); self.v=x if self.v is None else self.b*self.v+(1-self.b)*x; return self.v
        
        def _epoch_header():
            print(f"{'Epoch':>10} {'GPU_mem':>9} {'box_loss':>9} {'cls_loss':>9} {'dfl_loss':>9} {'Instances':>10} {'Size':>10}")

        # loss, opt, sched

        params = {
            'min_lr': 0.000100000000,
            'max_lr': 0.010000000000,
            'momentum': 0.9370000000,
            'weight_decay': 0.000500,
            'warmup_epochs': 3.00000,
            'box': 7.500000000000000,
            'cls': 0.500000000000000,
            'dfl': 1.500000000000000,
            'hsv_h': 0.0150000000000,
            'hsv_s': 0.7000000000000,
            'hsv_v': 0.4000000000000,
            'degrees': 0.00000000000,
            'translate': 0.100000000,
            'scale': 0.5000000000000,
            'shear': 0.0000000000000,
            'flip_ud': 0.00000000000,
            'flip_lr': 0.50000000000,
            'mosaic': 1.000000000000,
            'mix_up': 0.000000000000
        }
        params['nc'] = nc
        params['names'] = names
        
        criterion = ComputeLoss(self.model, params)
        optimizer = torch.optim.AdamW(self.parameters(), lr=lr, weight_decay=weight_decay)
        scheduler = (torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
                     if cos_lr else torch.optim.lr_scheduler.MultiStepLR(optimizer, [int(0.8*epochs)], gamma=0.1))
        scaler = torch.amp.GradScaler("cuda", enabled=(amp and device.type == "cuda"))

        # loop with early stopping on val loss
        best_val = float("inf"); bad_epochs = 0
        for epoch in range(epochs):
            super().train(True)  # PyTorch training mode
            _epoch_header()
            eb, ec, ed = _EMAval(), _EMAval(), _EMAval()
            epoch_loss = 0.0

            pbar = tqdm(enumerate(train_loader), total=len(train_loader), leave=True, ncols=120,
                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]")
            
            for i, (imgs, targets) in pbar:
                imgs = imgs.to(device, non_blocking=True).float() / 255.0
                instances = int(targets["cls"].numel())

                optimizer.zero_grad(set_to_none=True)
                with torch.autocast("cuda", dtype=torch.float16, enabled=(amp and device.type == "cuda")):
                    outputs = self(imgs)  # training path → 3 feature maps
                    box_loss, cls_loss, dfl_loss = criterion(outputs, targets)
                    loss = box_loss + cls_loss + dfl_loss

                scaler.scale(loss).backward()
                if grad_clip is not None:
                    scaler.unscale_(optimizer)
                    nn.utils.clip_grad_norm_(self.parameters(), grad_clip)
                scaler.step(optimizer)
                scaler.update()
                
                epoch_loss += loss.item()
                b, c, d = eb.upd(box_loss.item()), ec.upd(cls_loss.item()), ed.upd(dfl_loss.item())
                desc = f"{epoch+1:>7}/{epochs:<3} {_gpu_mem_str(device)} {b:>9.3f} {c:>9.3f} {d:>9.3f} {instances:>10d} {imgsz:>10d}:"
                pbar.set_description_str(desc)
    

            scheduler.step()
            train_loss = epoch_loss / max(1, len(train_loader))

            val_loss = train_loss
            if val_loader is not None:
                super().eval()
                val_total = 0.0
                # header above the val bar
                print((" " * 17) + "Class     Images  Instances      Box(P          R      mAP50  mAP50-95):", end=" ")
                
                vbar = tqdm(enumerate(val_loader), total=len(val_loader), leave=True, ncols=120,
                    bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]")

                with torch.no_grad():
                    for imgs, targets in val_loader:
                        imgs = imgs.to(device, non_blocking=True).float() / 255.0
                        outputs = self(imgs)  # eval forward (no grads)

                        vb, vc, vd = criterion(outputs, targets)  # same criterion & weights as train
                        val_total += (vb + vc + vd).item()

                val_loss = val_total / max(1, len(val_loader))

            if val_loader is not None:
                print((" " * 17) + "Class     Images  Instances      Box(P          R      mAP50  mAP50-95):", end=" ")

                iouv = torch.linspace(0.5, 0.95, 10, device=device)
                tp_list, conf_list, pcls_list, tcls_list = [], [], [], []

                with torch.no_grad():
                    for imgs, targets in tqdm(enumerate(val_loader),
                                            total=len(val_loader), leave=True, ncols=120,
                                            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"):
                        _, (imgs, targets) = _
                        imgs = imgs.to(device, non_blocking=True).float() / 255.0

                        # eval forward
                        outputs = self(imgs)

                        # decode for NMS (XYXY already)
                        detect_module = self.model[22]
                        processed_for_nms = _decode_for_nms(outputs, detect_module)

                        # run NMS (expects shape (B, 4+nc, N))
                        preds = non_max_suppression(processed_for_nms, confidence_threshold=0.001, iou_threshold=0.7)

                        B, _, H, W = imgs.shape
                        for b in range(B):
                            p = preds[b]  # [n_det, 6] = [x1,y1,x2,y2,conf,cls]
                            # ---- build GT in the SAME coordinate frame as preds ----
                            m = (targets["idx"] == b)
                            if m.any():
                                gt_cls = targets["cls"][m].to(device).squeeze(-1).to(torch.long)
                                # If your dataset normalizes to the network input size, this is OK:
                                gt_box = xywhn2xyxy_torch(targets["box"][m].to(device).float(), W, H)
                                # If you letterbox, ensure GT is mapped to the input frame (apply gain/pad).
                                t = torch.zeros((gt_cls.shape[0], 5), device=device, dtype=torch.float32)
                                t[:, 0] = gt_cls.float()
                                t[:, 1:] = gt_box
                            else:
                                t = torch.zeros((0, 5), device=device)

                            if p.numel() == 0:
                                if t.numel():
                                    tcls_list.append(t[:, 0].cpu().numpy().astype(int))
                                continue

                            # make sure classes are int for equality checks
                            p[:, 5] = p[:, 5].to(torch.long).float()

                            correct = compute_metric(p, t, iouv)  # [n_det, 10] bool
                            tp_list.append(correct.cpu().numpy().astype(int))
                            conf_list.append(p[:, 4].cpu().numpy())
                            pcls_list.append(p[:, 5].cpu().numpy())
                            if t.numel():
                                tcls_list.append(t[:, 0].cpu().numpy().astype(int))

                if conf_list:
                    tp   = numpy.concatenate(tp_list, 0)
                    conf = numpy.concatenate(conf_list, 0)
                    pcls = numpy.concatenate(pcls_list, 0).astype(int)
                    tcls = numpy.concatenate(tcls_list, 0).astype(int) if tcls_list else numpy.zeros((0,), dtype=int)
                    _, _, P, R, mAP50, mAP5095 = compute_ap(tp, conf, pcls, tcls)
                else:
                    P = R = mAP50 = mAP5095 = float("nan")
                    tcls = numpy.zeros((0,), dtype=int)

                val_images = len(val_loader.dataset)
                val_instances = int(tcls.shape[0])
                print(f"\n{'':>19}{'all':>7}{val_images:>11}{val_instances:>12}"
                    f"{P:>11.3f}{R:>11.3f}{mAP50:>11.3f}{mAP5095:>11.3f}\n")
        
            # ------------------- Save & early stop -------------------
            torch.save(self.state_dict(), last_path)
            improved = val_loss < best_val - 1e-6
            if improved:
                best_val, bad_epochs = val_loss, 0
                torch.save(self.state_dict(), best_path)
            else:
                bad_epochs += 1
        
            if bad_epochs >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break
        return self
        # return {"best_val_loss": best_val, "best": best_path, "last": last_path}

In [None]:
# !cp -r /kaggle/input/effyolo /kaggle/working/

In [None]:
from collections import OrderedDict
from ultralytics import YOLO as UModel

# Transfer COCO-pretrained YOLOv8n weights into the custom model wherever a
# parameter exists under the same key with the same shape.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
u = UModel('yolov8n.pt')          # COCO-pretrained
model = YOLO().to(device)

tgt_sd = model.state_dict()
sd_src = u.model.state_dict()     # plain PyTorch state_dict

# Keep only entries the target can accept as-is; everything else stays at its
# freshly initialised value.
sd_shape_match = OrderedDict(
    (name, weights)
    for name, weights in sd_src.items()
    if name in tgt_sd and tgt_sd[name].shape == weights.shape
)

# strict=False: keys absent from the filtered dict are reported, not raised.
missing, unexpected = model.load_state_dict(sd_shape_match, strict=False)
print(len(missing), len(unexpected))

In [None]:
# Keyword arguments are passed, so YOLO.train dispatches to the high-level trainer
# (_ultra_train) instead of just toggling training mode. Checkpoints are written to
# <project>/<name>/best.pt and last.pt.
results = model.train(
    data='/kaggle/working/effyolo/data.yaml',
    epochs=200,
    imgsz=640,
    batch=8,
    name='fruit-disease-detector',
    project='/kaggle/working/runs/train',
    device='cuda',
    patience=50,
    cos_lr=True
)

In [None]:
# The trainer returns the model itself, so this prints the module structure rather
# than a metrics summary.
print(results)

In [None]:
# Persist the final weights (state_dict only) separately from the per-run
# best.pt / last.pt checkpoints written during training.
torch.save(model.state_dict(), "/kaggle/working/my_yolo8n.pt")