<a href="https://colab.research.google.com/github/Artyom35689/Ad_bot/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import torch, sys, platform
import torch.nn as nn
import os, json, random, math, cv2, torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [15]:
# COCO val2017 (≈1 ГБ) + аннотации
!mkdir -p data/coco && cd data/coco
!wget http://images.cocodataset.org/zips/val2017.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip -q val2017.zip
!unzip -q annotations_trainval2017.zip
!rm -f val2017.zip annotations_trainval2017.zip
!cd ../../


--2025-10-07 14:43:11--  http://images.cocodataset.org/zips/val2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 54.231.128.201, 16.182.70.49, 3.5.29.183, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|54.231.128.201|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 815585330 (778M) [application/zip]
Saving to: ‘val2017.zip’

val2017.zip          26%[====>               ] 204.03M  55.3MB/s    eta 13s    ^C
--2025-10-07 14:43:15--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 54.231.128.201, 16.182.70.49, 3.5.29.183, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|54.231.128.201|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’

    annotations_tra  20%[===>                ]  49.01M  34.8MB/s               ^C
[val2017.zip]
 

In [19]:
def layer(input,output,kernel,stride=1,padding=None):
        return nn.Sequential(
            nn.Conv2d(input,output,kernel,stride,padding if padding is not None else kernel//2,bias=False),
            nn.BatchNorm2d(output),
            nn.LeakyReLU(0.1,inplace=True)
        )

class YOLOv1(nn.Module):
    def __init__(self, S=7, B=2, C=1):
        super(YOLOv1, self).__init__()
        self.S = S  # Grid size
        self.B = B  # Number of bounding boxes per grid cell
        self.C = C  # Number of classes, we will use 1 for person detection

        L =[]
        L+=[layer(3,64,7,2,3),  nn.MaxPool2d(2,2)]
        L+=[layer(64,192,3),    nn.MaxPool2d(2,2)]
        L+=[layer(192,128,1),   layer(128,256,3),       layer(256,256,1), layer(256,512,3),  nn.MaxPool2d(2,2)]
        L+=[layer(512,256,1),   layer(256,512,3)]*4 + [ layer(512,512,1), layer(512,1024,3), nn.MaxPool2d(2,2)]
        L+=[layer(1024,512,1),  layer(512,1024,3)]*2
        L+=[layer(1024,1024,3), layer(1024,1024,3,2,1)]
        L+=[layer(1024,1024,3), layer(1024,1024,3)]
        self.backbone = nn.Sequential(*L)
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 4096),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, S * S * (C + B * 5))
        )
    def forward(self, x):
        x = self.backbone(x)
        x = self.fc(x)
        # [batch_size, S*S*(C+5*B)] -> [batch_size, S, S, C+5*B]
        # [C+5*B] = [x,y,w,h,conf1,conf2,...,confB,class1,class2,...,classC]
        x = x.view(-1, self.S, self.S, self.C + self.B * 5)
        return x



In [20]:
def _iou_xywh(a, b, eps=1e-9):
    # a,b: [...,4] in cx,cy,w,h (abs in [0,1])
    a_x1 = a[...,0] - a[...,2]/2; a_y1 = a[...,1] - a[...,3]/2
    a_x2 = a[...,0] + a[...,2]/2; a_y2 = a[...,1] + a[...,3]/2
    b_x1 = b[...,0] - b[...,2]/2; b_y1 = b[...,1] - b[...,3]/2
    b_x2 = b[...,0] + b[...,2]/2; b_y2 = b[...,1] + b[...,3]/2
    inter = (torch.minimum(a_x2,b_x2)-torch.maximum(a_x1,b_x1)).clamp_(min=0) * \
            (torch.minimum(a_y2,b_y2)-torch.maximum(a_y1,b_y1)).clamp_(min=0)
    area_a = (a_x2-a_x1).clamp(min=0)*(a_y2-a_y1).clamp(min=0)
    area_b = (b_x2-b_x1).clamp(min=0)*(b_y2-b_y1).clamp(min=0)
    return inter / (area_a + area_b - inter + eps)

class YoloV1Loss(nn.Module):
    def __init__(self,S=7,B=2,lambda_coord=5.0,lambda_noobj=0.5):
        super().__init__()
        self.S,self.B = S,B
        self.lc, self.lno = lambda_coord, lambda_noobj

    def forward(self,pred,target):
        """
        pred: [N,S,S,B*5+1]
        target: dict(tconf[N,S,S,1], txywh[N,S,S,4], tcls[N,S,S,1])
        """
        N,S,B = pred.size(0), self.S, self.B
        device = pred.device

        pb = pred[...,:B*5].view(N,S,S,B,5)   # x,y,sw,sh,conf (sw,sh ~ sqrt(w,h))
        pcls = pred[...,B*5:]                 # one-class logit

        px = pb[...,0].sigmoid()
        py = pb[...,1].sigmoid()
        # стабильные толщины: softplus ≥0
        sw = torch.nn.functional.softplus(pb[...,2]).clamp(max=10.0)
        sh = torch.nn.functional.softplus(pb[...,3]).clamp(max=10.0)
        pc = pb[...,4].sigmoid()
        pcl = pcls.sigmoid()

        # абсолютные боксы
        gy, gx = torch.meshgrid(torch.arange(S, device=device),
                                torch.arange(S, device=device), indexing='ij')
        gx = gx.view(1,S,S,1).float(); gy = gy.view(1,S,S,1).float()
        bx = (gx + px)/S
        by = (gy + py)/S
        bw = (sw ** 2).clamp(min=1e-9, max=1.0)
        bh = (sh ** 2).clamp(min=1e-9, max=1.0)
        boxes_abs = torch.stack([bx,by,bw,bh], dim=-1)  # [N,S,S,B,4]

        # таргеты
        tconf = target['tconf'].to(device)     # [N,S,S,1]
        txywh = target['txywh'].to(device)     # [N,S,S,4]
        tcls  = target['tcls' ].to(device)     # [N,S,S,1]

        # выбор ответственного бокса
        ious = _iou_xywh(boxes_abs, txywh.unsqueeze(3).expand_as(boxes_abs))  # [N,S,S,B]
        iou_max, argmax = ious.max(dim=3, keepdim=True)
        obj_mask = tconf                                      # [N,S,S,1]
        resp = torch.zeros_like(ious)
        resp.scatter_(3, argmax, 1.0)
        resp = resp * obj_mask                                # [N,S,S,B]

        # координатный терм в "sqrt-пространстве"
        pxr = (px*resp).sum(dim=3, keepdim=True)
        pyr = (py*resp).sum(dim=3, keepdim=True)
        swr = (sw*resp).sum(dim=3, keepdim=True)
        shr = (sh*resp).sum(dim=3, keepdim=True)

        tx,ty,tw,th = txywh.unbind(-1)     # [N,S,S]
        tx = tx.unsqueeze(-1) * S - gx
        ty = ty.unsqueeze(-1) * S - gy
        tsw = (tw.unsqueeze(-1).clamp(min=1e-9)).sqrt()
        tsh = (th.unsqueeze(-1).clamp(min=1e-9)).sqrt()

        coord = ((pxr - tx)**2 + (pyr - ty)**2 +
                 (swr - tsw)**2 + (shr - tsh)**2)
        coord = self.lc * (coord * obj_mask).sum()

        # confidence: к IoU для ответственного, 0 для остальных
        conf_tgt_resp = (ious * resp).detach()
        conf_obj   = ((pc - conf_tgt_resp)**2 * resp).sum()
        conf_noobj = self.lno * ((pc**2) * (1.0 - resp)).sum()
        conf = conf_obj + conf_noobj

        # class (C=1)
        cls = ((pcl - tcls)**2 * obj_mask).sum()

        total = coord + conf + cls
        total = torch.nan_to_num(total, nan=0.0, posinf=1e9, neginf=0.0) / max(N,1)

        logs = {
            'loss': float(total.detach().cpu()),
            'L_coord': float((coord/max(N,1)).detach().cpu()),
            'L_conf_obj': float((conf_obj/max(N,1)).detach().cpu()),
            'L_conf_noobj': float((conf_noobj/max(N,1)).detach().cpu()),
            'L_cls': float((cls/max(N,1)).detach().cpu())
        }
        # простые метрики
        with torch.no_grad():
            any_conf = (pc > 0.5).any(dim=3, keepdim=True).float()  # [N,S,S,1]
            obj_acc = float((any_conf == tconf).float().mean().cpu())
            mean_iou = float(((iou_max * obj_mask).sum() / (obj_mask.sum().clamp(min=1))).detach().cpu())
            logs['obj_acc'] = obj_acc
            logs['mean_iou'] = mean_iou
        return total, logs




In [22]:
S, B = 7, 2
IM = 448
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("high")

class CocoPersonMini(Dataset):
    def __init__(self, root="", limit=1000):
        self.img_dir = os.path.join(root,"val2017")
        ann = json.load(open(os.path.join(root,"annotations","instances_val2017.json")))
        pid=1
        anns_by={}
        for a in ann["annotations"]:
            if a["category_id"]!=pid: continue
            iid=a["image_id"]; x,y,w,h=a["bbox"]
            if w<=1 or h<=1: continue
            anns_by.setdefault(iid, []).append((x,y,w,h))
        id2img={im["id"]:im for im in ann["images"]}
        imgs=[(iid,id2img[iid]) for iid in anns_by.keys()]
        random.seed(0); random.shuffle(imgs); imgs=imgs[:limit]
        self.items=[]
        for iid,meta in imgs:
            path=os.path.join(self.img_dir, meta["file_name"])
            if not os.path.exists(path): continue
            H,W=meta["height"],meta["width"]
            boxes=[]
            for (x,y,w,h) in anns_by[iid]:
                cx=(x+w/2)/W; cy=(y+h/2)/H; bw=w/W; bh=h/H
                if 0<cx<1 and 0<cy<1 and bw>0 and bh>0:
                    boxes.append([cx,cy,bw,bh])
            if boxes: self.items.append((path,boxes))
        print("COCO val2017 person:", len(self.items))

    def __len__(self): return len(self.items)

    def __getitem__(self,i):
        path,boxes=self.items[i]
        img0=cv2.imread(path)
        if img0 is None: raise FileNotFoundError(path)
        img0=cv2.cvtColor(img0, cv2.COLOR_BGR2RGB)
        H0,W0=img0.shape[:2]
        scale=min(IM/W0, IM/H0)
        nw,nh=int(W0*scale+0.5),int(H0*scale+0.5)
        img=cv2.resize(img0,(nw,nh), interpolation=cv2.INTER_LINEAR)
        canvas=np.zeros((IM,IM,3),np.uint8)
        top=(IM-nh)//2; left=(IM-nw)//2
        canvas[top:top+nh,left:left+nw]=img

        adj=[]
        for (cx,cy,bw,bh) in boxes:
            cx_n=(cx*W0*scale + left)/IM
            cy_n=(cy*H0*scale + top)/IM
            bw_n=(bw*W0*scale)/IM
            bh_n=(bh*H0*scale)/IM
            if 0<cx_n<1 and 0<cy_n<1 and bw_n>0 and bh_n>0:
                adj.append([cx_n,cy_n,bw_n,bh_n])

        img_t=torch.from_numpy(canvas).permute(2,0,1).float()/255.0
        tconf=torch.zeros((S,S,1),dtype=torch.float32)
        txywh=torch.zeros((S,S,4),dtype=torch.float32)
        tcls =torch.zeros((S,S,1),dtype=torch.float32)
        occ=set()
        for (cx,cy,bw,bh) in adj:
            gx=min(S-1,int(cx*S)); gy=min(S-1,int(cy*S))
            if (gy,gx) in occ: continue  # 1 объект на клетку
            occ.add((gy,gx))
            tconf[gy,gx,0]=1.0
            txywh[gy,gx,:]=torch.tensor([cx,cy,bw,bh])
            tcls[gy,gx,0]=1.0
        return img_t, {'tconf':tconf,'txywh':txywh,'tcls':tcls}

def collate(batch):
    imgs,T=zip(*batch)
    imgs=torch.stack(imgs,0)
    out={'tconf':torch.stack([t['tconf'] for t in T],0),
         'txywh':torch.stack([t['txywh'] for t in T],0),
         'tcls' :torch.stack([t['tcls']  for t in T],0)}
    return imgs,out

# ---------- ТРЕНИРОВКА ----------
def one_epoch(model, loss_fn, loader, opt, device):
    model.train()
    avg = {'loss':0,'L_coord':0,'L_conf_obj':0,'L_conf_noobj':0,'L_cls':0,'obj_acc':0,'mean_iou':0}
    n=0
    for imgs, tgt in tqdm(loader, ncols=80):
        imgs = imgs.to(device, non_blocking=True)
        tgt  = {k:v.to(device, non_blocking=True) for k,v in tgt.items()}
        opt.zero_grad(set_to_none=True)
        pred = model(imgs)
        loss, logs = loss_fn(pred, tgt)
        if torch.isnan(loss):
            continue
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        opt.step()
        for k in avg: avg[k]+=logs.get(k,0.0)
        n+=1
    for k in avg: avg[k]/=max(n,1)
    return avg

if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    ds = CocoPersonMini(limit=1000)
    dl = DataLoader(ds, batch_size=4, shuffle=True, num_workers=2,
                    pin_memory=True, persistent_workers=False, collate_fn=collate)

    model = YOLOv1(S=7,B=2,C=1).to(device)
    loss_fn = YoloV1Loss(S=7,B=2,lambda_coord=5.0,lambda_noobj=0.5)
    opt = torch.optim.SGD(model.parameters(), lr=3e-3, momentum=0.9, weight_decay=5e-4)

    logs = one_epoch(model, loss_fn, dl, opt, device)
    print(logs)
    # torch.save(model.state_dict(), "yolov1_person_1epoch.pth")

COCO val2017 person: 1000


100%|█████████████████████████████████████████| 250/250 [00:53<00:00,  4.66it/s]


{'loss': 7.581509563446045, 'L_coord': 4.578233995437622, 'L_conf_obj': 0.14163775842264295, 'L_conf_noobj': 2.4916467794179917, 'L_cls': 0.36999103239923714, 'obj_acc': 0.8837346683740616, 'mean_iou': 0.20877978977560996}
