### References
The codes in this notebook use reference from the following sources
1. https://github.com/jahongir7174/YOLOv8-dfl/tree/master 
2. https://arxiv.org/html/2304.00501v6/#S16
3. https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo

In [1]:
# import
import math, os, random, cv2, numpy, torch
import torch.nn as nn
from ultralytics import YOLO

# explore the original yolov8 model

# original yolov8n
model_n=YOLO('yolov8n.pt')
print(f"yolov8-nano: {sum(p.numel() for p in model_n.parameters())/1e6} million parameters")


# model_s=YOLO('yolov8s.pt')
# print(f"yolov8-small: {sum(p.numel() for p in model_s.parameters())/1e6} million parameters")

print(model_n.model)



Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 11.6MB/s]


yolov8-nano: 3.1572 million parameters
DetectionModel(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (2): C2f(
      (cv1): Conv(
        (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (cv2): Conv(
        (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
       

## Yolov8 architecture
The overall architecture is BackBone + Neck + Head

<center>
<img src="yolov8_architecture.png" width="700" height="700">

## 1. Backbone
The backbone is modified CSPDarknet53 which comprises of blocks Conv, C2f, SPPF
1. **Conv**: Conv2d + BatchNorm2d + SiLU 
2. **C2f (cross-stage partial bottleneck with 2 convolutions)**: Conv + Bottlenecks + Conv <br>
Combine high-level features with contextual information to improve detection accuracy.
3. **SPPF (spatial pyramid pooling fast)**: Conv + Maxpool2d + Conv <br>
Process features at various scales and pool them into a fixed-size feature map. 

### (a) Conv 

In [2]:
class Conv(nn.Module):
    def __init__(self,in_channels, out_channels,kernel_size=3,stride=1,padding=1,groups=1,activation=True):
        super().__init__()
        self.conv=nn.Conv2d(in_channels,out_channels,kernel_size,stride,padding,bias=False,groups=groups)
        self.bn=nn.BatchNorm2d(out_channels,eps=0.001,momentum=0.03)
        self.act=nn.SiLU(inplace=True) if activation else nn.Identity()

    def forward(self,x):
        return self.act(self.bn(self.conv(x)))

### (b) C2f

In [3]:
# 2.1 Bottleneck: staack of 2 COnv with shortcut connnection (True/False)
class Bottleneck(nn.Module):
    def __init__(self,in_channels,out_channels,shortcut=True):
        super().__init__()
        self.conv1=Conv(in_channels,out_channels,kernel_size=3,stride=1,padding=1)
        self.conv2=Conv(out_channels,out_channels,kernel_size=3,stride=1,padding=1)
        self.shortcut=shortcut

    def forward(self,x):
        x_in=x # for residual connection
        x=self.conv1(x)
        x=self.conv2(x)
        if self.shortcut:
            x=x+x_in
        return x
    
# 2.2 C2f: Conv + bottleneck*N+ Conv
class C2f(nn.Module):
    def __init__(self,in_channels,out_channels, num_bottlenecks,shortcut=True):
        super().__init__()
        
        self.mid_channels=out_channels//2
        self.num_bottlenecks=num_bottlenecks

        self.conv1=Conv(in_channels,out_channels,kernel_size=1,stride=1,padding=0)
        
        # sequence of bottleneck layers
        self.m=nn.ModuleList([Bottleneck(self.mid_channels,self.mid_channels) for _ in range(num_bottlenecks)])

        self.conv2=Conv((num_bottlenecks+2)*out_channels//2,out_channels,kernel_size=1,stride=1,padding=0)
    
    def forward(self,x):
        x=self.conv1(x)

        # split x along channel dimension
        x1,x2=x[:,:x.shape[1]//2,:,:], x[:,x.shape[1]//2:,:,:]
        
        # list of outputs
        outputs=[x1,x2] # x1 is fed through the bottlenecks

        for i in range(self.num_bottlenecks):
            x1=self.m[i](x1)    # [bs,0.5c_out,w,h]
            outputs.insert(0,x1)

        outputs=torch.cat(outputs,dim=1) # [bs,0.5c_out(num_bottlenecks+2),w,h]
        out=self.conv2(outputs)

        return out
         
# sanity check
c2f=C2f(in_channels=64,out_channels=128,num_bottlenecks=2)
print(f"{sum(p.numel() for p in c2f.parameters())/1e6} million parameters")

dummy_input=torch.rand((1,64,244,244))
dummy_input=c2f(dummy_input)
print("Output shape: ", dummy_input.shape)


0.18944 million parameters
Output shape:  torch.Size([1, 128, 244, 244])


### (c) SPPF

In [4]:
class SPPF(nn.Module):
    def __init__(self,in_channels,out_channels,kernel_size=5):
        #kernel_size= size of maxpool
        super().__init__()
        hidden_channels=in_channels//2
        self.conv1=Conv(in_channels,hidden_channels,kernel_size=1,stride=1,padding=0)
        # concatenate outputs of maxpool and feed to conv2
        self.conv2=Conv(4*hidden_channels,out_channels,kernel_size=1,stride=1,padding=0)

        # maxpool is applied at 3 different sacles
        self.m=nn.MaxPool2d(kernel_size=kernel_size,stride=1,padding=kernel_size//2,dilation=1,ceil_mode=False)
    
    def forward(self,x):
        x=self.conv1(x)

        # apply maxpooling at diffent scales
        y1=self.m(x)
        y2=self.m(y1)
        y3=self.m(y2)

        # concantenate 
        y=torch.cat([x,y1,y2,y3],dim=1)

        # final conv
        y=self.conv2(y)

        return y

# sanity check
sppf=SPPF(in_channels=128,out_channels=512)
print(f"{sum(p.numel() for p in sppf.parameters())/1e6} million parameters")

dummy_input=sppf(dummy_input)
print("Output shape: ", dummy_input.shape)


0.140416 million parameters
Output shape:  torch.Size([1, 512, 244, 244])


### Putting things together

In [5]:
# backbone = DarkNet53

# return d,w,r based on version
def yolo_params(version):
    if version=='n':
        return 1/3,1/4,2.0
    elif version=='s':
        return 1/3,1/2,2.0
    elif version=='m':
        return 2/3,3/4,1.5
    elif version=='l':
        return 1.0,1.0,1.0
    elif version=='x':
        return 1.0,1.25,1.0
    
class Backbone(nn.Module):
    def __init__(self,version,in_channels=3,shortcut=True):
        super().__init__()
        d,w,r=yolo_params(version)

        # conv layers
        self.conv_0=Conv(in_channels,int(64*w),kernel_size=3,stride=2,padding=1)
        self.conv_1=Conv(int(64*w),int(128*w),kernel_size=3,stride=2,padding=1)
        self.conv_3=Conv(int(128*w),int(256*w),kernel_size=3,stride=2,padding=1)
        self.conv_5=Conv(int(256*w),int(512*w),kernel_size=3,stride=2,padding=1)
        self.conv_7=Conv(int(512*w),int(512*w*r),kernel_size=3,stride=2,padding=1)

        # c2f layers
        self.c2f_2=C2f(int(128*w),int(128*w),num_bottlenecks=int(3*d),shortcut=True)
        self.c2f_4=C2f(int(256*w),int(256*w),num_bottlenecks=int(6*d),shortcut=True)
        self.c2f_6=C2f(int(512*w),int(512*w),num_bottlenecks=int(6*d),shortcut=True)
        self.c2f_8=C2f(int(512*w*r),int(512*w*r),num_bottlenecks=int(3*d),shortcut=True)

        # sppf
        self.sppf=SPPF(int(512*w*r),int(512*w*r))
    
    def forward(self,x):
        x=self.conv_0(x)
        x=self.conv_1(x)

        x=self.c2f_2(x)

        x=self.conv_3(x)

        out1=self.c2f_4(x) # keep for output

        x=self.conv_5(out1)

        out2=self.c2f_6(x) # keep for output

        x=self.conv_7(out2)
        x=self.c2f_8(x)
        out3=self.sppf(x)

        return out1,out2,out3

print("----Nano model -----")
backbone_n=Backbone(version='n')
print(f"{sum(p.numel() for p in backbone_n.parameters())/1e6} million parameters")

print("----Small model -----")
backbone_s=Backbone(version='s')
print(f"{sum(p.numel() for p in backbone_s.parameters())/1e6} million parameters")
        

        


----Nano model -----
1.272656 million parameters
----Small model -----
5.079712 million parameters


In [6]:
# sanity check
x=torch.rand((1,3,640,640))
out1,out2,out3=backbone_n(x)
print(out1.shape)
print(out2.shape)
print(out3.shape)

torch.Size([1, 64, 80, 80])
torch.Size([1, 128, 40, 40])
torch.Size([1, 256, 20, 20])


## 2. Neck
The neck comprises of Upsample + C2f with 

**Upsample** = nearest-neighbor interpolation with scale_factor=2. It doesn't have trainable paramaters.

In [7]:
# upsample = nearest-neighbor interpolation with scale_factor=2
#            doesn't have trainable paramaters
class Upsample(nn.Module):
    def __init__(self,scale_factor=2,mode='nearest'):
        super().__init__()
        self.scale_factor=scale_factor
        self.mode=mode

    def forward(self,x):
        return nn.functional.interpolate(x,scale_factor=self.scale_factor,mode=self.mode)
    
    


In [8]:
class Neck(nn.Module):
    def __init__(self,version):
        super().__init__()
        d,w,r=yolo_params(version)

        self.up=Upsample() # no trainable parameters
        self.c2f_1=C2f(in_channels=int(512*w*(1+r)), out_channels=int(512*w),num_bottlenecks=int(3*d),shortcut=False)
        self.c2f_2=C2f(in_channels=int(768*w), out_channels=int(256*w),num_bottlenecks=int(3*d),shortcut=False)
        self.c2f_3=C2f(in_channels=int(768*w), out_channels=int(512*w),num_bottlenecks=int(3*d),shortcut=False)
        self.c2f_4=C2f(in_channels=int(512*w*(1+r)), out_channels=int(512*w*r),num_bottlenecks=int(3*d),shortcut=False)

        self.cv_1=Conv(in_channels=int(256*w),out_channels=int(256*w),kernel_size=3,stride=2, padding=1)
        self.cv_2=Conv(in_channels=int(512*w),out_channels=int(512*w),kernel_size=3,stride=2, padding=1)


    def forward(self,x_res_1,x_res_2,x):    
        # x_res_1,x_res_2,x = output of backbone
        res_1=x              # for residual connection
        
        x=self.up(x)
        x=torch.cat([x,x_res_2],dim=1)

        res_2=self.c2f_1(x)  # for residual connection
        
        x=self.up(res_2)
        x=torch.cat([x,x_res_1],dim=1)

        out_1=self.c2f_2(x)

        x=self.cv_1(out_1)

        x=torch.cat([x,res_2],dim=1)
        out_2=self.c2f_3(x)

        x=self.cv_2(out_2)

        x=torch.cat([x,res_1],dim=1)
        out_3=self.c2f_4(x)

        return out_1,out_2,out_3
    
# sanity check
neck=Neck(version='n')
print(f"{sum(p.numel() for p in neck.parameters())/1e6} million parameters")

x=torch.rand((1,3,640,640))
out1,out2,out3=Backbone(version='n')(x)
out_1,out_2,out_3=neck(out1,out2,out3)
print(out_1.shape)
print(out_2.shape)
print(out_3.shape)



0.98688 million parameters
torch.Size([1, 64, 80, 80])
torch.Size([1, 128, 40, 40])
torch.Size([1, 256, 20, 20])


## 3. Head
Consist of 3 modules: (1) bbox coordinates, (2) classification scores, (3) distribution focal loss (DFL).

**DFL** considers the predicted bbox coordinates as a probability distribution. At inference time, it samples from the distribution to get **refined coordinates** $(x,y,w,h)$. For example, to predict coordinate $x$ in the normalized range $[0,1]$:
1. DFL uses 16 bins which are equally spaced in $[0,1]$, bin length = 1/16. 
2. The model outputs 16 numbers which corresponds to probabilities that x falls in these bins, for example, $[0,0,...,9/10,1/10]$.
3. Prediction for $x=$ mean value $= 9/10\cdot15/16+1/10\cdot 1=0.94375$ 


### (a) DFL

In [9]:
# DFL
class DFL(nn.Module):
    def __init__(self,ch=16):
        super().__init__()
        
        self.ch=ch
        
        self.conv=nn.Conv2d(in_channels=ch,out_channels=1,kernel_size=1,bias=False).requires_grad_(False)
        
        # initialize conv with [0,...,ch-1]
        x=torch.arange(ch,dtype=torch.float).view(1,ch,1,1)
        self.conv.weight.data[:]=torch.nn.Parameter(x) # DFL only has ch parameters

    def forward(self,x):
        # x must have num_channels = 4*ch: x=[bs,4*ch,c]
        b,c,a=x.shape                           # c=4*ch
        x=x.view(b,4,self.ch,a).transpose(1,2)  # [bs,ch,4,a]

        # take softmax on channel dimension to get distribution probabilities
        x=x.softmax(1)                          # [b,ch,4,a]
        x=self.conv(x)                          # [b,1,4,a]
        return x.view(b,4,a)                    # [b,4,a]

# sanity check
dummy_input=torch.rand((1,64,128))
dfl=DFL()
print(f"{sum(p.numel() for p in dfl.parameters())} parameters")

dummy_output=dfl(dummy_input)
print(dummy_output.shape)

print(dfl)





16 parameters
torch.Size([1, 4, 128])
DFL(
  (conv): Conv2d(16, 1, kernel_size=(1, 1), stride=(1, 1), bias=False)
)


### (b) Head

In [10]:
class Head(nn.Module):
    def __init__(self,version,ch=16,num_classes=80):

        super().__init__()
        self.ch=ch                          # dfl channels
        self.coordinates=self.ch*4          # number of bounding box coordinates 
        self.nc=num_classes                 # 80 for COCO
        self.no=self.coordinates+self.nc    # number of outputs per anchor box

        self.stride=torch.zeros(3)          # strides computed during build
        
        d,w,r=yolo_params(version=version)
        
        # for bounding boxes
        self.box=nn.ModuleList([
            nn.Sequential(Conv(int(256*w),self.coordinates,kernel_size=3,stride=1,padding=1),
                          Conv(self.coordinates,self.coordinates,kernel_size=3,stride=1,padding=1),
                          nn.Conv2d(self.coordinates,self.coordinates,kernel_size=1,stride=1)),

            nn.Sequential(Conv(int(512*w),self.coordinates,kernel_size=3,stride=1,padding=1),
                          Conv(self.coordinates,self.coordinates,kernel_size=3,stride=1,padding=1),
                          nn.Conv2d(self.coordinates,self.coordinates,kernel_size=1,stride=1)),

            nn.Sequential(Conv(int(512*w*r),self.coordinates,kernel_size=3,stride=1,padding=1),
                          Conv(self.coordinates,self.coordinates,kernel_size=3,stride=1,padding=1),
                          nn.Conv2d(self.coordinates,self.coordinates,kernel_size=1,stride=1))
        ])

        # for classification
        self.cls=nn.ModuleList([
            nn.Sequential(Conv(int(256*w),self.nc,kernel_size=3,stride=1,padding=1),
                          Conv(self.nc,self.nc,kernel_size=3,stride=1,padding=1),
                          nn.Conv2d(self.nc,self.nc,kernel_size=1,stride=1)),

            nn.Sequential(Conv(int(512*w),self.nc,kernel_size=3,stride=1,padding=1),
                          Conv(self.nc,self.nc,kernel_size=3,stride=1,padding=1),
                          nn.Conv2d(self.nc,self.nc,kernel_size=1,stride=1)),

            nn.Sequential(Conv(int(512*w*r),self.nc,kernel_size=3,stride=1,padding=1),
                          Conv(self.nc,self.nc,kernel_size=3,stride=1,padding=1),
                          nn.Conv2d(self.nc,self.nc,kernel_size=1,stride=1))
        ])

        # dfl
        self.dfl=DFL()

    def forward(self,x):
        # x = output of Neck = list of 3 tensors with different resolution and different channel dim
        #     x[0]=[bs, ch0, w0, h0], x[1]=[bs, ch1, w1, h1], x[2]=[bs,ch2, w2, h2] 

        for i in range(len(self.box)):       # detection head i
            box=self.box[i](x[i])            # [bs,num_coordinates,w,h]
            cls=self.cls[i](x[i])            # [bs,num_classes,w,h]
            x[i]=torch.cat((box,cls),dim=1)  # [bs,num_coordinates+num_classes,w,h]

        # in training, no dfl output
        if self.training:
            return x                         # [3,bs,num_coordinates+num_classes,w,h]
        
        # in inference time, dfl produces refined bounding box coordinates
        anchors, strides = (i.transpose(0, 1) for i in self.make_anchors(x, self.stride))

        # concatenate predictions from all detection layers
        x = torch.cat([i.view(x[0].shape[0], self.no, -1) for i in x], dim=2) #[bs, 4*self.ch + self.nc, sum_i(h[i]w[i])]
        
        # split out predictions for box and cls
        #           box=[bs,4×self.ch,sum_i(h[i]w[i])]
        #           cls=[bs,self.nc,sum_i(h[i]w[i])]
        box, cls = x.split(split_size=(4 * self.ch, self.nc), dim=1)


        a, b = self.dfl(box).chunk(2, 1)  # a=b=[bs,2×self.ch,sum_i(h[i]w[i])]
        a = anchors.unsqueeze(0) - a
        b = anchors.unsqueeze(0) + b
        box = torch.cat(tensors=((a + b) / 2, b - a), dim=1)
        
        return torch.cat(tensors=(box * strides, cls.sigmoid()), dim=1)


    def make_anchors(self, x, strides, offset=0.5):
        # x= list of feature maps: x=[x[0],...,x[N-1]], in our case N= num_detection_heads=3
        #                          each having shape [bs,ch,w,h]
        #    each feature map x[i] gives output[i] = w*h anchor coordinates + w*h stride values
        
        # strides = list of stride values indicating how much 
        #           the spatial resolution of the feature map is reduced compared to the original image

        assert x is not None
        anchor_tensor, stride_tensor = [], []
        dtype, device = x[0].dtype, x[0].device
        for i, stride in enumerate(strides):
            _, _, h, w = x[i].shape
            sx = torch.arange(end=w, device=device, dtype=dtype) + offset  # x coordinates of anchor centers
            sy = torch.arange(end=h, device=device, dtype=dtype) + offset  # y coordinates of anchor centers
            sy, sx = torch.meshgrid(sy, sx)                                # all anchor centers 
            anchor_tensor.append(torch.stack((sx, sy), -1).view(-1, 2))
            stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device))
        return torch.cat(anchor_tensor), torch.cat(stride_tensor)
        

In [11]:

detect=Head(version='n')
print(f"{sum(p.numel() for p in detect.parameters())/1e6} million parameters")

# out_1,out_2,out_3 are output of the neck
output=detect([out_1,out_2,out_3])
print(output[0].shape)
print(output[1].shape)
print(output[2].shape)

print(detect)


0.897664 million parameters
torch.Size([1, 144, 80, 80])
torch.Size([1, 144, 40, 40])
torch.Size([1, 144, 20, 20])
Head(
  (box): ModuleList(
    (0): Sequential(
      (0): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
    )
    (1): Sequential(
      (0): Conv(
        (conv): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
    

## 4. Putting everything together

In [12]:
class MyYolo(nn.Module):
    def __init__(self,version):
        super().__init__()
        self.backbone=Backbone(version=version)
        self.neck=Neck(version=version)
        self.head=Head(version=version)

    def forward(self,x):
        x=self.backbone(x)              # return out1,out2,out3
        x=self.neck(x[0],x[1],x[2])     # return out_1, out_2,out_3
        return self.head(list(x))
    
model=MyYolo(version='n')
print(f"{sum(p.numel() for p in model.parameters())/1e6} million parameters")
print(model)

3.1572 million parameters
MyYolo(
  (backbone): Backbone(
    (conv_0): Conv(
      (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (conv_1): Conv(
      (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (conv_3): Conv(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (conv_5): Conv(
      (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (

## 5. Dataset and Train
Due to computation constraint, we will only overfit 1 batch of data from YOLO2017. The loss in yolov8 consists  (1) box_loss, (2) classification_loss, (3) dfl_loss
1. **Box loss**: on bounding box coordinates
2. **Classification loss**: on classification logits
3. **DFL_Loss**: itegrated in box loss

**How DFL loss works?**
1. Target distribution = single value over 16 bins, this value will be smooth between 2 bin centers. For example, if target value is in the middle of bin 1 and 2, target distribution $= [1/2,1/2,0,...,0]$
2. Predicted distribution is based on the output of bbox coordinates. These outputs can be used to compute the probabilities for each bin, say $[1/16,1/16,...,1/16]$
3. DFL = cross entropy between prediction and target

The utils file for computing loss is taken from https://github.com/jahongir7174/YOLOv8-dfl/tree/master

In [13]:
def setNewPath(data_dir='./'):
    data_dir = data_dir
    # Initialize a list to store image and label file paths as tuples
    image_label_pairs = []
    # ... (rest of your code to get image file paths)
    # construct the full path to the file
    train2024_txt_path = os.path.join(data_dir, 'dset_train2024.txt')
    # check if the file exists, if not print an error message and exit
    if not os.path.exists(train2024_txt_path):
        print(f"Error: File not found: {train2024_txt_path}")
        # If necessary raise an exception or handle the error differently
        # raise FileNotFoundError(f"File not found: {train2017_txt_path}")
        exit()

    print(f"data_dir: {data_dir}")
    print(f"train2024_txt_path: {train2024_txt_path}")

    with open(train2024_txt_path, 'r') as reader:
      # Assuming label files are in a 'labels' subdirectory
      for filename in reader.readlines():
          filename = os.path.basename(filename.rstrip())

          # Debug: Print the filename to see what's being read
          print(f"Processing filename: {filename}")


          image_path = f'{data_dir}dataset/images/train/' + filename
          label_path = f'{data_dir}dataset/labels/train/' + filename.replace('.jpg', '.txt').replace('.png', '.txt').replace('.jpeg', '.txt')

          #print(f"image_path: {image_path}")
          #print(f"label_path: {label_path}")
           # Debug: Check if the image and label files exist
          if os.path.exists(image_path) and os.path.exists(label_path):
            image_label_pairs.append((image_path, label_path))
          else:
            print(f"Warning: Image or label file not found:\nImage: {image_path}\nLabel: {label_path}")
    print(f"Found {len(image_label_pairs)} image-label pairs.") # Print the length
    return image_label_pairs

In [14]:
images_labels = setNewPath(data_dir='./')
#print(images_labels)

data_dir: ./
train2024_txt_path: ./dset_train2024.txt
Processing filename: 000000001591_jpg.rf.6707cd1cdfcde959a680bbeae4d360a0.jpg
Image: ./dataset/images/train/000000001591_jpg.rf.6707cd1cdfcde959a680bbeae4d360a0.jpg
Label: ./dataset/labels/train/000000001591_jpg.rf.6707cd1cdfcde959a680bbeae4d360a0.txt
Processing filename: 000000005340_jpg.rf.930ac45b0b9e7103562167dc8e408e1b.jpg
Image: ./dataset/images/train/000000005340_jpg.rf.930ac45b0b9e7103562167dc8e408e1b.jpg
Label: ./dataset/labels/train/000000005340_jpg.rf.930ac45b0b9e7103562167dc8e408e1b.txt
Processing filename: 000000004244_jpg.rf.672775c4b8cf6a8d540dbb51f307e443.jpg
Image: ./dataset/images/train/000000004244_jpg.rf.672775c4b8cf6a8d540dbb51f307e443.jpg
Label: ./dataset/labels/train/000000004244_jpg.rf.672775c4b8cf6a8d540dbb51f307e443.txt
Processing filename: 000000004282_jpg.rf.0d469aaa6297b76cfa848b651b54ee27.jpg
Image: ./dataset/images/train/000000004282_jpg.rf.0d469aaa6297b76cfa848b651b54ee27.jpg
Label: ./dataset/labels/t

In [None]:
import yaml
import os
import torch.utils.data as data
from utils.datasetClass import *


# get params from yaml file
with open('utils/args.yaml', errors='ignore') as f:
    params_dict = yaml.safe_load(f)  # Store original params in params_dict

# Access label names using params_dict
params_names = params_dict["names"]
# **Check if 'mosaic' key exists and handle it**
if 'mosaic' not in params_dict:
    # If 'mosaic' key is missing, set it to a default value (e.g., False)
    params_dict['mosaic'] = False  # Or any other default value you prefer
    print("Warning: 'mosaic' key not found in YAML. Setting it to False.")

labelnames = []
imagenames = []
image_label_pairs = images_labels
#print(f"Image-label pairs: {image_label_pairs}")  # Print the output of setNewPath

# Check if image_label_pairs is empty and handle the case
if not image_label_pairs:
    print("Error: No image-label pairs found. Check data_dir and setNewPath function.")
    # You might want to raise an exception here to stop execution
    raise ValueError("No image-label pairs found.") # Raise a ValueError to halt execution
else:
    for image_path, label_path in image_label_pairs:
        labelnames.append(label_path)
        imagenames.append(image_path)

print(f"Number of image names: {len(imagenames)}")
print(f"Number of label names: {len(labelnames)}")
input_size = 640
# Ensure that 'filenames' is being correctly assigned within your Mydataset class
train_data = Dataset(input_size=input_size, params=params_dict, augment=False, filenames=imagenames)
# Assuming 'filenames' is an attribute of Mydataset, make sure it's initialized

# Print the value of train_data.filenames to check if it's empty  -- Changed from train_datas to train_data
print(f"train_data.filenames: {train_data.filenames}")

# Check if the 'filenames' attribute of train_data is empty -- Changed from train_datas to train_data
if not train_data.filenames:
    print("Error: 'filenames' in train_data is empty. Check setNewPath or Dataset initialization.")
    # This will help pinpoint if the issue is with the file paths themselves
    # Print the first few elements of 'imagenames' for debugging
    print(f"First few imagenames: {imagenames[:5]}")  # Print first 5 for brevity
    raise ValueError("'filenames' in train_data is empty")

# Check if train_data has any samples before creating DataLoader -- Changed from train_datas to train_data
if len(train_data) == 0:
    print("Error: train_data is empty. Check your Dataset implementation.")
    # Consider raising an exception or fixing the Mydataset class
    raise ValueError("train_data is empty") # Raise a ValueError to halt execution if the dataset is empty
else:
    train_loader = data.DataLoader(train_data, batch_size=64, shuffle=True, num_workers=0, pin_memory=True, collate_fn=Dataset.collate_fn)
    print(f"Train_loader : {len(train_loader)} batches")

Error: No image-label pairs found. Check data_dir and setNewPath function.


ValueError: No image-label pairs found.

In [15]:
import yaml
from utils.dataset import *
from torch.utils import data

# get all file names
data_dir= 'coco'
filenames_train = []
with open(f'{data_dir}/train2017.txt') as reader:
    for filename in reader.readlines():
        filename = os.path.basename(filename.rstrip())
        filenames_train.append(f'{data_dir}/images/train2017/' + filename)

# input_size for the model
input_size=640

# get params from yaml file
with open('utils/args.yaml', errors='ignore') as f:
        params = yaml.safe_load(f)

train_data=Dataset(filenames_train,input_size,params,augment=True)
train_loader = data.DataLoader(train_data, batch_size=64, num_workers=0, pin_memory=True, collate_fn=Dataset.collate_fn)
print(f"Train_loader : {len(train_loader)} batches")


Train_loader : 1849 batches


In [16]:
batch=next(iter(train_loader))
print("All keys in batch      : ", batch[1].keys())
print(f"Input batch shape      : ", batch[0].shape)
print(f"Classification scores  : {batch[1]['cls'].shape}")
print(f"Box coordinates        : {batch[1]['box'].shape}")
print(f"Index identifier (which score belongs to which image): {batch[1]['idx'].shape}")

[ WARN:0@181.193] global loadsave.cpp:241 findDecoder imread_('coco/images/train2017/000000211830.jpg'): can't open/read file: check file path/integrity


AttributeError: 'NoneType' object has no attribute 'shape'

In [29]:
from utils import util
torch.manual_seed(1337)

# model, loss and optimizer
model=MyYolo(version='n')
print(f"{sum(p.numel() for p in model.parameters())/1e6} million parameters")

criterion=util.ComputeLoss(model, params)
optimizer=torch.optim.AdamW(model.parameters(), lr=0.5)

num_epochs=5

imgs,targets=batch[0],batch[1]
imgs=imgs.float()
model.train()
for epoch in range(num_epochs):
    outputs=model(imgs)
    loss=sum(criterion(outputs,targets)) # cls_loss+box_loss+dfl_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Epoch : {epoch} | loss : {loss.item()}")

    


3.1572 million parameters


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Epoch : 0 | loss : 15265294.0
Epoch : 1 | loss : 2260031.0
Epoch : 2 | loss : 1407.5379638671875
Epoch : 3 | loss : 0.0
Epoch : 4 | loss : 0.0
