In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

def verify_dataset(images_dir, labels_dir):
    image_files = set(file for file in os.listdir(images_dir) if file.endswith('.jpg'))
    label_files = set(file.replace('.txt', '.jpg') for file in os.listdir(labels_dir) if file.endswith('.txt'))

    unmatched_images = image_files - label_files
    unmatched_labels = {name.replace('.jpg', '.txt') for name in (label_files - image_files)}

    print(f"Total images in {images_dir}: {len(image_files)}")
    print(f"Total labels in {labels_dir}: {len(label_files)}")
    if unmatched_images:
        print(f"Unmatched images: {unmatched_images}")
    if unmatched_labels:
        print(f"Unmatched labels: {unmatched_labels}")

base_dir = '/content/drive/My Drive/archive-5'
train_images_dir = os.path.join(base_dir, 'train', 'images')
train_labels_dir = os.path.join(base_dir, 'train', 'labels')
valid_images_dir = os.path.join(base_dir, 'valid', 'images')
valid_labels_dir = os.path.join(base_dir, 'valid', 'labels')
test_images_dir = os.path.join(base_dir, 'test', 'images')
test_labels_dir = os.path.join(base_dir, 'test', 'labels')

verify_dataset(train_images_dir, train_labels_dir)
verify_dataset(valid_images_dir, valid_labels_dir)
verify_dataset(test_images_dir, test_labels_dir)


Total images in /content/drive/My Drive/archive-5/train/images: 4200
Total labels in /content/drive/My Drive/archive-5/train/labels: 4200
Total images in /content/drive/My Drive/archive-5/valid/images: 1704
Total labels in /content/drive/My Drive/archive-5/valid/labels: 1704
Total images in /content/drive/My Drive/archive-5/test/images: 100
Total labels in /content/drive/My Drive/archive-5/test/labels: 100


In [None]:
from collections import defaultdict

def analyze_class_distribution(labels_dir):
    class_counts = defaultdict(int)
    label_files = [f for f in os.listdir(labels_dir) if f.endswith('.txt')]

    for file_name in label_files:
        file_path = os.path.join(labels_dir, file_name)
        with open(file_path, 'r') as file:
            for line in file.readlines():
                class_id = int(line.split()[0])
                class_counts[class_id] += 1

    return class_counts

print("Training Set Class Distribution:")
print(analyze_class_distribution(train_labels_dir))
print("\nValidation Set Class Distribution:")
print(analyze_class_distribution(valid_labels_dir))
print("\nTest Set Class Distribution:")
print(analyze_class_distribution(test_labels_dir))


Training Set Class Distribution:
defaultdict(<class 'int'>, {12: 2228, 17: 1419, 1: 1318, 2: 821, 5: 662, 6: 2214, 4: 714, 15: 371, 9: 356, 7: 583, 16: 304, 0: 142, 10: 751, 8: 252, 3: 376, 11: 369, 14: 225, 13: 145})

Validation Set Class Distribution:
defaultdict(<class 'int'>, {7: 186, 14: 125, 12: 854, 13: 90, 4: 267, 1: 459, 8: 93, 5: 263, 11: 178, 17: 569, 6: 565, 2: 320, 10: 265, 0: 62, 15: 120, 16: 113, 9: 178, 3: 123})

Test Set Class Distribution:
defaultdict(<class 'int'>, {12: 59, 10: 28, 17: 38, 1: 26, 5: 30, 4: 19, 14: 9, 7: 19, 13: 2, 15: 12, 6: 55, 16: 6, 11: 10, 9: 7, 0: 6, 8: 4, 3: 51, 2: 13})


In [None]:
import torch
import torch.nn as nn

class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))

class YOLOv7(nn.Module):
    def __init__(self, in_channels=3, num_classes=17):
        super().__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.layers = self._create_layers()

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

    def _create_layers(self):
        layers = []
        in_channels = self.in_channels

        architecture = [
            (7, 64, 2, 3), "M",
            (3, 192, 1, 1), "M",
            (1, 128, 1, 0), (3, 256, 1, 1), (1, 256, 1, 0), (3, 512, 1, 1), "M",
            [(1, 256, 1, 0), (3, 512, 1, 1), 4],
            (1, 512, 1, 0), (3, 1024, 1, 1), "M",
            [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
            (3, 1024, 1, 1), (3, 1024, 2, 1), (3, 1024, 1, 1), (3, 1024, 1, 1)
        ]

        for x in architecture:
            if type(x) == tuple:
                layers.append(CNNBlock(in_channels, x[1], kernel_size=x[0], stride=x[2], padding=x[3]))
                in_channels = x[1]
            elif type(x) == str:
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            elif type(x) == list:
                conv1, conv2, num_repeats = x
                for _ in range(num_repeats):
                    layers.append(CNNBlock(in_channels, conv1[1], kernel_size=conv1[0], stride=conv1[2], padding=conv1[3]))
                    layers.append(CNNBlock(conv1[1], conv2[1], kernel_size=conv2[0], stride=conv2[2], padding=conv2[3]))
                    in_channels = conv2[1]

        layers.append(nn.Flatten())
        # Calculating the output size assuming the input image size is divisible by 32
        output_grid_size = 7  # This is a simplification, the actual grid size depends on input dimensions
        layers.append(nn.Linear(1024 * output_grid_size * output_grid_size, output_grid_size * output_grid_size * (self.num_classes + 5 * 2)))  # B=2

        return nn.Sequential(*layers)

# Example use
model = YOLOv7()
print(model)


YOLOv7(
  (layers): Sequential(
    (0): CNNBlock(
      (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (batchnorm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (leakyrelu): LeakyReLU(negative_slope=0.1)
    )
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): CNNBlock(
      (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (leakyrelu): LeakyReLU(negative_slope=0.1)
    )
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): CNNBlock(
      (conv): Conv2d(192, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (leakyrelu): LeakyReLU(negative_slope=0.1)
    )
    (5): CNNBlock

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os

class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))

class YOLOv7(nn.Module):
    def __init__(self, in_channels=3, num_classes=17):
        super().__init__()
        self.layers = nn.Sequential(
            CNNBlock(in_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.MaxPool2d(2, 2),
            CNNBlock(64, 192, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            CNNBlock(192, 128, kernel_size=1),
            CNNBlock(128, 256, kernel_size=3, padding=1),
            CNNBlock(256, 256, kernel_size=1),
            CNNBlock(256, 512, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            CNNBlock(512, 1024, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            CNNBlock(1024, 1024, kernel_size=3, padding=1),
            CNNBlock(1024, 1024, kernel_size=3, padding=1),
            nn.Flatten()
        )
        self.fc = self._create_fcs(1024, num_classes)

    def _create_fcs(self, final_conv_out_channels, num_classes):
        dummy_input = torch.zeros(1, 3, 416, 416)
        with torch.no_grad():
            dummy_output = self.layers(dummy_input)
        flattened_size = dummy_output.shape[1]
        return nn.Sequential(
            nn.Linear(flattened_size, 4096),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, 7*7*(num_classes + 5*2))  # for grid 7x7, 2 boxes per grid, and num_classes classes
        )

    def forward(self, x):
        x = self.layers(x)
        return self.fc(x)

class CustomDataset(Dataset):
    def __init__(self, image_dir, label_dir, transform=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transform = transform
        self.images = [os.path.join(image_dir, img) for img in os.listdir(image_dir) if img.endswith('.jpg')]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        label_path = os.path.join(self.label_dir, img_path.split('/')[-1].replace('.jpg', '.txt'))
        image = Image.open(img_path).convert('RGB')
        boxes = []
        with open(label_path, 'r') as f:
            for line in f:
                boxes.append(list(map(float, line.strip().split())))
        if self.transform:
            image = self.transform(image)
        boxes = torch.tensor(boxes)
        return image, boxes

def collate_fn(batch):
    images, targets = zip(*batch)
    images = torch.stack(images)
    targets = [torch.as_tensor(t, dtype=torch.float32) for t in targets]
    targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=-1)
    return images, targets

transform = transforms.Compose([
    transforms.Resize((416, 416)),
    transforms.ToTensor(),
])

base_dir = '/content/drive/My Drive/archive-5'
train_images_dir = os.path.join(base_dir, 'train', 'images')
train_labels_dir = os.path.join(base_dir, 'train', 'labels')
train_dataset = CustomDataset(train_images_dir, train_labels_dir, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

model = YOLOv7(num_classes=17)
model.train()

optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, loader, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        for images, targets in loader:
            optimizer.zero_grad()
            outputs = model(images)
            loss = yolo_loss(outputs, targets, model)
            loss.backward()
            optimizer.step()
            print(f'Epoch {epoch+1}: Loss {loss.item()}')

def yolo_loss(outputs, targets, model):
    return ((outputs - targets)**2).mean()  # Simplified for now, needs proper implementation

train(model, train_loader, optimizer)


RuntimeError: The size of tensor a (1323) must match the size of tensor b (5) at non-singleton dimension 2

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os

class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))

class YOLOv7(nn.Module):
    def __init__(self, in_channels=3, num_classes=17):
        super().__init__()
        self.layers = nn.Sequential(
            CNNBlock(in_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.MaxPool2d(2, 2),
            CNNBlock(64, 192, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            CNNBlock(192, 128, kernel_size=1),
            CNNBlock(128, 256, kernel_size=3, padding=1),
            CNNBlock(256, 256, kernel_size=1),
            CNNBlock(256, 512, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            CNNBlock(512, 1024, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            CNNBlock(1024, 1024, kernel_size=3, padding=1),
            CNNBlock(1024, 1024, kernel_size=3, padding=1),
            nn.Flatten()
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(173056, 4096),  # Adjusted based on the debug print
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, 7*7*(num_classes + 5*2))  # Grid size * Grid size * (Classes + 5 * Bounding boxes)
        )

    def forward(self, x):
        x = self.layers(x)
        print(f"Shape before FC layers: {x.shape}")  # Debug statement
        x = self.fc_layers(x)
        return x

class CustomDataset(Dataset):
    def __init__(self, image_dir, label_dir, transform=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transform = transform
        self.images = [os.path.join(image_dir, img) for img in os.listdir(image_dir) if img.endswith('.jpg')]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        label_path = os.path.join(self.label_dir, img_path.split('/')[-1].replace('.jpg', '.txt'))
        image = Image.open(img_path).convert('RGB')
        labels = []
        with open(label_path, 'r') as f:
            for line in f:
                labels.append(list(map(float, line.strip().split())))
        if self.transform:
            image = self.transform(image)
        return image, labels

def collate_fn(batch):
    images, labels = zip(*batch)
    images = torch.stack(images)
    max_boxes = max(len(label) for label in labels)
    padded_labels = torch.zeros((len(labels), max_boxes, 5))  # Assume 5 elements per label
    for i, label in enumerate(labels):
        if label:
            padded_labels[i, :len(label)] = torch.tensor(label)
    return images, padded_labels

transform = transforms.Compose([
    transforms.Resize((416, 416)),
    transforms.ToTensor(),
])

base_dir = '/content/drive/My Drive/archive-5'
train_images_dir = os.path.join(base_dir, 'train', 'images')
train_labels_dir = os.path.join(base_dir, 'train', 'labels')
train_dataset = CustomDataset(train_images_dir, train_labels_dir, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

model = YOLOv7(num_classes=17)
model.train()

optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, loader, optimizer, epochs=5):
    for epoch in range(epochs):
        for images, targets in loader:
            optimizer.zero_grad()
            outputs = model(images)
            loss = ((outputs - targets)**2).mean()  # Placeholder for actual YOLO loss
            loss.backward()
            optimizer.step()
            print(f'Epoch {epoch+1}: Loss {loss.item()}')

train(model, train_loader, optimizer)


Shape before FC layers: torch.Size([16, 173056])


RuntimeError: The size of tensor a (1323) must match the size of tensor b (5) at non-singleton dimension 2

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os

class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))

class YOLOv7(nn.Module):
    def __init__(self, in_channels=3, num_classes=17):
        super().__init__()
        self.layers = nn.Sequential(
            CNNBlock(in_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.MaxPool2d(2, 2),
            CNNBlock(64, 192, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            CNNBlock(192, 128, kernel_size=1),
            CNNBlock(128, 256, kernel_size=3, padding=1),
            CNNBlock(256, 256, kernel_size=1),
            CNNBlock(256, 512, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            CNNBlock(512, 1024, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            CNNBlock(1024, 1024, kernel_size=3, padding=1),
            CNNBlock(1024, 1024, kernel_size=3, padding=1),
            nn.Flatten()
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(1024 * 6 * 6, 4096),  # Based on flattened output from previous statement
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, 7*7*(num_classes + 5*2))  # For 7x7 grid, num_classes, and 5 bbox attributes
        )

    def forward(self, x):
        x = self.layers(x)
        x = self.fc_layers(x)
        return x.view(-1, 7, 7, num_classes + 5*2)  # Reshape to match target format

class CustomDataset(Dataset):
    def __init__(self, image_dir, label_dir, transform=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transform = transform
        self.images = [os.path.join(image_dir, img) for img in os.listdir(image_dir) if img.endswith('.jpg')]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        label_path = os.path.join(self.label_dir, img_path.split('/')[-1].replace('.jpg', '.txt'))
        image = Image.open(img_path).convert('RGB')
        labels = []
        with open(label_path, 'r') as f:
            for line in f:
                labels.append(list(map(float, line.strip().split())))
        if self.transform:
            image = self.transform(image)
        return image, labels

def collate_fn(batch):
    images, labels = zip(*batch)
    images = torch.stack(images)
    max_boxes = max(len(label) for label in labels)
    padded_labels = torch.zeros((len(labels), 7, 7, 5 + 17))  # Adjust for grid cells and class counts
    for i, label_list in enumerate(labels):
        for label in label_list:
            class_idx, x, y, w, h = label  # Assuming label format is [class_idx, x, y, w, h]
            grid_x = int(x * 7)
            grid_y = int(y * 7)
            padded_labels[i, grid_y, grid_x, :5] = torch.tensor([x, y, w, h, 1])
            padded_labels[i, grid_y, grid_x, 5 + int(class_idx)] = 1

    return images, padded_labels.view(-1, 7*7*(17 + 5))  # Flatten to match the output shape

transform = transforms.Compose([
    transforms.Resize((416, 416)),
    transforms.ToTensor(),
])

base_dir = '/content/drive/My Drive/archive-5'
train_images_dir = os.path.join(base_dir, 'train', 'images')
train_labels_dir = os.path.join(base_dir, 'train', 'labels')
train_dataset = CustomDataset(train_images_dir, train_labels_dir, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

model = YOLOv7(num_classes=17)
model.train()

optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, loader, optimizer, epochs=5):
    for epoch in range(epochs):
        for images, targets in loader:
            optimizer.zero_grad()
            outputs = model(images)
            loss = ((outputs - targets)**2).mean()  # Placeholder for actual YOLO loss
            loss.backward()
            optimizer.step()
            print(f'Epoch {epoch+1}: Loss {loss.item()}')

train(model, train_loader, optimizer)


IndexError: index 22 is out of bounds for dimension 3 with size 22

In [None]:
|import os
from PIL import Image

base_dir = '/content/drive/My Drive/archive-5'
subsets = ['train', 'valid', 'test']

# Check for any non-image or non-label files and missing pairs
for subset in subsets:
    images_dir = os.path.join(base_dir, subset, 'images')
    labels_dir = os.path.join(base_dir, subset, 'labels')
    images = set(os.listdir(images_dir))
    labels = set(os.listdir(labels_dir))

    # Check for non-JPG images or non-txt label files
    non_jpgs = [img for img in images if not img.endswith('.jpg')]
    non_txts = [label for label in labels if not label.endswith('.txt')]

    # Check for missing corresponding pairs
    image_without_label = [img for img in images if img.replace('.jpg', '.txt') not in labels]
    label_without_image = [label for label in labels if label.replace('.txt', '.jpg') not in images]

    print(f"{subset.upper()} SET:")
    print(f"Non-JPG files: {non_jpgs}")
    print(f"Non-txt label files: {non_txts}")
    print(f"Images without labels: {image_without_label}")
    print(f"Labels without images: {label_without_image}")
    print('-'*50)

# Optionally, check image dimensions
image_dimensions = set()
for image_file in os.listdir(os.path.join(base_dir, 'train', 'images')):
    with Image.open(os.path.join(base_dir, 'train', 'images', image_file)) as img:
        image_dimensions.add(img.size)

print("Unique image dimensions:", image_dimensions)


TRAIN SET:
Non-JPG files: []
Non-txt label files: []
Images without labels: []
Labels without images: []
--------------------------------------------------
VALID SET:
Non-JPG files: []
Non-txt label files: []
Images without labels: []
Labels without images: []
--------------------------------------------------
TEST SET:
Non-JPG files: []
Non-txt label files: []
Images without labels: []
Labels without images: []
--------------------------------------------------


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn

# Function to calculate the output size of each layer
def calculate_output_size(model, input_size):
    with torch.no_grad():  # No need to track gradients here
        input_tensor = torch.rand(1, *input_size)  # Generate a random input tensor
        output = model(input_tensor)
        return output.shape  # Return the shape of the output

# Assuming your model is defined as follows
class TestModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(192, 128, kernel_size=1),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.Conv2d(256, 256, kernel_size=1),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(512, 1024, kernel_size=3, padding=1),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
            nn.Flatten()
        )

    def forward(self, x):
        return self.layers(x)

# Create an instance of the model
model = TestModel()

# Calculate and print the output size
output_size = calculate_output_size(model, (3, 416, 416))  # Assuming the input images are 416x416
print("Output size:", output_size)


Output size: torch.Size([1, 173056])


In [None]:
import os
from PIL import Image

def calculate_average_image_size(image_dir):
    widths, heights = [], []
    image_files = [os.path.join(image_dir, img) for img in os.listdir(image_dir) if img.endswith('.jpg')]
    for image_file in image_files:
        with Image.open(image_file) as img:
            width, height = img.size
            widths.append(width)
            heights.append(height)
    avg_width = sum(widths) / len(widths)
    avg_height = sum(heights) / len(heights)
    return avg_width, avg_height

# Set the base directory for your dataset
base_dir = '/content/drive/My Drive/archive-5'
train_images_dir = os.path.join(base_dir, 'train', 'images')

# Calculate and print the average image size
average_width, average_height = calculate_average_image_size(train_images_dir)
print(f'Average Image Width: {average_width}, Average Image Height: {average_height}')


Average Image Width: 416.0, Average Image Height: 416.0


In [None]:
# First, simulate the forwarding to get the output dimension
def get_conv_output_dim(input_dim, kernel_size, padding, stride):
    output_dim = (input_dim + 2 * padding - (kernel_size - 1) - 1) // stride + 1
    return output_dim

# Given your model architecture and initial input size of 416x416:
input_dim = 416
layer_configs = [
    (7, 2, 3),  # kernel_size, stride, padding
    (2, 2, 0),  # pooling
    (3, 1, 1),
    (2, 2, 0),  # pooling
    (1, 1, 0),
    (3, 1, 1),
    (1, 1, 0),
    (3, 1, 1),
    (2, 2, 0),  # pooling
    (1, 1, 0),
    (3, 1, 1),
    (1, 1, 0),
    (3, 1, 1),
    (2, 2, 0),  # pooling
    (3, 1, 1),
    (1, 1, 0),
    (3, 1, 1),
    (3, 1, 1),
    (2, 2, 0),  # pooling
    (3, 1, 1)
]

for kernel_size, stride, padding in layer_configs:
    if kernel_size == 2:  # pooling layer
        input_dim = get_conv_output_dim(input_dim, kernel_size, padding, stride)
    else:  # convolution layer
        input_dim = get_conv_output_dim(input_dim, kernel_size, padding, stride)

print("Final output dimension per dimension:", input_dim)  # This should give you the size per dimension (height or width)


Final output dimension per dimension: 6
