## Imports

In [1]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)

from transformers import AutoImageProcessor, Swinv2Model
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

2.5.1
True
11.8


  from .autonotebook import tqdm as notebook_tqdm


## Load CBD dataset (for training)

### Transformation
 - resize to 244,244
 - normalized based on calculations from training set
 - converted to tensors

In [2]:
import sys
sys.path.append('../dataset')
from dataset_parser import XMLDataset
import albumentations
import numpy as np
import os
from PIL import Image


  check_for_updates()


In [3]:
#calculation mean and std of training set for normalization

def compute_dataset_stats(image_dir):
    mean = np.zeros(3)
    std = np.zeros(3)
    total_pixels = 0

    # List all files in the directory
    image_filenames = sorted(os.listdir(image_dir))

    for img_filename in image_filenames:
        img_path = os.path.join(image_dir, img_filename)

        # Open the image
        image = Image.open(img_path).convert("RGB")
        image = np.array(image) / 255.0  # Convert to range [0, 1]

        # Compute mean and std
        mean += image.mean(axis=(0, 1))
        std += image.std(axis=(0, 1))
        total_pixels += 1

    # Compute average mean and std across all images
    mean /= total_pixels
    std /= total_pixels
    return mean.tolist(), std.tolist()

# Example usage
image_dir = '../dataset/train_img'
mean, std = compute_dataset_stats(image_dir)
print("Mean:", mean)
print("Std:", std)


Mean: [0.9362358181861065, 0.9420508144749817, 0.9394349808120092]
Std: [0.16755679634617304, 0.15510376581525342, 0.1576581799256157]


In [13]:
from albumentations.pytorch import ToTensorV2

#apply transformations to image and bounding boxes

transform = albumentations.Compose([
    albumentations.Resize(224, 224),
    albumentations.Normalize(mean=mean, std=std),
    ToTensorV2()
    ], bbox_params=albumentations.BboxParams(format='pascal_voc',  label_fields=['category']))



In [14]:
label_map = {
    "text" : 1,
    "arrow" : 2,
    "connection" : 3,
    "data": 4,
    "decision": 5,
    "process" : 6,
    "terminator" : 7
}

train = XMLDataset(image_dir='../dataset/train_img', annotation_dir='../dataset/xml_files/train', label_map=label_map, transform=transform)
test = XMLDataset(image_dir='../dataset/test_img', annotation_dir='../dataset/xml_files/test', label_map=label_map, transform=transform)
validation = XMLDataset(image_dir='../dataset/val_img', annotation_dir='../dataset/xml_files/val', label_map=label_map, transform=transform)
data = train[0]
print(data)

(tensor([[[0.0763, 0.0763, 0.0763,  ..., 0.0763, 0.0763, 0.0763],
         [0.0763, 0.0763, 0.0763,  ..., 0.0763, 0.0763, 0.0763],
         [0.0763, 0.0763, 0.0763,  ..., 0.0763, 0.0763, 0.0763],
         ...,
         [0.0763, 0.0763, 0.0763,  ..., 0.0763, 0.0763, 0.0763],
         [0.0763, 0.0763, 0.0763,  ..., 0.0763, 0.0763, 0.0763],
         [0.0763, 0.0763, 0.0763,  ..., 0.0763, 0.0763, 0.0763]],

        [[0.0449, 0.0449, 0.0449,  ..., 0.0449, 0.0449, 0.0449],
         [0.0449, 0.0449, 0.0449,  ..., 0.0449, 0.0449, 0.0449],
         [0.0449, 0.0449, 0.0449,  ..., 0.0449, 0.0449, 0.0449],
         ...,
         [0.0449, 0.0449, 0.0449,  ..., 0.0449, 0.0449, 0.0449],
         [0.0449, 0.0449, 0.0449,  ..., 0.0449, 0.0449, 0.0449],
         [0.0449, 0.0449, 0.0449,  ..., 0.0449, 0.0449, 0.0449]],

        [[0.0608, 0.0608, 0.0608,  ..., 0.0608, 0.0608, 0.0608],
         [0.0608, 0.0608, 0.0608,  ..., 0.0608, 0.0608, 0.0608],
         [0.0608, 0.0608, 0.0608,  ..., 0.0608, 0.0608, 0

In [15]:
# assert data['objects']['bbox'][0].tolist() == [201., 31., 211., 92.], "First box coordinates do not match"
(image, target) = data
# assert target['labels'][:5] == [7, 6, 6, 6, 5], "First five labels do not match"
assert len(target['area']) == len(target['labels']), "Length of area and category do not match"
assert len(target['boxes']) == len(target['area']), "Length of bbox and area do not match"
# print("new image size", image.shape)



print("Assertions passed successfully!")

Assertions passed successfully!


## Load FCA dataset (for testing)

## Load Swin Transformer

In [16]:
from transformers import AutoFeatureExtractor
# auto image processor does auto image transformation + normalization 
# image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window16-256")
swin_backbone = AutoFeatureExtractor.from_pretrained("microsoft/swinv2-tiny-patch4-window16-256")

In [17]:
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
import torchvision

swin_backbone.out_channels = 768
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),), aspect_ratios=((0.5, 1.0, 2.0),))
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'], output_size = 7, sampling_ratio=2)
model = FasterRCNN(
    backbone=swin_backbone,
    #7 + 1 for background
    num_classes=8,
    rpn_anchor_generator=anchor_generator,
    box_roi_pool=roi_pooler)


In [18]:
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate

# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# def collate_fn(batch):
#     images, targets = zip(*batch)
    
#     # Pad the bounding boxes and labels to the same length
#     max_boxes = max(len(target['boxes']) for target in targets)
    
#     for target in targets:
#         num_boxes = len(target['boxes'])
#         padding = max_boxes - num_boxes
#         if padding > 0:
#             target['boxes'] = torch.cat([target['boxes'], torch.zeros(padding, 4)], dim=0)
#             target['labels'] = torch.cat([target['labels'], torch.zeros(padding)], dim=0)  # Assuming labels are integer values
    
#     return default_collate(images), targets

def collate_fn(batch):
    return tuple(zip(*batch))


# define training and validation data loaders
data_loader = DataLoader(
    train,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn
)

data_loader_val = DataLoader(
    validation,
    batch_size=4,
    shuffle=False,
    collate_fn=collate_fn
)


In [19]:
params = [p for p in model.parameters() if p.requires_grad]

optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=3,
    gamma=0.1
)


In [20]:
#edited from pytorch documentation
def train_one_epoch(model, optimizer, data_loader,  epoch, lr_scheduler):
    try: 
        model.train()
        header = f"Epoch: [{epoch}]"

        for i, data in enumerate(data_loader):
            images, labels = data
            
            loss_dict = model(images, labels)
            losses = sum(loss for loss in loss_dict.values())


            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            lr_scheduler.step()
            
            # Gather data and report
            running_loss += losses.item()
            if i % 1000 == 999:
                last_loss = running_loss / 1000 # loss per batch
                print(header)
                print('  batch {} loss: {}'.format(i + 1, last_loss))
                running_loss = 0.
    except Exception as e:
        # Print the error and the image_id that caused it
        # print(f"Error for image name {labels['image_id']}")
        print(data)
        print(f"Error: {str(e)}")
        # You can return None or raise the error depending on your need
        raise e
        
    return last_loss



In [21]:
# Initializing in a separate cell so we can easily add more epochs to the same run
import datetime
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch= epoch_number, model=model, data_loader=data_loader, optimizer=optimizer, lr_scheduler=lr_scheduler)

    running_vloss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, vdata in enumerate(data_loader_val):
            vinputs, vlabels = vdata
            print(vdata)
            loss_dict = model(vinputs, vlabels)
            vloss = sum(loss for loss in loss_dict.values())
            running_vloss += vloss.item()

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))


    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1
  
        

EPOCH 1:
((tensor([[[0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806],
         [0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806],
         [0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806],
         ...,
         [0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806],
         [0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806],
         [0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806]],

        [[0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736],
         [0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736],
         [0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736],
         ...,
         [0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736],
         [0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736],
         [0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736]],

        [[0.3842, 0.3842, 0.3842,  ..., 0.3842, 0.3842, 0.3842],
         [0.3842, 0.3842, 0.3842,  ..., 0.3842, 0.3842, 0.3842],
         [0.3842, 0.3842, 0.3842,  ..., 0.3842,

ValueError: The image to be converted to a PIL image contains values outside the range [0, 1], got [-27.73265838623047, -0.09709371626377106] which cannot be converted to uint8.

In [None]:
print(validation[0])

(tensor([[[0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806],
         [0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806],
         [0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806],
         ...,
         [0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806],
         [0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806],
         [0.3806, 0.3806, 0.3806,  ..., 0.3806, 0.3806, 0.3806]],

        [[0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736],
         [0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736],
         [0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736],
         ...,
         [0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736],
         [0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736],
         [0.3736, 0.3736, 0.3736,  ..., 0.3736, 0.3736, 0.3736]],

        [[0.3842, 0.3842, 0.3842,  ..., 0.3842, 0.3842, 0.3842],
         [0.3842, 0.3842, 0.3842,  ..., 0.3842, 0.3842, 0.3842],
         [0.3842, 0.3842, 0.3842,  ..., 0.3842, 0.3842, 0