In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# project code (e.g. src/dataset.py) are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

## Flow
1. EDA
2. Decide model to use
3. Prepare dataset to fit model format and do image augmentations
4. Do a train/test split
5. Use dataloader to split dataset into batches for feeding into model
6. Initialize model
7. Train model
8. Validate model
9. Inference (predict on test.csv)

In [None]:
from src.dataset import wheatDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import torch
from torchvision.models.detection import (
    fasterrcnn_resnet50_fpn,
    FasterRCNN_ResNet50_FPN_Weights,
)
from torchvision.models import ResNet50_Weights
import torch.optim as optim
import os

In [3]:
# Image size handed to the dataset class, and the locations of the training
# images and their annotation CSV
image_size = 1024
image_dir = "./data/global-wheat-detection/train"
csv_dir = "./data/global-wheat-detection/train.csv"

# Read the bounding-box annotations and preview the first rows
df = pd.read_csv(filepath_or_buffer=csv_dir)
df.head()

Unnamed: 0,image_id,width,height,bbox,source
0,b6ab77fd7,1024,1024,"[834.0, 222.0, 56.0, 36.0]",usask_1
1,b6ab77fd7,1024,1024,"[226.0, 548.0, 130.0, 58.0]",usask_1
2,b6ab77fd7,1024,1024,"[377.0, 504.0, 74.0, 160.0]",usask_1
3,b6ab77fd7,1024,1024,"[834.0, 95.0, 109.0, 107.0]",usask_1
4,b6ab77fd7,1024,1024,"[26.0, 144.0, 124.0, 117.0]",usask_1


In [4]:
# Hold out 20% of the unique image ids for validation; splitting on ids (not
# rows) keeps every box of an image on the same side of the split
unique_image_ids = df["image_id"].unique().tolist()
train_ids, val_ids = train_test_split(
    unique_image_ids, test_size=0.2, random_state=42, shuffle=True
)

# Select the annotation rows that belong to each set of image ids
in_train = df["image_id"].isin(train_ids)
in_val = df["image_id"].isin(val_ids)
train_df = df.loc[in_train]
val_df = df.loc[in_val]

# Sanity-check the size of each split
print(f"Train images: {len(train_ids)}")
print(f"Val images: {len(val_ids)}")
print(f"Train bbox rows: {len(train_df)}")
print(f"Val bbox rows: {len(val_df)}")

# Average number of boxes per image in each split
print(f"Avg boxes per image (train): {len(train_df) / len(train_ids):.2f}")
print(f"Avg boxes per image (val): {len(val_df) / len(val_ids):.2f}")

# No image id may appear in both splits
assert set(train_ids).isdisjoint(val_ids), "Data leakage detected!"

Train images: 2698
Val images: 675
Train bbox rows: 118371
Val bbox rows: 29422
Avg boxes per image (train): 43.87
Avg boxes per image (val): 43.59


In [5]:
# Wrap each dataframe in the project's wheatDataset; indexing a dataset yields
# an (image tensor, target dict) pair, as unpacked below
train_dataset = wheatDataset(train_df, image_dir, image_size, mode="train")
val_dataset = wheatDataset(val_df, image_dir, image_size, mode="validation")

# Pull the first sample of each dataset for inspection
train_image, train_target = train_dataset[0]
val_image, val_target = val_dataset[0]

# Check if correct: image tensor shape and the shape of the "boxes" tensor
print(f"Train image shape: {train_image.shape}")
print(f"Train dataset boxes shape: {train_target['boxes'].shape}")

print(f"Val image shape: {val_image.shape}")
print(f"Val dataset boxes shape: {val_target['boxes'].shape}")

Train image shape: torch.Size([3, 1024, 1024])
Train dataset boxes shape: torch.Size([24, 4])
Val image shape: torch.Size([3, 1024, 1024])
Val dataset boxes shape: torch.Size([47, 4])


In [6]:
# Custom collate function: per-image box tensors have different lengths, so the
# default collation (which stacks everything) cannot be used for the targets
def collate_fn(batch):
    """Collate dataset samples into a batched image tensor and a target list.

    Args:
        batch: Sequence of samples, each indexable as (image, target).
            All images must share one shape so they can be stacked.

    Returns:
        Tuple ``(images, targets)``: ``images`` is the samples' images stacked
        along a new leading batch dimension; ``targets`` is a plain list of
        the samples' targets, in the same order.
    """
    image_list = []
    target_list = []
    for sample in batch:
        image_list.append(sample[0])
        target_list.append(sample[1])

    # Only the images are stacked; targets stay a list
    return torch.stack(image_list), target_list

In [7]:
# Batch the datasets for the model. collate_fn stacks the images and keeps the
# per-image target dicts as a list.
train_loader = DataLoader(
    train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn
)

# Validation batches are not shuffled: ordering has no benefit for evaluation,
# and a fixed order makes next(iter(val_loader)) return the same batch each time
val_loader = DataLoader(
    val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn
)

# Peek at a single training batch to confirm the batch structure
for images, targets in train_loader:
    print(images.shape)
    print(len(images))
    print(type(targets))
    print(len(targets))
    print(targets[0].keys())
    print(targets[0]["boxes"].shape)
    break

torch.Size([2, 3, 1024, 1024])
2
<class 'list'>
2
dict_keys(['boxes', 'area', 'labels', 'iscrowd'])
torch.Size([50, 4])


In [8]:
# Report the installed PyTorch version and whether the MPS backend is available
print(f"PyTorch version: {torch.__version__}")
print(f"MPS available: {torch.backends.mps.is_available()}")

PyTorch version: 2.9.0
MPS available: True


In [9]:
# Run on the CPU
# NOTE(review): the previous cell reports MPS as available, yet "cpu" is
# selected here — presumably deliberate; confirm before switching devices
device = torch.device("cpu")

# Initialize model: Faster R-CNN with no detector weights loaded, the default
# ResNet-50 backbone weights, and num_classes=2
# (presumably background + wheat head — confirm)
model = fasterrcnn_resnet50_fpn(
    weights=None, weights_backbone=ResNet50_Weights.DEFAULT, num_classes=2
)

# Bring model to device
model = model.to(device)

# Set model to train mode (as the cell's last expression this also displays the model)
model.train()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
          (relu

In [10]:
# Smoke test: push one training batch through a forward and backward pass
images, targets = next(iter(train_loader))

# Move the image batch and every tensor inside each target dict to the device
images = images.to(device)
targets = [
    {name: tensor.to(device) for name, tensor in tgt.items()} for tgt in targets
]

# With targets supplied in training mode, the model returns a dict of losses
loss_dict = model(images, targets)

# Add the loss components into a single scalar
losses = sum(loss_dict.values())
print(f"Total loss: {losses.item()}")
print(f"Loss components: {loss_dict}")

# Backpropagate to confirm gradients can be computed
losses.backward()
print("Backward pass completed")

Total loss: 1.7287588119506836
Loss components: {'loss_classifier': tensor(0.7036, grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(0.1418, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.7361, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(0.1472, grad_fn=<DivBackward0>)}
Backward pass completed


In [11]:
# Training hyperparameters
lr = 0.0001
num_epochs = 2
# Log the batch loss every `print_freq` batches
print_freq = 50

# AdamW over all model parameters
optimizer = optim.AdamW(params=model.parameters(), lr=lr)

In [None]:
# Training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1} / {num_epochs}")

    # Training mode: the model returns a dict of losses when given targets
    model.train()

    for batch_idx, (images, targets) in enumerate(train_loader):
        # Reset gradients left over from the previous step
        optimizer.zero_grad()

        # Move the batch (images and every target tensor) to the device
        images = images.to(device)
        targets = [
            {name: tensor.to(device) for name, tensor in tgt.items()}
            for tgt in targets
        ]

        # Forward pass, then add the loss components into one scalar
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())

        # Backpropagate and apply the parameter update
        losses.backward()
        optimizer.step()

        # Only report every print_freq-th batch
        if batch_idx % print_freq != 0:
            continue
        print(f"Batch index: {batch_idx}/{len(train_loader)}")
        print(f"Loss: {losses.item():.4f}")

Epoch 1 / 2
Batch index: 0/1349
Loss: 2.3506
Batch index: 50/1349
Loss: 1.4337
Batch index: 100/1349
Loss: 1.6045
Batch index: 150/1349
Loss: 1.4779
Batch index: 200/1349
Loss: 1.1758
Batch index: 250/1349
Loss: 1.1516
Batch index: 300/1349
Loss: 1.2481
Batch index: 350/1349
Loss: 0.9951
Batch index: 400/1349
Loss: 1.0389
Batch index: 450/1349
Loss: 0.9663
Batch index: 500/1349
Loss: 1.0426
Batch index: 550/1349
Loss: 1.0050
Batch index: 600/1349
Loss: 1.0232
Batch index: 650/1349
Loss: 0.8992
Batch index: 700/1349
Loss: 0.9892
Batch index: 750/1349
Loss: 0.9082
Batch index: 800/1349
Loss: 0.9075
Batch index: 850/1349
Loss: 0.8665
Batch index: 900/1349
Loss: 0.8570
Batch index: 950/1349
Loss: 0.8995
Batch index: 1000/1349
Loss: 0.8605
Batch index: 1050/1349
Loss: 1.1186
Batch index: 1100/1349
Loss: 0.9336
Batch index: 1150/1349
Loss: 0.9503
Batch index: 1200/1349
Loss: 0.9945
Batch index: 1250/1349
Loss: 0.8562
Batch index: 1300/1349
Loss: 1.0820
Epoch 2 / 2
Batch index: 0/1349
Loss: 0

In [None]:
# Validation loop: average the detection loss over the validation set
val_losses = 0.0

# torchvision detection models only return the loss dict in training mode. In
# eval mode, model(images, targets) returns a list of per-image predictions, so
# loss_dict.values() would fail. Keep the model in training mode for the loss
# computation; torch.no_grad() ensures no gradients are computed and nothing
# here updates the weights.
# NOTE(review): the printed model shows FrozenBatchNorm2d layers in the
# backbone, so train mode should not alter normalization statistics — confirm
# for the rest of the model.
model.train()

with torch.no_grad():
    for images, targets in val_loader:
        # Move the batch (images and every target tensor) to the device
        images = images.to(device)
        targets = [
            {key: value.to(device) for key, value in target.items()}
            for target in targets
        ]

        # Compute loss
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # Accumulate the batch loss
        val_losses += losses.item()

# Leave the model in evaluation mode once the losses are collected
model.eval()

# Mean loss per validation batch
avg_val_loss = val_losses / len(val_loader)
print(f"Val_loss: {avg_val_loss:.4f}")


In [None]:
# Combine together: train, validate and checkpoint the best model per epoch
# Make directory to save models
os.makedirs("./checkpoints/models", exist_ok=True)


## Add tqdm next time
# A fresh optimizer is created here; the model keeps whatever weights it has
# when this cell runs
optimizer = optim.AdamW(model.parameters(), lr=lr)

# Initialize best val loss
best_val_loss = float("inf")

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1} / {num_epochs}")

    # Set model to train mode
    model.train()

    ## Initialize tracking variables (running sums of per-batch losses)
    train_losses = 0.0
    val_losses = 0.0

    for batch_idx, (images, targets) in enumerate(train_loader):
        # Load train_data into device
        images = images.to(device)
        targets = [
            {key: value.to(device) for key, value in target.items()}
            for target in targets
        ]

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        loss_dict = model(images, targets)

        # Convert loss dict to scalar
        losses = sum(loss for loss in loss_dict.values())

        # Backward pass to compute gradients
        losses.backward()

        # Gradient update with optimizer
        optimizer.step()

        # Calculate train tracking metrics
        train_losses += losses.item()

        # Print batch_idx and loss
        if batch_idx % print_freq == 0:
            print(f"Batch index: {batch_idx}/{len(train_loader)}")
            print(f"Batch loss: {losses.item():.4f}")

    # Validation pass. The model is deliberately kept in training mode:
    # torchvision detection models return the loss dict only in training mode
    # (in eval mode they return predictions, and loss_dict.values() would
    # fail). torch.no_grad() ensures no gradients are computed and no weights
    # are updated here.
    with torch.no_grad():
        for images, targets in val_loader:
            images = images.to(device)
            targets = [
                {key: value.to(device) for key, value in target.items()}
                for target in targets
            ]

            # Compute loss
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # Accumulate the batch loss
            val_losses += losses.item()

    # Switch to evaluation mode once the validation losses are collected
    model.eval()

    # Calculate tracking metrics (mean loss per batch)
    avg_train_loss = train_losses / len(train_loader)
    avg_val_loss = val_losses / len(val_loader)

    # Print metrics
    print(f"Train_loss: {avg_train_loss:.4f}")
    print(f"Val_loss: {avg_val_loss:.4f}")

    # Save model if avg_val_loss improves
    ## Save to mlflow next time
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # NOTE(review): the model weights are stored under "model_save_dict"
        # (not the more common "model_state_dict"); any loading code must use
        # this same key
        torch.save(
            {
                "epoch": epoch,
                "model_save_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "train_loss": avg_train_loss,
                "val_loss": avg_val_loss,
            },
            "./checkpoints/models/best_model.pth",
        )
        print(f"Saved best model (val_loss: {avg_val_loss:.4f})")

In [None]:
# Inference (testing first)

# Get one batch from val_loader
images, targets = next(iter(val_loader))

# Move the images to the same device as the model, as the training and
# validation cells do; without this the call fails once device is not the CPU
images = images.to(device)

# Set to eval mode: called without targets, the model returns predictions
model.eval()

# Inference
with torch.no_grad():
    predictions = model(images)

# Inspect output: one prediction dict per input image
print(f"Number of images: {len(predictions)}")
print(f"First prediction keys: {predictions[0].keys()}")
print(f"Boxes shape: {predictions[0]['boxes'].shape}")
print(f"Scores: {predictions[0]['scores']}")
print(f"Labels: {predictions[0]['labels']}")

Steps:
1. Prepare data with dataset
2. Train test split
3. Dataloader setup
4. Model setup
5. Training loop
6. Validation
7. Inference