In [4]:
!pip install albumentations

Collecting albumentations
  Downloading albumentations-1.4.17-py3-none-any.whl.metadata (38 kB)
Collecting pydantic>=2.7.0 (from albumentations)
  Downloading pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.4/149.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting albucore==0.0.17 (from albumentations)
  Downloading albucore-0.0.17-py3-none-any.whl.metadata (3.1 kB)
Collecting eval-type-backport (from albumentations)
  Downloading eval_type_backport-0.2.0-py3-none-any.whl.metadata (2.2 kB)
Collecting opencv-python-headless>=4.9.0.80 (from albumentations)
  Downloading opencv_python_headless-4.10.0.84-cp37-abi3-macosx_11_0_arm64.whl.metadata (20 kB)
Collecting pydantic-core==2.23.4 (from pydantic>=2.7.0->albumentations)
  Downloading pydantic_core-2.23.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading albumentations-1.4.17-py3-none-any.whl (216 kB)
[2K   [90m━━━━━━━━━━━━

In [26]:
import os
from torch.utils.data import Dataset
from PIL import Image
import numpy as np
import re
import torch

import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import cv2
import random
from albumentations import Compose, Resize, Normalize
from albumentations.pytorch import ToTensorV2


# Helper Function
def extract_number(filename):
    """
    Extracts the numerical part from a filename.
    For example, '2.jpg' returns 2.
    """
    match = re.search(r'(\d+)', filename)
    if match:
        return int(match.group(1))
    else:
        return -1  # Return -1 if no number is found


# Custom Dataset Class
class RubiksCubeDataset(Dataset):
    def __init__(self, root_dir, transforms=None):
        self.root_dir = root_dir
        self.images_dir = os.path.join(root_dir, 'images')
        self.labels_dir = os.path.join(root_dir, 'labels')
        
        self.image_files = sorted([
            f for f in os.listdir(self.images_dir) 
            if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff'))
        ])
        
        self.label_files = []
        for img_file in self.image_files:
            label_file = img_file.rsplit('.', 1)[0] + '.txt'
            label_path = os.path.join(self.labels_dir, label_file)
            if os.path.isfile(label_path):
                self.label_files.append(label_file)
            else:
                raise FileNotFoundError(f"Label file {label_file} not found for image {img_file}")
        
        self.transforms = transforms

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        label_name = self.label_files[idx]
        
        img_path = os.path.join(self.images_dir, img_name)
        image = Image.open(img_path).convert('RGB')
        width, height = image.size
        
        label_path = os.path.join(self.labels_dir, label_name)
        with open(label_path, 'r') as f:
            lines = f.readlines()
        
        mask = np.zeros((height, width), dtype=np.uint8)
        
        for line in lines:
            parts = line.strip().split()
            if len(parts) >= 5:
                class_id = parts[0]
                if class_id != '0':
                    continue  # Skip other classes if any
                try:
                    # Parse YOLO format: class_id x_center y_center width height
                    x_center_norm, y_center_norm, box_width_norm, box_height_norm = map(float, parts[1:5])
                    
                    # Convert normalized coordinates to pixel values
                    x_center = int(x_center_norm * width)
                    y_center = int(y_center_norm * height)
                    box_width = int(box_width_norm * width)
                    box_height = int(box_height_norm * height)
                    
                    # Enforce square by taking the smaller dimension
                    min_dim = min(box_width, box_height)
                    half_min_dim = min_dim // 2
                    
                    # Calculate top-left and bottom-right coordinates for square
                    x1 = x_center - half_min_dim
                    y1 = y_center - half_min_dim
                    x2 = x_center + half_min_dim
                    y2 = y_center + half_min_dim
                    
                    # Ensure coordinates are within image boundaries
                    x1, x2 = max(0, min(x1, width - 1)), max(0, min(x2, width - 1))
                    y1, y2 = max(0, min(y1, height - 1)), max(0, min(y2, height - 1))
                    
                    # Draw filled square on mask
                    cv2.rectangle(mask, (x1, y1), (x2, y2), color=255, thickness=-1)
                except ValueError:
                    print(f"Invalid label format in file {label_name}: '{line}'")
            else:
                print(f"Incomplete label information in file {label_name}: '{line}'")
        
        if self.transforms:
            augmented = self.transforms(image=np.array(image), mask=mask)
            image = augmented['image']
            mask = augmented['mask']
        
        return image, mask

# Define Transformations
transforms = Compose([
    Resize(256, 256),
    Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

# Initialize Dataset
try:
    dataset = RubiksCubeDataset(root_dir='archive', transforms=transforms)  # Update this path if necessary
    print(f"Dataset loaded successfully with {len(dataset)} samples.")
except FileNotFoundError as e:
    print(e)
    exit(1)

# Visualization Function
def visualize_samples(dataset, num_samples=5):
    indices = random.sample(range(len(dataset)), num_samples)
    
    for idx in indices:
        image, mask = dataset[idx]
        
        image_vis = image.permute(1, 2, 0).numpy()
        image_vis = np.clip(image_vis, 0, 1)
        
        mask_np = mask.numpy()
        mask_np = (mask_np > 0).astype(np.float32)
        
        # Create an overlay where everything outside the mask is black
        overlay = np.zeros_like(image_vis)
        overlay[mask_np == 1] = image_vis[mask_np == 1]
        
        plt.figure(figsize=(15, 5))
        
        plt.subplot(1, 3, 1)
        plt.title('Image')
        plt.imshow(image_vis)
        plt.axis('off')
        
        plt.subplot(1, 3, 2)
        plt.title('Mask')
        plt.imshow(mask_np, cmap='gray')
        plt.axis('off')
        
        plt.subplot(1, 3, 3)
        plt.title('Overlay')
        plt.imshow(overlay)
        plt.axis('off')
        
        plt.show()

# Visualize Samples
visualize_samples(dataset, num_samples=5)

Label file 72 (1).txt not found for image 72 (1).jpg


KeyError: 'You have to pass data to augmentations as named arguments, for example: aug(image=image)'

In [13]:
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torch

# Define a transformation function
def get_transform(train=True):
    if train:
        return A.Compose([
            A.HorizontalFlip(0.5),
            A.RandomBrightnessContrast(p=0.2),
            A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15, p=0.3),
            A.Resize(256, 256),
            A.Normalize(mean=(0.485, 0.456, 0.406),
                        std=(0.229, 0.224, 0.225)),
            ToTensorV2(),
        ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']))
    else:
        return A.Compose([
            A.Resize(256, 256),
            A.Normalize(mean=(0.485, 0.456, 0.406),
                        std=(0.229, 0.224, 0.225)),
            ToTensorV2(),
        ], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']))

In [14]:
from torch.utils.data import random_split

# Initialize the dataset with transformations
dataset = RubiksCubeDataset(
    root_dir='archive',  # Update this path if necessary
    transforms=get_transform(train=True)
)

# Determine lengths for splits
total_size = len(dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size

# Perform the split
train_dataset, val_dataset, test_dataset = random_split(
    dataset, 
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42)  # Ensure reproducibility
)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Testing samples: {len(test_dataset)}")

Found 988 image-label pairs.
Training samples: 790
Validation samples: 98
Testing samples: 100


In [20]:
from torch.utils.data import DataLoader

# Define the batch size
batch_size = 8

# Create DataLoaders
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True,  # Shuffle for training
    num_workers=4,
    collate_fn=lambda x: tuple(zip(*x))  # Necessary for object detection
)

val_loader = DataLoader(
    val_dataset, 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=4,
    collate_fn=lambda x: tuple(zip(*x))
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=4,
    collate_fn=lambda x: tuple(zip(*x))
)

print("DataLoaders created successfully.")

DataLoaders created successfully.


In [24]:
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# Check if CUDA is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Load a pre-trained Faster R-CNN model
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

# Get the number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features

# Replace the pre-trained head with a new one for our dataset
num_classes = 2  # 1 class (Rubik's Cube) + background
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

# Move the model to the appropriate device
model.to(device)

# Define an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

# Define a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Number of epochs
num_epochs = 10

Using device: cpu


Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /Users/ludvigeriksonbrangstrup/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:15<00:00, 10.6MB/s] 


In [25]:
from torchvision.models.detection import FasterRCNN
from tqdm import tqdm

# Define a utility function to train for one epoch
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=100):
    model.train()
    metric_logger = tqdm(data_loader, desc=f"Epoch {epoch+1}")

    for images, targets in metric_logger:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Forward pass
        loss_dict = model(images, targets)

        # Compute total loss
        losses = sum(loss for loss in loss_dict.values())

        # Backward pass and optimization
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        metric_logger.set_postfix(loss=losses.item())

# Training Loop
for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, train_loader, device, epoch)
    lr_scheduler.step()

    # Save the model after each epoch
    torch.save(model.state_dict(), f"fasterrcnn_epoch_{epoch+1}.pth")

print("Training Completed.")

Epoch 1:   0%|          | 0/99 [00:00<?, ?it/s]


KeyError: 'You have to pass data to augmentations as named arguments, for example: aug(image=image)'