In [1]:
# Cell 1: Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import xml.etree.ElementTree as ET
import torch
import torchvision
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from PIL import Image
import os
import torch.nn as nn

cuda_available = torch.cuda.is_available()

print(f"CUDA Available: {cuda_available}")

if cuda_available:
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available. PyTorch will use the CPU.")

CUDA Available: True
GPU 0: NVIDIA GeForce RTX 3080


In [2]:
# Cell 2: Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, root_dir, split="train", transform=None):
        self.root_dir = root_dir
        self.split = split
        self.transform = transform
        self.images = []
        self.annotations = []
        self.no_annotation_count = 0

        images_dir = os.path.join(root_dir, "raw-images")
        annotations_dir = os.path.join(root_dir, "annotations")

        for subfolder in os.listdir(images_dir):
            subfolder_images_dir = os.path.join(images_dir, subfolder)
            subfolder_annotations_dir = os.path.join(annotations_dir, subfolder)

            if not os.path.isdir(subfolder_annotations_dir):
                continue

            image_files = [f for f in os.listdir(subfolder_images_dir) if f.endswith(".jpg") or f.endswith(".png")]

            image_files.sort()

            num_images = len(image_files)
            if split == "train":
                image_files = image_files[:int(0.7 * num_images)]
            elif split == "val":
                image_files = image_files[int(0.7 * num_images):int(0.9 * num_images)]
            elif split == "test":
                image_files = image_files[int(0.9 * num_images):]
            else:
                raise ValueError(f"Invalid split: {split}")

            for filename in image_files:
                image_path = os.path.join(subfolder_images_dir, filename)
                annotation_path = os.path.join(subfolder_annotations_dir, os.path.splitext(filename)[0] + ".xml")

                if os.path.exists(annotation_path):
                    tree = ET.parse(annotation_path)
                    root = tree.getroot()
                    annotation = []

                    for obj in root.findall("object"):
                        name = obj.find("name").text
                        bbox = obj.find("bndbox")
                        xmin = int(bbox.find("xmin").text)
                        ymin = int(bbox.find("ymin").text)
                        xmax = int(bbox.find("xmax").text)
                        ymax = int(bbox.find("ymax").text)
                        annotation.append((xmin, ymin, xmax, ymax))

                    self.images.append(image_path)
                    self.annotations.append(annotation)
                else:
                    self.no_annotation_count += 1

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        image_path = self.images[index]
        image = Image.open(image_path).convert("L")
        annotation = self.annotations[index]

        if self.transform:
            image = self.transform(image)

        if annotation is not None:
            boxes = torch.as_tensor(annotation, dtype=torch.float32)
            labels = torch.ones((len(annotation),), dtype=torch.int64)
        else:
            boxes = torch.empty((0, 4), dtype=torch.float32)
            labels = torch.empty((0,), dtype=torch.int64)

        return image, {"boxes": boxes, "labels": labels}

In [3]:
 # Test the dataset
dataset_path = "dataset"  
split = "train"  
thermal_transform = transforms.Compose([
    transforms.Resize((800, 800)),
    transforms.RandomHorizontalFlip(0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

thermal_dataset = CustomDataset(dataset_path, split, thermal_transform)
print(f"Number of images in the dataset: {len(thermal_dataset)}")
image, target = thermal_dataset[0]
print(f"Image shape: {image.shape}")
print(f"Target boxes: {target['boxes']}")
print(f"Target labels: {target['labels']}")

Number of images in the dataset: 857
Image shape: torch.Size([1, 800, 800])
Target boxes: tensor([[367., 137., 407., 153.],
        [238., 125., 279., 145.],
        [ 95., 131., 135., 144.],
        [578., 136., 607., 149.]])
Target labels: tensor([1, 1, 1, 1])


In [4]:
# Cell 3: Dataset Preprocessing
def preprocess_dataset(dataset):
    preprocessed_images = []
    preprocessed_annotations = []
    
    for image, target in dataset:
        if isinstance(image, torch.Tensor):
            image = transforms.ToPILImage()(image)
        else:
            image = Image.fromarray(image)
        
        image = thermal_transform(image)
        
        boxes = target['boxes']
        labels = target['labels']
        
        _, height, width = image.shape
        boxes[:, [0, 2]] /= width
        boxes[:, [1, 3]] /= height
        
        target = {'boxes': boxes, 'labels': labels}
        
        preprocessed_images.append(image)
        preprocessed_annotations.append(target)
    
    return preprocessed_images, preprocessed_annotations

# Test the preprocessing function
dataset_path = "dataset"
split = "train"
thermal_dataset = CustomDataset(dataset_path, split=split, transform=thermal_transform)
preprocessed_images, preprocessed_annotations = preprocess_dataset(thermal_dataset)
print(f"Number of preprocessed images: {len(preprocessed_images)}")
print(f"Number of preprocessed annotations: {len(preprocessed_annotations)}")
print(f"Preprocessed image shape: {preprocessed_images[0].shape}")
print(f"Preprocessed annotation boxes shape: {preprocessed_annotations[0]['boxes'].shape}")
print(f"Preprocessed annotation labels shape: {preprocessed_annotations[0]['labels'].shape}")

Number of preprocessed images: 857
Number of preprocessed annotations: 857
Preprocessed image shape: torch.Size([1, 800, 800])
Preprocessed annotation boxes shape: torch.Size([4, 4])
Preprocessed annotation labels shape: torch.Size([4])


In [5]:
# Cell 4: Dataset and DataLoader Creation
dataset_path = "dataset"
split = "train"

thermal_transform = transforms.Compose([
    transforms.Resize((800, 800)),
    transforms.RandomHorizontalFlip(0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

thermal_dataset = CustomDataset(dataset_path, split, thermal_transform)

class_labels = set()
for _, annotation in thermal_dataset:
    for obj in annotation:
        name = obj[0]
        class_labels.add("vehicle")  

class_to_idx = {"vehicle": 0}  
print("Class labels:", class_to_idx)

preprocessed_thermal_images, preprocessed_thermal_annotations = preprocess_dataset(thermal_dataset)

def collate_fn(batch):
    images = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    
    images = torch.stack(images, dim=0)
    
    return images, targets

train_thermal_dataset = list(zip(preprocessed_thermal_images, preprocessed_thermal_annotations))
train_thermal_loader = DataLoader(train_thermal_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

for images, targets in train_thermal_loader:
    print(f"Batch images shape: {images.shape}")
    print(f"Batch targets boxes shape: {targets[0]['boxes'].shape}")
    print(f"Batch targets labels shape: {targets[0]['labels'].shape}")
    break

Class labels: {'vehicle': 0}
Batch images shape: torch.Size([4, 1, 800, 800])
Batch targets boxes shape: torch.Size([2, 4])
Batch targets labels shape: torch.Size([2])


In [6]:
# Cell 5: Model Definition and Training
num_classes = len(class_to_idx) + 1

thermal_model = fasterrcnn_resnet50_fpn(weights="DEFAULT")
in_features = thermal_model.roi_heads.box_predictor.cls_score.in_features
thermal_model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
thermal_model.to(device)

thermal_optimizer = torch.optim.SGD(thermal_model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)

num_epochs = 10

for epoch in range(num_epochs):
    thermal_model.train()
    
    thermal_epoch_loss = 0.0
    
    for thermal_images, thermal_targets in train_thermal_loader:
        thermal_images = list(image.to(device) for image in thermal_images)
        thermal_targets = [{k: v.to(device) for k, v in t.items()} for t in thermal_targets]
        
        thermal_loss_dict = thermal_model(thermal_images, thermal_targets)
        thermal_losses = sum(loss for loss in thermal_loss_dict.values())
        
        thermal_optimizer.zero_grad()
        thermal_losses.backward()
        thermal_optimizer.step()
        
        thermal_epoch_loss += thermal_losses.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Thermal Loss: {thermal_epoch_loss/len(train_thermal_loader):.4f}")

torch.save(thermal_model.state_dict(), "trained_model.pth")

thermal_model.eval()
with torch.no_grad():
    for images, targets in train_thermal_loader:
        images = list(image.to(device) for image in images)
        outputs = thermal_model(images)
        
        for i in range(len(images)):
            boxes = outputs[i]['boxes'].cpu().numpy()
            labels = outputs[i]['labels'].cpu().numpy()
            scores = outputs[i]['scores'].cpu().numpy()
            
            print(f"Image {i+1} - Boxes: {boxes}, Labels: {labels}, Scores: {scores}")
        
        break

Epoch [1/10], Thermal Loss: 0.3427
Epoch [2/10], Thermal Loss: 0.2107
Epoch [3/10], Thermal Loss: 0.1985
Epoch [4/10], Thermal Loss: 0.1669
Epoch [5/10], Thermal Loss: 0.1803
Epoch [6/10], Thermal Loss: 0.1580
Epoch [7/10], Thermal Loss: 0.1472
Epoch [8/10], Thermal Loss: 0.1381
Epoch [9/10], Thermal Loss: 0.1270
Epoch [10/10], Thermal Loss: 0.1249
Image 1 - Boxes: [[2.96203613e-01 1.36123955e-01 5.61125755e-01 3.00637633e-01]
 [4.70546067e-01 2.30922759e-01 7.26506531e-01 3.94213796e-01]
 [0.00000000e+00 1.56415179e-02 1.35711133e-01 1.89569771e-01]
 [0.00000000e+00 1.07269734e-04 9.66504887e-02 8.63714963e-02]
 [1.81872070e-01 3.24680328e-01 4.19571936e-01 5.19044816e-01]
 [6.60692990e-01 1.50251776e-01 9.17660058e-01 3.67387027e-01]
 [0.00000000e+00 3.26221436e-01 6.01409450e-02 4.80619520e-01]
 [6.02854490e-01 2.74631739e-01 8.80982995e-01 4.58271921e-01]
 [5.33870220e-01 4.34935272e-01 8.12467933e-01 6.26760542e-01]
 [2.32234582e-01 5.58165789e-01 5.30056715e-01 7.81255841e-01]
 [

In [7]:
# Cell 6: Evaluation and Testing
def evaluate_model(model, dataloader, device):
    model.eval()
    
    all_predictions = []
    all_targets = []
    
    with torch.no_grad():
        for images, targets in dataloader:
            images = list(image.to(device) for image in images)
            outputs = model(images)
            
            for output in outputs:
                boxes = output['boxes'].cpu().numpy()
                labels = output['labels'].cpu().numpy()
                scores = output['scores'].cpu().numpy()
                
                indices = torchvision.ops.nms(torch.tensor(boxes), torch.tensor(scores), iou_threshold=0.5)
                
                filtered_boxes = boxes[indices]
                filtered_labels = labels[indices]
                filtered_scores = scores[indices]
                
                all_predictions.append((filtered_boxes, filtered_labels, filtered_scores))
            
            for target in targets:
                boxes = target['boxes'].cpu().numpy()
                labels = target['labels'].cpu().numpy()
                
                all_targets.append((boxes, labels))
    
    return all_predictions, all_targets

test_split = "test"
test_thermal_dataset = CustomDataset(dataset_path, test_split, thermal_transform)
test_thermal_loader = DataLoader(test_thermal_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

test_predictions, test_targets = evaluate_model(thermal_model, test_thermal_loader, device)
print(f"Number of test predictions: {len(test_predictions)}")
print(f"Number of test targets: {len(test_targets)}")
print(f"Test prediction boxes shape: {test_predictions[0][0].shape}")
print(f"Test prediction labels shape: {test_predictions[0][1].shape}")
print(f"Test prediction scores shape: {test_predictions[0][2].shape}")

Number of test predictions: 96
Number of test targets: 96
Test prediction boxes shape: (22, 4)
Test prediction labels shape: (22,)
Test prediction scores shape: (22,)


In [8]:
# Cell 7: Load the trained model
thermal_model.load_state_dict(torch.load("trained_model.pth"))
thermal_model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [9]:
# Cell 8: Prepare the test dataset
test_split = "test"
test_thermal_dataset = CustomDataset(dataset_path, test_split, thermal_transform)
test_thermal_loader = DataLoader(test_thermal_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [10]:
# Cell 9: Evaluate the model on the test dataset
test_predictions, test_targets = evaluate_model(thermal_model, test_thermal_loader, device)

unique_labels = np.unique(labels)
print("Unique labels:", unique_labels)
print(f"Boxes: {boxes}")
print(f"Labels: {labels}")
print(f"Scores: {scores}")
print(f"Number of images without annotations: {thermal_dataset.no_annotation_count}")

Unique labels: [1]
Boxes: [[6.4892268e-01 5.2574277e-04 2.1098313e+00 3.3333457e-01]
 [5.3284293e-01 3.0112885e-02 6.0988063e-01 1.0313583e-01]
 [5.0038636e-01 7.0981979e-02 5.5989456e-01 1.2911846e-01]
 [6.3298738e-01 2.5217253e-01 7.4741614e-01 3.5326123e-01]
 [7.7853554e-01 1.5418486e-01 8.5660797e-01 2.0973821e-01]
 [7.2105879e-01 2.5380462e-01 8.2891446e-01 3.5574496e-01]
 [8.5551065e-01 1.9505620e-05 9.2419022e-01 3.6237940e-02]
 [5.8791912e-01 3.6273229e-01 6.7032361e-01 4.1463739e-01]
 [9.2467409e-01 2.1879494e-01 1.0311036e+00 2.9868078e-01]
 [7.8938574e-01 3.7241775e-01 8.9305443e-01 4.6381968e-01]
 [0.0000000e+00 0.0000000e+00 1.5579239e+00 1.1858211e+00]
 [0.0000000e+00 0.0000000e+00 3.7617073e+00 1.3579860e-01]
 [1.0816288e+00 1.2681502e-01 1.2004936e+00 2.2106850e-01]
 [7.6999503e-01 4.8613364e-01 8.6257261e-01 5.6518155e-01]
 [7.7848643e-01 5.7330608e-01 8.8287407e-01 6.5924048e-01]
 [3.2343864e-03 0.0000000e+00 7.1953254e+00 4.4128370e+00]
 [1.4322284e+00 2.8677067e-01 

In [11]:
# Cell 10: Visualize the object detection results
def visualize_detections(image, boxes, labels, scores, class_labels, confidence_threshold=0.3):
    image_with_detections = image.copy()
    
    height, width, _ = image.shape
    
    if not isinstance(boxes, (list, np.ndarray)):
        boxes = [boxes]
    
    if not isinstance(labels, (list, np.ndarray)):
        labels = [labels]
    
    if not isinstance(scores, (list, np.ndarray)):
        scores = [scores]
    
    for box, label, score in zip(boxes, labels, scores):
        if score >= confidence_threshold:
            if isinstance(box, (list, np.ndarray)):
                xmin, ymin, xmax, ymax = box
            else:
                xmin, ymin, xmax, ymax = box, box, box, box  
            xmin = int(xmin * width)
            ymin = int(ymin * height)
            xmax = int(xmax * width)
            ymax = int(ymax * height)
            
            class_name = class_labels[int(label)]  
            
            cv2.rectangle(image_with_detections, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
            cv2.putText(image_with_detections, f"{class_name}: {score:.2f}", (xmin, ymin - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    
    return image_with_detections

num_visualizations = 5
class_labels = {1: "vehicle"}

for i in range(num_visualizations):
    image_path = test_thermal_dataset.images[i]
    image = cv2.imread(image_path)  
    
    boxes, labels, scores = test_predictions[i]
    
    print(f"Boxes: {boxes}")
    print(f"Labels: {labels}")
    print(f"Scores: {scores}")
    
    image_with_detections = visualize_detections(image, boxes, labels, scores, class_labels)
    
    cv2.imshow(f"Thermal Object Detection - Image {i+1}", image_with_detections)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

Boxes: [[3.9396513e-01 2.0144245e-01 5.1742059e-01 2.8571028e-01]
 [1.5011814e-01 1.7777261e-01 2.6834249e-01 2.6703545e-01]
 [3.4308720e-01 2.9332030e-01 4.6617764e-01 3.8002932e-01]
 [3.0338764e-03 7.8418851e-04 4.5578651e+00 4.8640338e-01]
 [0.0000000e+00 5.0993636e-02 4.4842772e-02 1.2939121e-01]
 [4.7636032e-04 3.5072565e-03 3.9685690e+00 2.9636202e+00]
 [0.0000000e+00 2.9401734e-01 2.9331569e-02 3.8412139e-01]
 [3.9556503e-01 3.7432188e-01 5.3714931e-01 4.6197116e-01]
 [1.0430813e-07 3.4196460e-01 7.3490515e-02 4.1916579e-01]
 [2.7000731e-01 4.4844887e-01 4.0845710e-01 5.3722715e-01]
 [9.3751949e-01 1.2277222e-01 9.8906451e-01 1.3974023e-01]
 [1.6775131e-02 3.9544702e-04 6.4189181e+00 2.1081114e-01]
 [4.9862805e-01 5.6400287e-01 6.2128168e-01 6.4839196e-01]
 [2.4764426e-01 6.0874677e-01 3.7972021e-01 6.9846725e-01]
 [2.4241447e-02 2.5455952e-03 6.5948119e+00 2.0660965e+00]
 [7.8524083e-01 4.6948206e-01 9.0449876e-01 5.4996550e-01]
 [7.7813625e-01 5.3095829e-01 8.9589632e-01 6.137

In [16]:
import cv2
import torch
import torchvision.transforms as transforms
from PIL import Image
import os
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

input_video = "raw-traffic-footage/bridge-1.mp4"

frames_dir = "extracted_frames"
os.makedirs(frames_dir, exist_ok=True)

video = cv2.VideoCapture(input_video)

frame_rate = video.get(cv2.CAP_PROP_FPS)
frame_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

output_video = "output_video.mp4"
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
output = cv2.VideoWriter(output_video, fourcc, frame_rate, (frame_width, frame_height))

frame_count = 0

while True:
    ret, frame = video.read()
    
    if not ret:
        break
    
    frame_path = os.path.join(frames_dir, f"frame_{frame_count:05d}.jpg")
    cv2.imwrite(frame_path, frame)
    
    frame_count += 1

video.release()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = fasterrcnn_resnet50_fpn(weights=None)

in_features = model.roi_heads.box_predictor.cls_score.in_features

num_classes = 2  
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

model.load_state_dict(torch.load("trained_model.pth"))
model.to(device)  
model.eval()

for frame_file in sorted(os.listdir(frames_dir)):
    frame_path = os.path.join(frames_dir, frame_file)
    
    image = Image.open(frame_path).convert("RGB")
    
    transform = transforms.Compose([
        transforms.Resize((480, 480)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    image = transform(image)
    
    image = image.unsqueeze(0).to(device)  
    
    with torch.no_grad():
        outputs = model(image)
    
    boxes = outputs[0]['boxes'].cpu().numpy()
    labels = outputs[0]['labels'].cpu().numpy()
    scores = outputs[0]['scores'].cpu().numpy()
    
    print(f"Frame: {frame_file}")
    print(f"Boxes: {boxes}")
    print(f"Labels: {labels}")
    print(f"Scores: {scores}")
    
    frame = cv2.imread(frame_path)
    height, width, _ = frame.shape
    for box, label, score in zip(boxes, labels, scores):
        if score > 0.05:  
            x1, y1, x2, y2 = box
            x1 = int(x1 * width)
            y1 = int(y1 * height)
            x2 = int(x2 * width)
            y2 = int(y2 * height)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f"Label: {label}, Confidence: {score:.2f}", (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    
    output.write(frame)

output.release()

import shutil
shutil.rmtree(frames_dir)

print("Video processing completed.")

Frame: frame_00000.jpg
Boxes: [[0.06668287 0.         3.2534604  1.5408939 ]]
Labels: [1]
Scores: [0.14949496]
Frame: frame_00001.jpg
Boxes: [[0.06645012 0.         3.255088   1.548187  ]]
Labels: [1]
Scores: [0.15117955]
Frame: frame_00002.jpg
Boxes: [[0.06646114 0.         3.254004   1.5469624 ]]
Labels: [1]
Scores: [0.1509601]
Frame: frame_00003.jpg
Boxes: [[0.06702247 0.         3.2564244  1.5375677 ]]
Labels: [1]
Scores: [0.14987797]
Frame: frame_00004.jpg
Boxes: [[0.06756607 0.         3.2662766  1.5594455 ]]
Labels: [1]
Scores: [0.14846393]
Frame: frame_00005.jpg
Boxes: [[0.06720357 0.         3.2757444  1.569858  ]]
Labels: [1]
Scores: [0.15207578]
Frame: frame_00006.jpg
Boxes: [[0.06675897 0.         3.2702334  1.5537395 ]]
Labels: [1]
Scores: [0.15201847]
Frame: frame_00007.jpg
Boxes: [[0.06675124 0.         3.27028    1.5537661 ]]
Labels: [1]
Scores: [0.15210178]
Frame: frame_00008.jpg
Boxes: [[0.06603227 0.         3.2949104  1.6350719 ]]
Labels: [1]
Scores: [0.15724693]
Fr