In [28]:
# Cell 1: Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import xml.etree.ElementTree as ET
import torch
import torchvision
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from PIL import Image
import os
import torch.nn as nn

cuda_available = torch.cuda.is_available()

print(f"CUDA Available: {cuda_available}")

if cuda_available:
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available. PyTorch will use the CPU.")

CUDA Available: True
GPU 0: NVIDIA GeForce RTX 3080


In [29]:
# Cell 2: Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, root_dir, split="train", transform=None):
        self.root_dir = root_dir
        self.split = split
        self.transform = transform
        self.images = []
        self.annotations = []

        images_dir = os.path.join(root_dir, "images")
        annotations_dir = os.path.join(root_dir, "annotations")

        image_files = [f for f in os.listdir(images_dir) if f.endswith(".jpg") or f.endswith(".png")]

        image_files.sort()

        num_images = len(image_files)
        if split == "train":
            image_files = image_files[:int(0.7 * num_images)]
        elif split == "val":
            image_files = image_files[int(0.7 * num_images):int(0.9 * num_images)]
        elif split == "test":
            image_files = image_files[int(0.9 * num_images):]
        else:
            raise ValueError(f"Invalid split: {split}")

        for filename in image_files:
            image_path = os.path.join(images_dir, filename)
            self.images.append(image_path)

            annotation_path = os.path.join(annotations_dir, os.path.splitext(filename)[0] + ".xml")
            if os.path.exists(annotation_path):
                tree = ET.parse(annotation_path)
                root = tree.getroot()
                annotation = []

                for obj in root.findall("object"):
                    name = obj.find("name").text
                    bbox = obj.find("bndbox")
                    xmin = int(bbox.find("xmin").text)
                    ymin = int(bbox.find("ymin").text)
                    xmax = int(bbox.find("xmax").text)
                    ymax = int(bbox.find("ymax").text)
                    annotation.append((xmin, ymin, xmax, ymax))

                self.annotations.append(annotation)
            else:
                self.annotations.append(None)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        image_path = self.images[index]
        image = Image.open(image_path).convert("L")
        annotation = self.annotations[index]

        if self.transform:
            image = self.transform(image)

        if annotation is not None:
            boxes = torch.as_tensor(annotation, dtype=torch.float32)
            labels = torch.ones((len(annotation),), dtype=torch.int64)
        else:
            boxes = torch.empty((0, 4), dtype=torch.float32)
            labels = torch.empty((0,), dtype=torch.int64)

        return image, {"boxes": boxes, "labels": labels}

Number of images in the dataset: 297
Image shape: torch.Size([1, 800, 800])
Target boxes: tensor([[367., 137., 407., 153.],
        [238., 125., 279., 145.],
        [ 95., 131., 135., 144.],
        [578., 136., 607., 149.]])
Target labels: tensor([1, 1, 1, 1])


In [30]:
 # Test the dataset
dataset_path = "dataset"  
split = "train"  
thermal_transform = transforms.Compose([
    transforms.Resize((800, 800)),
    transforms.RandomHorizontalFlip(0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

thermal_dataset = CustomDataset(dataset_path, split, thermal_transform)
print(f"Number of images in the dataset: {len(thermal_dataset)}")
image, target = thermal_dataset[0]
print(f"Image shape: {image.shape}")
print(f"Target boxes: {target['boxes']}")
print(f"Target labels: {target['labels']}")

Number of images in the dataset: 297
Image shape: torch.Size([1, 800, 800])
Target boxes: tensor([[367., 137., 407., 153.],
        [238., 125., 279., 145.],
        [ 95., 131., 135., 144.],
        [578., 136., 607., 149.]])
Target labels: tensor([1, 1, 1, 1])


In [31]:
# Cell 3: Dataset Preprocessing
def preprocess_dataset(dataset):
    preprocessed_images = []
    preprocessed_annotations = []
    
    for image, target in dataset:
        if isinstance(image, torch.Tensor):
            image = transforms.ToPILImage()(image)
        else:
            image = Image.fromarray(image)
        
        image = thermal_transform(image)
        
        boxes = target['boxes']
        labels = target['labels']
        
        _, height, width = image.shape
        boxes[:, [0, 2]] /= width
        boxes[:, [1, 3]] /= height
        
        target = {'boxes': boxes, 'labels': labels}
        
        preprocessed_images.append(image)
        preprocessed_annotations.append(target)
    
    return preprocessed_images, preprocessed_annotations

# Test the preprocessing function
dataset_path = "dataset"
split = "train"
thermal_dataset = CustomDataset(dataset_path, split=split, transform=thermal_transform)
preprocessed_images, preprocessed_annotations = preprocess_dataset(thermal_dataset)
print(f"Number of preprocessed images: {len(preprocessed_images)}")
print(f"Number of preprocessed annotations: {len(preprocessed_annotations)}")
print(f"Preprocessed image shape: {preprocessed_images[0].shape}")
print(f"Preprocessed annotation boxes shape: {preprocessed_annotations[0]['boxes'].shape}")
print(f"Preprocessed annotation labels shape: {preprocessed_annotations[0]['labels'].shape}")

Number of preprocessed images: 297
Number of preprocessed annotations: 297
Preprocessed image shape: torch.Size([1, 800, 800])
Preprocessed annotation boxes shape: torch.Size([4, 4])
Preprocessed annotation labels shape: torch.Size([4])


In [32]:
# Cell 4: Dataset and DataLoader Creation
dataset_path = "dataset"
split = "train"

thermal_transform = transforms.Compose([
    transforms.Resize((800, 800)),
    transforms.RandomHorizontalFlip(0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

thermal_dataset = CustomDataset(dataset_path, split, thermal_transform)

class_labels = set()
for _, annotation in thermal_dataset:
    for obj in annotation:
        name = obj[0]
        class_labels.add("vehicle")  

class_to_idx = {"vehicle": 0}  
print("Class labels:", class_to_idx)

preprocessed_thermal_images, preprocessed_thermal_annotations = preprocess_dataset(thermal_dataset)

def collate_fn(batch):
    images = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    
    images = torch.stack(images, dim=0)
    
    return images, targets

train_thermal_dataset = list(zip(preprocessed_thermal_images, preprocessed_thermal_annotations))
train_thermal_loader = DataLoader(train_thermal_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

for images, targets in train_thermal_loader:
    print(f"Batch images shape: {images.shape}")
    print(f"Batch targets boxes shape: {targets[0]['boxes'].shape}")
    print(f"Batch targets labels shape: {targets[0]['labels'].shape}")
    break

Class labels: {'vehicle': 0}
Batch images shape: torch.Size([4, 1, 800, 800])
Batch targets boxes shape: torch.Size([5, 4])
Batch targets labels shape: torch.Size([5])


In [33]:
# Cell 5: Model Definition and Training
num_classes = len(class_to_idx) + 1

thermal_model = fasterrcnn_resnet50_fpn(weights="DEFAULT")
in_features = thermal_model.roi_heads.box_predictor.cls_score.in_features
thermal_model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
thermal_model.to(device)

thermal_optimizer = torch.optim.SGD(thermal_model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)

num_epochs = 10

for epoch in range(num_epochs):
    thermal_model.train()
    
    thermal_epoch_loss = 0.0
    
    for thermal_images, thermal_targets in train_thermal_loader:
        thermal_images = list(image.to(device) for image in thermal_images)
        thermal_targets = [{k: v.to(device) for k, v in t.items()} for t in thermal_targets]
        
        thermal_loss_dict = thermal_model(thermal_images, thermal_targets)
        thermal_losses = sum(loss for loss in thermal_loss_dict.values())
        
        thermal_optimizer.zero_grad()
        thermal_losses.backward()
        thermal_optimizer.step()
        
        thermal_epoch_loss += thermal_losses.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Thermal Loss: {thermal_epoch_loss/len(train_thermal_loader):.4f}")

torch.save(thermal_model.state_dict(), "thermal_trained_model.pth")

thermal_model.eval()
with torch.no_grad():
    for images, targets in train_thermal_loader:
        images = list(image.to(device) for image in images)
        outputs = thermal_model(images)
        
        for i in range(len(images)):
            boxes = outputs[i]['boxes'].cpu().numpy()
            labels = outputs[i]['labels'].cpu().numpy()
            scores = outputs[i]['scores'].cpu().numpy()
            
            print(f"Image {i+1} - Boxes: {boxes}, Labels: {labels}, Scores: {scores}")
        
        break

Epoch [1/10], Thermal Loss: 0.4995
Epoch [2/10], Thermal Loss: 0.2011
Epoch [3/10], Thermal Loss: 0.2203
Epoch [4/10], Thermal Loss: 0.1574
Epoch [5/10], Thermal Loss: 0.1466
Epoch [6/10], Thermal Loss: 0.1545
Epoch [7/10], Thermal Loss: 0.1414
Epoch [8/10], Thermal Loss: 0.1244
Epoch [9/10], Thermal Loss: 0.1127
Epoch [10/10], Thermal Loss: 0.1178
Image 1 - Boxes: [[2.23930493e-01 1.95141882e-04 2.90383101e-01 9.22815949e-02]
 [2.80799270e-01 3.05030495e-04 3.56732011e-01 1.17054895e-01]
 [0.00000000e+00 6.94990158e-04 4.52226067e+00 2.56468511e+00]
 [2.46234685e-01 3.11119258e-02 3.22726220e-01 1.48866028e-01]
 [3.40033919e-01 2.61478126e-04 4.13563579e-01 8.37208480e-02]
 [4.06007826e-01 2.06591561e-04 4.86791015e-01 5.63654751e-02]
 [0.00000000e+00 2.32333586e-01 2.32761223e-02 3.60121608e-01]
 [4.94812243e-02 2.49403372e-01 1.18702486e-01 3.55359197e-01]
 [2.34523833e-01 3.13743949e-01 3.10134649e-01 4.40954447e-01]
 [2.78325021e-01 3.69665205e-01 3.50681007e-01 5.07565975e-01]
 [

In [34]:
# Cell 6: Evaluation and Testing
def evaluate_model(model, dataloader, device):
    model.eval()
    
    all_predictions = []
    all_targets = []
    
    with torch.no_grad():
        for images, targets in dataloader:
            images = list(image.to(device) for image in images)
            outputs = model(images)
            
            for output in outputs:
                boxes = output['boxes'].cpu().numpy()
                labels = output['labels'].cpu().numpy()
                scores = output['scores'].cpu().numpy()
                
                indices = torchvision.ops.nms(torch.tensor(boxes), torch.tensor(scores), iou_threshold=0.5)
                
                filtered_boxes = boxes[indices]
                filtered_labels = labels[indices]
                filtered_scores = scores[indices]
                
                all_predictions.append((filtered_boxes, filtered_labels, filtered_scores))
            
            for target in targets:
                boxes = target['boxes'].cpu().numpy()
                labels = target['labels'].cpu().numpy()
                
                all_targets.append((boxes, labels))
    
    return all_predictions, all_targets

test_split = "test"
test_thermal_dataset = CustomDataset(dataset_path, test_split, thermal_transform)
test_thermal_loader = DataLoader(test_thermal_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

test_predictions, test_targets = evaluate_model(thermal_model, test_thermal_loader, device)
print(f"Number of test predictions: {len(test_predictions)}")
print(f"Number of test targets: {len(test_targets)}")
print(f"Test prediction boxes shape: {test_predictions[0][0].shape}")
print(f"Test prediction labels shape: {test_predictions[0][1].shape}")
print(f"Test prediction scores shape: {test_predictions[0][2].shape}")

Number of test predictions: 43
Number of test targets: 43
Test prediction boxes shape: (12, 4)
Test prediction labels shape: (12,)
Test prediction scores shape: (12,)


In [35]:
# Cell 7: Load the trained model
thermal_model.load_state_dict(torch.load("thermal_trained_model.pth"))
thermal_model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [36]:
# Cell 8: Prepare the test dataset
test_split = "test"
test_thermal_dataset = CustomDataset(dataset_path, test_split, thermal_transform)
test_thermal_loader = DataLoader(test_thermal_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [37]:
# Cell 9: Evaluate the model on the test dataset
test_predictions, test_targets = evaluate_model(thermal_model, test_thermal_loader, device)

unique_labels = np.unique(labels)
print("Unique labels:", unique_labels)
print(f"Boxes: {boxes}")
print(f"Labels: {labels}")
print(f"Scores: {scores}")

Unique labels: [1]
Boxes: [[1.63316727e-03 0.00000000e+00 4.39800119e+00 3.42175007e+00]
 [3.24170589e-01 3.20188701e-05 3.91984761e-01 4.45586070e-02]
 [2.37672061e-01 4.56824973e-02 2.97096640e-01 1.05936937e-01]
 [6.35591805e-01 1.23629346e-04 6.97340786e-01 3.37666944e-02]
 [3.26182783e-01 1.83787033e-01 3.73144388e-01 2.46052459e-01]
 [4.52269018e-01 2.24483341e-01 5.14335990e-01 2.82232672e-01]
 [6.79906487e-01 9.25416276e-02 7.38748312e-01 1.47923037e-01]
 [4.10041988e-01 3.09680820e-01 4.70882773e-01 3.71478260e-01]
 [2.07834587e-01 4.22493786e-01 2.71037698e-01 4.87482339e-01]
 [5.92565358e-01 2.78779745e-01 6.59012973e-01 3.49829733e-01]
 [5.23328960e-01 3.57561588e-01 5.80944002e-01 4.26767170e-01]
 [7.17453897e-01 3.14179242e-01 7.78865039e-01 3.79079640e-01]
 [8.15069318e-01 2.81825602e-01 8.80074024e-01 3.48421752e-01]
 [1.12091504e-01 6.91857696e-01 1.67222321e-01 7.50324607e-01]
 [8.55256021e-01 3.40426743e-01 9.13350642e-01 4.10310686e-01]
 [2.74384052e-01 7.39716768e-

In [38]:
# Cell 10: Visualize the object detection results
def visualize_detections(image, boxes, labels, scores, class_labels, confidence_threshold=0.3):
    image_with_detections = image.copy()
    
    height, width, _ = image.shape
    
    if not isinstance(boxes, (list, np.ndarray)):
        boxes = [boxes]
    
    if not isinstance(labels, (list, np.ndarray)):
        labels = [labels]
    
    if not isinstance(scores, (list, np.ndarray)):
        scores = [scores]
    
    for box, label, score in zip(boxes, labels, scores):
        if score >= confidence_threshold:
            if isinstance(box, (list, np.ndarray)):
                xmin, ymin, xmax, ymax = box
            else:
                xmin, ymin, xmax, ymax = box, box, box, box  
            xmin = int(xmin * width)
            ymin = int(ymin * height)
            xmax = int(xmax * width)
            ymax = int(ymax * height)
            
            class_name = class_labels[int(label)]  
            
            cv2.rectangle(image_with_detections, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
            cv2.putText(image_with_detections, f"{class_name}: {score:.2f}", (xmin, ymin - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    
    return image_with_detections

num_visualizations = 5
class_labels = {1: "vehicle"}

for i in range(num_visualizations):
    image_path = test_thermal_dataset.images[i]
    image = cv2.imread(image_path)  
    
    boxes, labels, scores = test_predictions[i]
    
    print(f"Boxes: {boxes}")
    print(f"Labels: {labels}")
    print(f"Scores: {scores}")
    
    image_with_detections = visualize_detections(image, boxes, labels, scores, class_labels)
    
    cv2.imshow(f"Thermal Object Detection - Image {i+1}", image_with_detections)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

Boxes: [[0.34770954 0.04174948 0.40519255 0.12731211]
 [0.42938927 0.01595853 0.4804578  0.09012715]
 [0.15285218 0.24474478 0.2219496  0.33774346]
 [0.7196178  0.07688493 0.7793055  0.15435433]
 [0.4142287  0.37830088 0.48401204 0.45835814]
 [0.22703454 0.5933771  0.2888647  0.67999935]
 [0.79546416 0.25732532 0.8478794  0.34908983]
 [0.3645666  0.5934224  0.41686067 0.68178606]
 [0.8320304  0.29618055 0.9021169  0.38711792]
 [1.1690063  0.20589468 1.2163565  0.29787752]
 [0.8595847  0.5023668  0.91113377 0.5867882 ]
 [0.9134542  0.5000308  0.96142614 0.5903397 ]
 [1.1557858  0.36425036 1.216984   0.45716965]]
Labels: [1 1 1 1 1 1 1 1 1 1 1 1 1]
Scores: [0.76316464 0.7598545  0.712865   0.62550384 0.5295133  0.39359325
 0.3854628  0.32807457 0.3132534  0.14409949 0.12858258 0.10668069
 0.07779296]
Boxes: [[1.8165039e-01 2.4084933e-05 2.1950047e-01 1.9544084e-02]
 [3.3406541e-06 5.8238138e-02 1.2824725e-02 1.0889727e-01]
 [2.1256378e-01 4.8016261e-02 2.5328329e-01 9.2824623e-02]
 [3.82

In [13]:
# Cell 11: Image Classification Dataset
class ImageClassificationDataset(Dataset):
    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, index):
        image_path = self.image_paths[index]
        image = Image.open(image_path).convert("RGB")
        
        if self.transform:
            image = self.transform(image)
        
        if "vehicle" in image_path:
            label = 0
        else:
            label = -1  
        
        return image, label


vehicle_image_paths = [os.path.join("dataset", "train", "images", img) for img in os.listdir(os.path.join("dataset", "train", "images"))]
classification_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
dataset = ImageClassificationDataset(vehicle_image_paths, transform=classification_transform)
image, label = dataset[0]
print("Image shape:", image.shape)
print("Label:", label)

Image shape: torch.Size([3, 224, 224])
Label: -1


In [14]:
# Cell 12: Image Classification Dataset Creation
classification_dataset = ImageClassificationDataset(vehicle_image_paths, transform=classification_transform)
classification_dataloader = DataLoader(classification_dataset, batch_size=16, shuffle=True)

for images, labels in classification_dataloader:
    print("Batch shape:", images.shape)
    print("Labels:", labels)
    break

Batch shape: torch.Size([16, 3, 224, 224])
Labels: tensor([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])


In [15]:
# Cell 13: Image Classification Model
class_names = ["vehicle"]
num_classes = len(class_names)

classification_model = models.resnet18(weights='IMAGENET1K_V1')
num_features = classification_model.fc.in_features
classification_model.fc = nn.Linear(num_features, num_classes)

classification_model.to(device)

# Capture the model architecture summary in a string
model_summary = str(classification_model)

print("Model architecture:")
print(model_summary)

Model architecture:
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): R

In [16]:
# Cell 14: Image Classification Training
classification_criterion = nn.CrossEntropyLoss()
classification_optimizer = torch.optim.Adam(classification_model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    classification_model.train()
    
    for images, labels in classification_dataloader:
        images = images.to(device)
        labels = labels.to(device)
        
        outputs = classification_model(images)
        loss = classification_criterion(outputs, labels)
        
        classification_optimizer.zero_grad()
        loss.backward()
        classification_optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Classification Loss: {loss.item():.4f}")

torch.save(classification_model.state_dict(), "classification_model.pth")

classification_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in classification_dataloader:
        images = images.to(device)
        labels = labels.to(device)
        
        outputs = classification_model(images)
        _, predicted = torch.max(outputs.data, 1)
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Classification Accuracy: {accuracy:.2f}%")

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
