In [2]:
from ultralytics import YOLO

In [3]:
import torch

print(torch.cuda.is_available())  # Should return True
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 4080


In [4]:
import torch

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")

Using device: cuda


In [None]:
# Enum class for model types: nano, medium, large
from enum import Enum

class ModelType(str, Enum):
    NANO = "nano"
    MEDIUM = "med"
    LARGE = "large"

model_type = ModelType.MEDIUM

model_name = f"./runs/obb/train_{model_type}/weights/best.pt"
model = YOLO(model_name)
model.to(device)

In [11]:
img = "./data/0/cropped_gm.png"

results = model.predict(img, device=device)


image 1/1 d:\Dokumenty\AAA_PW\Sem10\DataScienceWorkshop\data\0\cropped_gm.png: 608x640 16.1ms
Speed: 2.7ms preprocess, 16.1ms inference, 3.4ms postprocess per image at shape (1, 3, 608, 640)


In [14]:
results[0].obb

ultralytics.engine.results.OBB object with attributes:

cls: tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], device='cuda:0')
conf: tensor([0.6303, 0.6038, 0.5493, 0.5205, 0.4986, 0.4962, 0.4800, 0.4643, 0.4585, 0.4552, 0.4524, 0.4467, 0.4226, 0.3856, 0.3780, 0.3467, 0.3412, 0.3339, 0.3094, 0.3085, 0.3020, 0.2984, 0.2870, 0.2841, 0.2753, 0.2708, 0.2619, 0.2576, 0.2533, 0.2518], device='cuda:0')
data: tensor([[4.2416e+02, 3.0202e+02, 4.1166e+01, 1.9782e+01, 8.4108e-01, 6.3033e-01, 1.0000e+00],
        [4.0638e+02, 3.1797e+02, 4.2245e+01, 2.0488e+01, 8.5422e-01, 6.0382e-01, 1.0000e+00],
        [4.6245e+02, 5.2175e+01, 1.2928e+01, 8.2947e+00, 1.5448e+00, 5.4925e-01, 1.0000e+00],
        [6.2948e+02, 6.5695e+02, 4.2303e+01, 2.0364e+01, 2.8228e-01, 5.2054e-01, 1.0000e+00],
        [4.7589e+02, 5.2673e+01, 1.4521e+01, 9.6756e+00, 1.5554e+00, 4.9865e-01, 1.0000e+00],
        [4.3980e+02, 5.0393e+01, 1.4698e+01, 

In [15]:
import os
import cv2
import numpy as np
import torch
from enum import Enum
from ultralytics import YOLO


class ModelClass(Enum):
    NANO = "nano"
    MEDIUM = "med"
    LARGE = "large"


class Predictor:
    def __init__(self, img_data_folder, image_gm, image_osm, parking_mask, model_type):
        """
        Initialize the predictor with paths and model type.

        Args:
            img_data_folder (str): Folder with image data (e.g., "data/0/")
            image_gm (str): Name of Google Maps image (e.g., "cropped_gm.png")
            image_osm (str): Name of OpenStreetMap image (e.g., "cropped_osm.png")
            parking_mask (str): Name of mask image (e.g., "cropped_osm_mask.png")
            model_type (ModelClass): Type of model to use (NANO, MEDIUM, LARGE)
        """
        self.img_data_folder = img_data_folder
        self.image_gm_path = os.path.join(img_data_folder, image_gm)
        self.image_osm_path = os.path.join(img_data_folder, image_osm)
        self.parking_mask_path = os.path.join(img_data_folder, parking_mask)
        self.model_type = model_type

        # Determine device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load model
        model_name = f"./runs/obb/train_{model_type.value}/weights/best.pt"
        self.model = YOLO(model_name)
        self.model.to(self.device)

        self.results = None

    def load_images(self):
        """Load input images and mask."""
        self.img_gm = cv2.imread(self.image_gm_path)
        self.img_osm = cv2.imread(self.image_osm_path)
        self.mask = cv2.imread(self.parking_mask_path, cv2.IMREAD_GRAYSCALE)

    def is_in_mask(self, box_points):
        """
        Check if a bounding box is within the masked area.

        Args:
            box_points: Array of (x,y) points defining the bounding box
            threshold: Minimum percentage of box area that should be in masked area

        Returns:
            float: Ratio of box points in masked area
        """
        # Convert tensor to numpy array if needed
        if isinstance(box_points, torch.Tensor):
            box_points = box_points.cpu().numpy()

        # Create a polygon mask from the box points
        box_mask = np.zeros_like(self.mask)
        points = box_points.reshape((-1, 1, 2)).astype(np.int32)
        cv2.fillPoly(box_mask, [points], 255)

        # Count pixels in box that are also in mask
        intersection = cv2.bitwise_and(box_mask, self.mask)
        box_area = np.sum(box_mask > 0)
        intersection_area = np.sum(intersection > 0)

        # Return ratio of intersection area to box area
        return intersection_area / box_area if box_area > 0 else 0
    
    def predict(self):
        """
        Make predictions on the loaded images.

        Returns:
            list: List of results from the model prediction
        """
        self.load_images()

        # Make prediction on Google Maps image
        self.results = self.model.predict(self.img_gm, device=self.device)

        return self.results

    def visualize(self, mask_threshold=0.7, mask_low_confidence=0.1, visualization_type="gm", save_path=None):
        """
        Create visualization.

        Args:
            confidence_threshold: Threshold for high confidence (green boxes)
            low_confidence: Minimum threshold for low confidence (red boxes)

        Returns:
            numpy.ndarray: Visualization image with colored boxes
        """

        # Create a visualization image (copy of the original)
        assert visualization_type in ["gm", "osm"], "Invalid visualization type"
        if visualization_type == "gm":
            vis_img = self.img_gm.copy()
        else:
            vis_img = self.img_osm.copy()

        # Process each detected object
        for result in self.results:
            if hasattr(result, "obb") and result.obb is not None:
                for i, box in enumerate(result.obb.xyxyxyxy):
                    conf = float(result.obb.conf[i])

                    # Convert tensor to numpy if needed
                    if isinstance(box, torch.Tensor):
                        box = box.cpu().numpy()

                    # Check if box is in masked area and get ratio
                    mask_ratio = self.is_in_mask(box)

                    # Only process boxes with sufficient confidence and mask overlap
                    if mask_ratio >= mask_low_confidence:
                        # Choose color based on confidence
                        if mask_ratio >= mask_threshold:
                            color = (0, 255, 0)  # Green for high confidence
                        else:
                            color = (0, 0, 255)  # Red for low confidence

                        # Draw the oriented bounding box
                        points = box.reshape((-1, 1, 2)).astype(np.int32)
                        cv2.polylines(
                            vis_img, [points], isClosed=True, color=color, thickness=2
                        )

                        # Get the minimum x and y for text positioning
                        box_reshaped = box.reshape(4, 2)
                        min_x = int(np.min(box_reshaped[:, 0]))
                        min_y = int(np.min(box_reshaped[:, 1]))
                        text_pos = (min_x, min_y - 10)
                        
                        # Add confidence score text
                        cv2.putText(
                            vis_img,
                            f"{conf:.2f}",
                            text_pos,
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.5,
                            color,
                            2,
                        )
        
        # Save the visualization image if a path is provided
        if save_path:
            cv2.imwrite(os.path.join(self.img_data_folder, save_path), vis_img)
            print(f"Visualization saved to {save_path}")

        # Return the visualization image
        return vis_img


In [16]:
# Initialize the predictor
predictor = Predictor(
    img_data_folder="data/0/",
    image_gm="cropped_gm.png",
    image_osm="cropped_osm.png",
    parking_mask="cropped_osm_mask.png",
    model_type=ModelClass.LARGE,
)

results = predictor.predict()

# Generate visualization
predictor.visualize(
    mask_threshold=0.5,
    mask_low_confidence=0.1,
    visualization_type="gm",
    save_path="visualization_gm.png",
)

predictor.visualize(
    mask_threshold=0.5,
    mask_low_confidence=0.1,
    visualization_type="osm",
    save_path="visualization_osm.png",
)



0: 608x640 13.7ms
Speed: 3.4ms preprocess, 13.7ms inference, 5.0ms postprocess per image at shape (1, 3, 608, 640)
Visualization saved to visualization_gm.png
Visualization saved to visualization_osm.png


array([[[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       ...,

       [[201, 208, 217],
        [201, 208, 217],
        [201, 208, 217],
        ...,
        [241, 243, 245],
        [241, 243, 245],
        [241, 243, 245]],

       [[201, 208, 217],
        [201, 208, 217],
        [201, 208, 217],
        ...,
        [241, 243, 245],
        [241, 243, 245],
        [241, 243, 245]],

       [[201, 208, 217],
        [201, 208, 217],
        [201, 208, 217],
        ...,
        [241, 243, 245],
        [241, 243, 245],
        [241, 243, 245]]