In [14]:
from ultralytics import YOLO
import cv2

# Class labels (as they appear in model.names) that the evaluation cares about.
# Detections of any other class are ignored by the helper functions below.
# NOTE(review): stock COCO-trained checkpoints usually name the ball class
# 'sports ball' — confirm that model.names really contains 'ball', otherwise
# that object can never be matched.
allowed_classes = [
    'cell phone', 'remote', 'knife', 'book', 'spoon',
    'cup', 'scissors', 'fork', 'toothbrush', 'ball'
]

# Load the detection model weights.
# NOTE(review): 'yolo11x.pt' looks like the name of the stock pretrained
# checkpoint — confirm this file actually holds the fine-tuned weights.
model = YOLO('yolo11x.pt')

# One fixed colour per class for bounding boxes and labels.
# Tuples are in OpenCV's (B, G, R) channel order, not RGB.
box_colors = {
    'cell phone': (255, 0, 0),      # Blue
    'remote': (0, 255, 0),          # Green
    'knife': (0, 0, 255),           # Red
    'book': (255, 255, 0),          # Cyan
    'spoon': (255, 0, 255),         # Magenta
    'cup': (0, 255, 255),           # Yellow
    'scissors': (128, 0, 128),      # Purple
    'fork': (0, 128, 128),          # Olive (B=0, G=128, R=128)
    'toothbrush': (128, 128, 0),    # Teal (B=128, G=128, R=0)
    'ball': (0, 128, 0)             # Dark Green
}

# Ground truth for object presence in each scene.
# Keys are '<scene>_<view>' (scenes S1..S10, views front/left/right); values
# list the object identifiers ('O1'..'O10', see map_object_name) expected to
# be visible. Each scene adds one object to the previous scene's set.
ground_truth = {
    'S1_front': ['O1'],
    'S1_left': ['O1'],
    'S1_right': ['O1'],
    'S2_front': ['O1', 'O5'],
    'S2_left': ['O1', 'O5'],
    'S2_right': ['O1', 'O5'],
    'S3_front': ['O1', 'O2', 'O5'],
    'S3_left': ['O1', 'O2', 'O5'],
    'S3_right': ['O1', 'O2', 'O5'],
    'S4_front': ['O1', 'O2', 'O3', 'O5'],
    'S4_left': ['O1', 'O2', 'O3', 'O5'],
    'S4_right': ['O1', 'O2', 'O3', 'O5'],
    'S5_front': ['O1', 'O2', 'O3', 'O5', 'O6'],
    'S5_left': ['O1', 'O2', 'O3', 'O5', 'O6'],
    'S5_right': ['O1', 'O2', 'O3', 'O5', 'O6'],
    'S6_front': ['O1', 'O2', 'O3', 'O5', 'O6', 'O8'],
    'S6_left': ['O1', 'O2', 'O3', 'O5', 'O6', 'O8'],
    'S6_right': ['O1', 'O2', 'O3', 'O5', 'O6', 'O8'],
    'S7_front': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O8'],
    'S7_left': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O8'],
    'S7_right': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O8'],
    'S8_front': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8'],
    'S8_left': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8'],
    'S8_right': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8'],
    'S9_front': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O10'],
    'S9_left': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O10'],
    'S9_right': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O10'],
    'S10_front': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O9', 'O10'],
    'S10_left': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O9', 'O10'],
    'S10_right': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O9', 'O10'],
}

# Detected object identifiers per scene, filled in by calculate_results and
# read back by get_accuracy. Scenes with no entry count as "nothing detected".
resulting_accuracy = {}


In [15]:
def map_object_name(name):
    """
    Translate a class label into the object identifier used by the ground truth.

    Identifiers have the form 'O<n>', where <n> is the 1-based position of the
    label in the fixed ordering below.

    Args:
        name (str): Class label such as 'cell phone' or 'remote'.

    Returns:
        str: The matching identifier, e.g. 'O1' for 'cell phone'.

    Raises:
        KeyError: If `name` is not one of the ten known labels.
    """
    # Position in this tuple defines the identifier number (first entry -> 'O1').
    ordered_labels = (
        'cell phone', 'remote', 'knife', 'book', 'spoon',
        'cup', 'scissors', 'fork', 'toothbrush', 'ball',
    )
    identifier_by_label = {
        label: f'O{position}'
        for position, label in enumerate(ordered_labels, start=1)
    }

    return identifier_by_label[name]

In [16]:
def calculate_results(results, scene_name):
    """
    Record which allowed objects were detected in a scene and log the
    comparison against the ground truth.

    The scene's entry in the module-level `resulting_accuracy` dict is
    replaced on every call, so re-running a scene does not accumulate stale
    detections and a scene with no allowed detections is stored as an empty
    list (previously that case raised KeyError while writing the log).

    Args:
        results (list): Detection results from the YOLO model. Only
                        `results[0].boxes` is read; each box supplies its
                        class index through `box.cls.item()`.
        scene_name (str): The scene being evaluated (e.g., 'S1_front').
                          Must be a key of `ground_truth`.

    Raises:
        KeyError: If `scene_name` is not in `ground_truth`. The lookup is done
                  first, so nothing is recorded or written for unknown scenes.

    Writes:
        Appends the detected and expected object identifiers for the scene to
        '../testing.txt'.
    """
    expected_objects = ground_truth[scene_name]

    # Each class is reported at most once per scene, in first-seen order.
    seen_classes = set()
    detected_objects = []

    for box in results[0].boxes:
        name = model.names[int(box.cls.item())]

        if name in allowed_classes and name not in seen_classes:
            seen_classes.add(name)
            detected_objects.append(map_object_name(name))

    resulting_accuracy[scene_name] = detected_objects

    # Write results to testing.txt
    with open('../testing.txt', 'a') as f:
        f.write(f'Scene: {scene_name}\n')
        f.write('Detected Objects: {}\n'.format(
            ', '.join(sorted(set(detected_objects)))
        ))
        f.write('Expected Objects: {}\n\n'.format(
            ', '.join(sorted(set(expected_objects)))
        ))


In [17]:
def get_accuracy():
    """
    Prints aggregate detection metrics over every scene in `ground_truth`.

    For each scene the detected identifiers (from `resulting_accuracy`, empty
    if the scene has no entry) are compared with the expected identifiers:

        TP = expected and detected
        FP = detected but not expected
        FN = expected but not detected
        TN = len(allowed_classes) - TP - FP - FN

    The counts are summed over all scenes and turned into precision, recall,
    F1-score and accuracy. A ratio whose denominator is zero (for example
    precision when nothing at all was detected) is reported as 0 instead of
    raising ZeroDivisionError.

    Returns:
        None: The four percentages are printed to the console.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0 so the summary always prints.
        return numerator / denominator if denominator else 0.0

    total_TP = total_FP = total_TN = total_FN = 0

    for scene_name, actual_objects in ground_truth.items():
        expected = set(actual_objects)
        detected = set(resulting_accuracy.get(scene_name, []))  # Empty if nothing is detected

        TP = len(expected & detected)   # True Positives
        FP = len(detected - expected)   # False Positives
        FN = len(expected - detected)   # False Negatives
        TN = len(allowed_classes) - TP - FP - FN  # True Negatives

        total_TP += TP
        total_FP += FP
        total_TN += TN
        total_FN += FN

    precision = _ratio(total_TP, total_TP + total_FP)
    recall = _ratio(total_TP, total_TP + total_FN)
    f1_score = _ratio(2 * precision * recall, precision + recall)
    accuracy = _ratio(
        total_TP + total_TN,
        total_TP + total_FP + total_TN + total_FN
    )

    # Print overall results
    print("\nOverall Results:")
    print("Precision: {:.2f}%".format(precision * 100))
    print("Recall: {:.2f}%".format(recall * 100))
    print("F1-Score: {:.2f}%".format(f1_score * 100))
    print("Accuracy: {:.2f}%".format(accuracy * 100))


In [18]:
def draw_bounding_box(results, image_path):
    """
    Re-reads an image from disk and annotates it with one box per allowed class.

    Only the first box encountered for each allowed class is drawn; later
    boxes of the same class and boxes of other classes are skipped.

    Args:
        results (list): Detection results from the YOLO model. Only
                        `results[0].boxes` is read (class index via `box.cls`,
                        corner coordinates via `box.xyxy[0]`).
        image_path (str): Path of the image to load and draw on.

    Returns:
        numpy.ndarray: The loaded image with rectangles and class labels drawn.

    Raises:
        ValueError: If the image cannot be read from `image_path`.
    """
    canvas = cv2.imread(image_path)
    if canvas is None:
        raise ValueError(f"Failed to read the image from path: {image_path}")

    # Classes that already have a box on the canvas
    annotated = set()

    for detection in results[0].boxes:
        # Corner coordinates as whole pixels (left, top, right, bottom)
        left, top, right, bottom = [int(value) for value in detection.xyxy[0]]
        class_name = model.names[int(detection.cls.item())]

        if class_name not in allowed_classes or class_name in annotated:
            continue

        colour = box_colors[class_name]

        cv2.rectangle(canvas, (left, top), (right, bottom), colour, 12)
        # Label sits 15 px above the box's top-left corner
        cv2.putText(
            canvas, class_name, (left, top - 15),
            cv2.FONT_HERSHEY_SIMPLEX, 2.5, colour, 3
        )

        annotated.add(class_name)

    return canvas


In [19]:
import os

# Start from an empty log so repeated runs of this cell do not append to old results
with open('../testing.txt', 'w'):
    pass

# cv2.imwrite does not create missing folders, so make sure the target exists
output_dir = "../Detected Objects"
os.makedirs(output_dir, exist_ok=True)

# Process every labelled scene. Iterating over ground_truth (S1_front, S1_left,
# S1_right, S2_front, ...) keeps the set of scenes tied to the labels instead of
# to the number of allowed classes.
for scene_name in ground_truth:
    image_path = f"../Scenes/{scene_name}.jpg"

    # Perform object detection using the YOLO model
    results = model(image_path)

    # Calculate and log detection results to testing.txt
    calculate_results(results, scene_name)

    image = draw_bounding_box(results, image_path)

    # Save the image with bounding boxes; imwrite reports failure by returning False
    output_path = f"{output_dir}/{scene_name}_bb.jpg"
    if not cv2.imwrite(output_path, image):
        raise OSError(f"Could not write annotated image to: {output_path}")



image 1/1 /Users/saifal-dinali/Desktop/Projects/Python Projects/OpenCV Projects/Image-Recognition-Project/Code/../Scenes/S1_front.jpg: 640x480 1 cell phone, 373.1ms
Speed: 10.3ms preprocess, 373.1ms inference, 13.9ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/saifal-dinali/Desktop/Projects/Python Projects/OpenCV Projects/Image-Recognition-Project/Code/../Scenes/S1_left.jpg: 640x480 1 cell phone, 385.0ms
Speed: 2.4ms preprocess, 385.0ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/saifal-dinali/Desktop/Projects/Python Projects/OpenCV Projects/Image-Recognition-Project/Code/../Scenes/S1_right.jpg: 640x480 1 cell phone, 316.0ms
Speed: 1.6ms preprocess, 316.0ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/saifal-dinali/Desktop/Projects/Python Projects/OpenCV Projects/Image-Recognition-Project/Code/../Scenes/S2_front.jpg: 480x640 1 fork, 1 spoon, 1 cell phone, 290.5ms
Speed: 1.5ms preprocess,

## Overall Object Detection and Identification

In [20]:
# Print precision, recall, F1-score and accuracy aggregated over every scene
# in ground_truth, using the detections recorded in resulting_accuracy.
get_accuracy()


Overall Results:
Precision: 99.30%
Recall: 86.06%
F1-Score: 92.21%
Accuracy: 92.00%


## Panoramic Image Stitching and Object Detection

In [26]:
import numpy as np
import cv2


class ImageStitching:
    """
    Stitches a pair of images into a panorama.

    The steps are exposed as separate methods: grayscale conversion, SIFT
    feature extraction, brute-force matching, RANSAC homography estimation and
    blending with a linear ramp across the seam. The only instance state is
    `smoothing_window_size`, the width (in pixels) of that ramp.
    """

    def __init__(self, q_image, t_image):
        """
        Initializes the ImageStitching object and sets the smoothing window size.

        The window is 10% of the narrower image's width, clamped to the range
        [100, 1000] pixels.

        Args:
            q_image (numpy.ndarray): The first input image.
            t_image (numpy.ndarray): The second input image.
        """
        super().__init__()
        min_width = min(q_image.shape[1], t_image.shape[1])
        smoothing_window_factor = 0.10          # Smoothing window percentage [0.00, 1.00]
        self.smoothing_window_size = max(
            100,
            min(smoothing_window_factor * min_width, 1000)
        )

    @staticmethod
    def convert_grayscale(image):
        """
        Converts an image to grayscale.

        NOTE(review): the conversion code is COLOR_RGB2GRAY. Images loaded with
        cv2.imread are BGR, in which case the red and blue weights are swapped
        relative to a true luminance conversion — confirm whether that is
        intended before changing it, since it affects the detected keypoints.

        Args:
            image (numpy.ndarray): Input 3-channel image.

        Returns:
            tuple: The original image and its grayscale version.
        """
        photo_gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        return image, photo_gray

    @staticmethod
    def sift_detector(image):
        """
        Applies the SIFT algorithm to extract keypoints and features.

        Args:
            image (numpy.ndarray): Input image.

        Returns:
            tuple: Keypoints and feature descriptors extracted from the image.
        """
        descriptor = cv2.SIFT_create()
        keypoints, features = descriptor.detectAndCompute(image, None)
        return keypoints, features

    def create_and_match_keypoints(self, features_train_image, features_query_image):
        """
        Matches SIFT features using brute-force matching with L2 norm.

        Cross-checking is enabled, so a pair is kept only when each descriptor
        is the other's best match.

        Args:
            features_train_image (numpy.ndarray): SIFT features of the train image.
            features_query_image (numpy.ndarray): SIFT features of the query image.

        Returns:
            list: Matches sorted by ascending distance (best first).
        """
        bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
        best_matches = bf.match(features_train_image, features_query_image)
        return sorted(best_matches, key=lambda match: match.distance)

    def compute_homography(self, t_image_keypoints, q_image_keypoints, matches, reprojThresh):
        """
        Computes the Homography matrix using matched keypoints and RANSAC.

        Args:
            t_image_keypoints (list): Keypoints from the train image.
            q_image_keypoints (list): Keypoints from the query image.
            matches (list): Matches whose `queryIdx` indexes the train
                            keypoints and whose `trainIdx` indexes the query
                            keypoints (the order used by
                            create_and_match_keypoints).
            reprojThresh (float): Threshold for RANSAC reprojection error.

        Returns:
            tuple or None: (matches, Homography matrix, status) if successful.
            None if there are fewer than 4 matches or if the estimator could
            not produce a matrix, so callers never receive a None homography
            inside the tuple.
        """
        if len(matches) < 4:
            print("Minimum match count not satisfied. Cannot compute homography.")
            return None

        t_coordinates = np.float32([keypoint.pt for keypoint in t_image_keypoints])
        q_coordinates = np.float32([keypoint.pt for keypoint in q_image_keypoints])

        t_points = np.float32([t_coordinates[m.queryIdx] for m in matches])
        q_points = np.float32([q_coordinates[m.trainIdx] for m in matches])

        H, status = cv2.findHomography(t_points, q_points, cv2.RANSAC, reprojThresh)
        if H is None:
            # The estimator can fail even with >= 4 matches (e.g. degenerate points).
            print("Homography estimation failed. Cannot compute homography.")
            return None
        return matches, H, status

    def create_mask(self, query_image, train_image, version):
        """
        Creates a mask for blending images using a smoothing window.

        The mask covers the full panorama canvas (query height, query width +
        train width). A linear ramp of width 2 * offset ends exactly at the
        query image's right edge; the left mask is 1 before the ramp and fades
        to 0, the right mask rises from 0 and is 1 after the ramp.

        Args:
            query_image (numpy.ndarray): Query image.
            train_image (numpy.ndarray): Train image.
            version (str): 'left_image' for the left mask; any other value
                           produces the right mask.

        Returns:
            numpy.ndarray: A 3-channel mask for blending.
        """
        height_panorama = query_image.shape[0]
        width_panorama = query_image.shape[1] + train_image.shape[1]

        offset = int(self.smoothing_window_size / 2)
        barrier = query_image.shape[1] - offset
        ramp_start, ramp_end = barrier - offset, barrier + offset
        mask = np.zeros((height_panorama, width_panorama))

        if version == "left_image":
            mask[:, ramp_start:ramp_end] = np.tile(
                np.linspace(1, 0, 2 * offset).T, (height_panorama, 1)
            )
            mask[:, :ramp_start] = 1
        else:
            mask[:, ramp_start:ramp_end] = np.tile(
                np.linspace(0, 1, 2 * offset).T, (height_panorama, 1)
            )
            mask[:, ramp_end:] = 1
        return cv2.merge([mask, mask, mask])

    def blending_smoothing(self, query_image, train_image, homography_matrix):
        """
        Blends query and train images into a panorama using a homography matrix.

        The query image is placed at the left of the canvas, the train image
        is warped onto the canvas with the homography, both are weighted by
        their masks and summed, and the result is cropped to the bounding box
        of pixels whose first channel is non-zero.

        Args:
            query_image (numpy.ndarray): Query image.
            train_image (numpy.ndarray): Train image.
            homography_matrix (numpy.ndarray): Homography matrix to map images.

        Returns:
            numpy.ndarray: The resulting panoramic image (floating point).

        Raises:
            ValueError: If the blended canvas has no non-zero pixel in its
                        first channel, i.e. there is nothing to crop to.
        """
        height_panorama = query_image.shape[0]
        width_panorama = query_image.shape[1] + train_image.shape[1]

        # Initialize the panorama and blend images
        left_panorama = np.zeros((height_panorama, width_panorama, 3))
        left_mask = self.create_mask(query_image, train_image, version="left_image")
        left_panorama[0 : query_image.shape[0], 0 : query_image.shape[1], :] = query_image
        left_panorama *= left_mask

        right_mask = self.create_mask(query_image, train_image, version="right_image")
        right_panorama = cv2.warpPerspective(
            train_image, homography_matrix, (width_panorama, height_panorama)
        ) * right_mask

        # Combine the two panoramas
        result = left_panorama + right_panorama

        # Crop out extra black space
        rows, cols = np.where(result[:, :, 0] != 0)
        if rows.size == 0:
            raise ValueError("Stitched panorama is empty; nothing to crop.")

        # Array reductions instead of the builtin min/max, which would iterate
        # over every index element in Python.
        min_row, max_row = rows.min(), rows.max() + 1
        min_col, max_col = cols.min(), cols.max() + 1

        return result[min_row:max_row, min_col:max_col, :]

In [27]:
import cv2
import numpy as np


def forward_pass(q_image, t_image):
    """
    Runs a forward pass using the ImageStitching class to stitch two images
    into a panorama: grayscale conversion, SIFT feature detection, keypoint
    matching, homography computation and blending.

    Args:
        q_image (numpy.ndarray): The query image (left image for stitching).
        t_image (numpy.ndarray): The train image (right image for stitching).

    Returns:
        tuple:
            - result_rgb (numpy.ndarray): The panorama as float32, with the
              channel order swapped by COLOR_BGR2RGB relative to the inputs.
            - mapped_feature_image_rgb (numpy.ndarray): A visualization of the
              top 100 keypoint matches, converted the same way.

        If stitching fails, returns:
            - str: "Error cannot stitch images". This happens when either
              image yields no SIFT descriptors, when fewer than 4 matches are
              found, or when no homography could be estimated.
    """
    failure_message = "Error cannot stitch images"

    # Initialize the ImageStitching object
    image_stitching = ImageStitching(q_image, t_image)

    # Convert both images to grayscale
    _, q_image_gray = image_stitching.convert_grayscale(q_image)  # Left image
    _, t_image_gray = image_stitching.convert_grayscale(t_image)  # Right image

    # Detect keypoints and compute features using SIFT
    t_image_keypoints, features_train_image = image_stitching.sift_detector(t_image_gray)
    q_image_keypoints, features_query_image = image_stitching.sift_detector(q_image_gray)

    # SIFT yields no descriptor array for an image without keypoints; the
    # matcher cannot handle that, so report the failure through the usual path.
    if features_train_image is None or features_query_image is None:
        return failure_message

    # Match keypoints between the two images
    matches = image_stitching.create_and_match_keypoints(features_train_image, features_query_image)

    # Visualize the keypoint matches
    mapped_feature_image = cv2.drawMatches(
        t_image,
        t_image_keypoints,
        q_image,
        q_image_keypoints,
        matches[:100],  # Show the top 100 matches
        None,
        flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS
    )

    # Compute the homography matrix to map images to a common plane
    M = image_stitching.compute_homography(t_image_keypoints, q_image_keypoints, matches, reprojThresh=4)
    if M is None:
        return failure_message

    (_, homography_matrix, _) = M
    if homography_matrix is None:
        # The estimator may fail to produce a matrix even with enough matches.
        return failure_message

    # Blend the images to create the panorama
    result = image_stitching.blending_smoothing(q_image, t_image, homography_matrix)

    # cvtColor needs float32 (not float64) input; swap channels for output
    mapped_float_32 = np.float32(mapped_feature_image)
    result_float32 = np.float32(result)
    result_rgb = cv2.cvtColor(result_float32, cv2.COLOR_BGR2RGB)
    mapped_feature_image_rgb = cv2.cvtColor(mapped_float_32, cv2.COLOR_BGR2RGB)

    return result_rgb, mapped_feature_image_rgb


In [28]:
import numpy as np
import cv2


def stitch_images(image_list, no_of_images):
    """
    Stitches two images from a provided list using the forward_pass function
    to create a panoramic image.

    Args:
        image_list (list): List of input images (numpy arrays).
        no_of_images (int): Number of images to consider. The images at
                            indices `no_of_images - 2` (query/left) and
                            `no_of_images - 1` (train/right) are stitched,
                            i.e. the last two when it equals len(image_list).

    Returns:
        tuple:
            - result_rgb (numpy.ndarray): The panorama as uint8. forward_pass
              has already applied COLOR_BGR2RGB once; applying it again here
              restores the channel order of the input images.
            - mapped_image_rgb (numpy.ndarray): Visualization of matched
              keypoints, converted the same way.

    Raises:
        ValueError: If `no_of_images` is less than 2, if the image list does
                    not contain enough images, or if forward_pass reports that
                    the pair could not be stitched.
    """
    # Ensure sufficient images are provided
    if no_of_images < 2:
        raise ValueError("At least two images are required for stitching.")
    if len(image_list) < no_of_images:
        raise ValueError("The number of images in the list is less than `no_of_images`.")

    # Perform stitching using the last two images in the list
    outcome = forward_pass(
        q_image=image_list[no_of_images - 2],
        t_image=image_list[no_of_images - 1],
    )

    # forward_pass signals failure with a message string instead of a tuple;
    # surface it directly rather than letting tuple-unpacking fail obscurely.
    if isinstance(outcome, str):
        raise ValueError(f"Stitching failed: {outcome}")

    result, mapped_image = outcome

    # Convert mapped image to uint8 and swap channels for visualization
    mapped_image_int8 = np.uint8(mapped_image)
    mapped_image_rgb = cv2.cvtColor(mapped_image_int8, cv2.COLOR_BGR2RGB)

    # Convert result image to uint8 and swap channels for output
    result_int8 = np.uint8(result)
    result_rgb = cv2.cvtColor(result_int8, cv2.COLOR_BGR2RGB)

    return result_rgb, mapped_image_rgb


In [29]:
import os
import cv2


def panorama_main(image_list):
    """
    Main function to create a panoramic image by stitching multiple images together.

    Images are folded left to right: the running panorama is stitched with the
    next image in the list until all images are consumed. The final panorama
    and the keypoint-match visualization of the last stitched pair are then
    written to disk.

    Args:
        image_list (list): List of input images (numpy arrays) to be stitched sequentially.

    Outputs:
        - The final panoramic image is saved as `../Panorama/Panorama.jpg`.
        - The keypoint mapping visualization is saved as `../Keypoints/mapped_image.jpg`.

    Raises:
        ValueError: If the `image_list` is empty or contains fewer than two images.
        OSError: If either output image could not be written.
    """
    # Ensure the image list is valid
    if len(image_list) < 2:
        raise ValueError("At least two images are required to create a panorama.")

    # Initialize the result with the first image
    result = image_list[0]

    for next_image in image_list[1:]:
        # Stitch the running panorama (left) with the next image (right)
        result, mapped_image = stitch_images([result, next_image], 2)

    # Create directories to save the outputs
    os.makedirs("../Panorama", exist_ok=True)
    os.makedirs("../Keypoints", exist_ok=True)

    # cv2.imwrite reports failure by returning False rather than raising, so
    # check it before announcing success.
    outputs = (
        ("../Panorama/Panorama.jpg", result),
        ("../Keypoints/mapped_image.jpg", mapped_image),
    )
    for output_path, output_image in outputs:
        if not cv2.imwrite(output_path, output_image):
            raise OSError(f"Could not write image to: {output_path}")

    print("Panoramic image saved!")
    print("Keypoint mapping saved!")


In [30]:
import os
import cv2


# Gather the source photos S1.jpg ... S10.jpg from the panorama folder
image_dir = "../Panorama"
image_list = []

for i in range(10):
    scene_name = "S" + str(i + 1)
    image_path = image_dir + "/" + scene_name + ".jpg"
    img = cv2.imread(image_path)

    # cv2.imread returns None for a missing or unreadable file
    if img is None:
        print(f"Image {scene_name}.jpg could not be loaded. Skipping.")
        continue

    image_list.append(img)

# Stitching needs at least two successfully loaded images
if len(image_list) <= 1:
    print("Not enough images to create a panorama. Ensure at least two valid images are present.")
else:
    panorama_main(image_list)


Panoramic image saved!
Keypoint mapping saved!


In [31]:
import cv2

# Path to the panoramic image written by panorama_main
image_path = "../Panorama/Panorama.jpg"

# Run the YOLO model on the stitched panorama
results = model(image_path)

# Re-read the panorama from disk and draw one box per allowed class on it
image_with_bboxes = draw_bounding_box(results, image_path)

# Save the annotated panorama next to the original.
# NOTE(review): cv2.imwrite's return value is not checked, so the message
# below is printed even if the write fails.
output_path = "../Panorama/Panorama_bb.jpg"
cv2.imwrite(output_path, image_with_bboxes)

print(f"Panoramic image with bounding boxes saved at: {output_path}")


image 1/1 /Users/saifal-dinali/Desktop/Projects/Python Projects/OpenCV Projects/Image-Recognition-Project/Code/../Panorama/Panorama.jpg: 352x640 1 cup, 2 forks, 1 knife, 1 spoon, 1 remote, 1 cell phone, 1 book, 1 scissors, 1 toothbrush, 740.3ms
Speed: 4.4ms preprocess, 740.3ms inference, 1.2ms postprocess per image at shape (1, 3, 352, 640)
Panoramic image with bounding boxes saved at: ../Panorama/Panorama_bb.jpg
