In [17]:
import os

import cv2
from ultralytics import YOLO

# List of the 10 specific classes to detect; every other class the model
# reports is ignored by calculate_results() and draw_bounding_box().
allowed_classes = ['cell phone', 'remote', 'knife', 'book', 'spoon', 'cup', 'scissors', 'fork', 'toothbrush', 'ball']
model = YOLO('yolo11x.pt')  # detector shared by every cell below; loaded from the 'yolo11x.pt' weights

# Predefined unique colors for the 10 classes.
# These tuples are passed unchanged to cv2.rectangle / cv2.putText, which read
# them in the image's channel order (BGR for images loaded with cv2.imread).
box_colors = {
    'cell phone': (255, 0, 0),
    'remote': (0, 255, 0),
    'knife': (0, 0, 255),
    'book': (255, 255, 0),
    'spoon': (255, 0, 255),
    'cup': (0, 255, 255),
    'scissors': (128, 0, 128),
    'fork': (0, 128, 128),
    'toothbrush': (128, 128, 0),
    'ball': (0, 128, 0)
}

# Expected objects for every scene image.
# Keys are '<scene>_<view>' names (the detection loop also uses them to build
# the image file names); values are the object identifiers returned by
# map_object_name() ('O1' = cell phone ... 'O10' = ball).
# Each scene contains the previous scene's objects plus one new object, and
# the three views of a scene share the same expected list.
ground_truth = {
    'S1_front': ['O1'],
    'S1_left': ['O1'],
    'S1_right': ['O1'],
    'S2_front': ['O1', 'O5'],
    'S2_left': ['O1', 'O5'],
    'S2_right': ['O1', 'O5'],
    'S3_front': ['O1', 'O2', 'O5'],
    'S3_left': ['O1', 'O2', 'O5'],
    'S3_right': ['O1', 'O2', 'O5'],
    'S4_front': ['O1', 'O2', 'O3', 'O5'],
    'S4_left': ['O1', 'O2', 'O3', 'O5'],
    'S4_right': ['O1', 'O2', 'O3', 'O5'],
    'S5_front': ['O1', 'O2', 'O3', 'O5', 'O6'],
    'S5_left': ['O1', 'O2', 'O3', 'O5', 'O6'],
    'S5_right': ['O1', 'O2', 'O3', 'O5', 'O6'],
    'S6_front': ['O1', 'O2', 'O3', 'O5', 'O6', 'O8'],
    'S6_left': ['O1', 'O2', 'O3', 'O5', 'O6', 'O8'],
    'S6_right': ['O1', 'O2', 'O3', 'O5', 'O6', 'O8'],
    'S7_front': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O8'],
    'S7_left': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O8'],
    'S7_right': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O8'],
    'S8_front': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8'],
    'S8_left': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8'],
    'S8_right': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8'],
    'S9_front': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O10'],
    'S9_left': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O10'],
    'S9_right': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O10'],
    'S10_front': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O9', 'O10'],
    'S10_left': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O9', 'O10'],
    'S10_right': ['O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O9', 'O10'],
}

# Detections per scene, filled in by calculate_results():
# scene name -> list of detected object identifiers. Read by get_accuracy().
resulting_accuracy = {}

In [18]:
def map_object_name(name):
    """Translate a detector class name into its object identifier.

    Args:
        name (str): One of the ten supported class names, e.g. ``'cell phone'``.

    Returns:
        str: The identifier paired with ``name`` (``'O1'`` ... ``'O10'``).

    Raises:
        KeyError: If ``name`` is not one of the ten supported class names.
    """
    # (class name, object identifier) pairs, in identifier order.
    name_id_pairs = (
        ('cell phone', 'O1'),
        ('remote', 'O2'),
        ('knife', 'O3'),
        ('book', 'O4'),
        ('spoon', 'O5'),
        ('cup', 'O6'),
        ('scissors', 'O7'),
        ('fork', 'O8'),
        ('toothbrush', 'O9'),
        ('ball', 'O10'),
    )
    identifier_by_name = dict(name_id_pairs)
    return identifier_by_name[name]

In [19]:
def calculate_results(results, scene_name):
    """Record which allowed objects were detected in a scene and log them.

    Walks the boxes of the first result, keeps the first occurrence of every
    class listed in ``allowed_classes``, stores the matching object
    identifiers in ``resulting_accuracy[scene_name]`` and appends a summary
    of the scene to ``../testing.txt``.

    Args:
        results: Detection results as returned by calling ``model`` on an
            image; only ``results[0].boxes`` is read.
        scene_name (str): Key under which the detections are stored and the
            name written to the log file.

    Side effects:
        * ``resulting_accuracy[scene_name]`` is replaced with the identifiers
          found in this call, so re-running a scene does not accumulate
          duplicates from earlier runs. A scene without any allowed
          detection gets an empty list instead of no entry at all.
        * Two lines plus a blank line are appended to ``../testing.txt``.
    """
    detected_numbers = []
    # Tracks classes already recorded. It has to live outside the loop:
    # re-creating it per box would make the duplicate check always pass.
    seen_classes = set()

    for box in results[0].boxes:
        name = model.names[int(box.cls.item())]

        # Only record allowed classes, and each class at most once per scene.
        if name in allowed_classes and name not in seen_classes:
            seen_classes.add(name)
            detected_numbers.append(map_object_name(name))

    # Always store an entry (possibly empty) so the log write below and
    # get_accuracy() can look the scene up without a KeyError.
    resulting_accuracy[scene_name] = detected_numbers

    with open('../testing.txt', 'a') as f:
        f.write(f'Scene: {scene_name}\n')
        f.write('Detected Objects: {}\n\n'.format(', '.join(
            sorted(detected_numbers)
        )))
            


In [20]:
def get_accuracy():
    """Print overall precision, recall, F1-score and accuracy for all scenes.

    For every scene in ``ground_truth`` the expected and detected object
    identifiers are compared as sets (presence/absence per object, so
    repeated detections of one object count once):

    * TP - objects expected and detected
    * FP - objects detected but not expected
    * FN - objects expected but not detected
    * TN - remaining classes: ``len(allowed_classes) - TP - FP - FN``

    The counts are summed over all scenes before the metrics are computed.
    A scene with no entry in ``resulting_accuracy`` is treated as having no
    detections. Each metric falls back to 0 when its denominator is 0.

    Returns:
        None. The four metrics are printed as percentages.
    """
    results_per_scene = []
    for scene_name, actual_objects in ground_truth.items():
        # .get(): a scene that was never processed (or produced no entry)
        # counts as "nothing detected" instead of raising KeyError.
        detected = set(resulting_accuracy.get(scene_name, []))
        actual = set(actual_objects)

        TP = len(actual & detected)
        FP = len(detected - actual)
        FN = len(actual - detected)
        TN = len(allowed_classes) - TP - FP - FN

        results_per_scene.append({
            'Scene': scene_name,
            'TP': TP,
            'FP': FP,
            'TN': TN,
            'FN': FN
        })

    # Calculate overall precision, recall, F1-score, and accuracy
    total_TP = sum(res['TP'] for res in results_per_scene)
    total_FP = sum(res['FP'] for res in results_per_scene)
    total_TN = sum(res['TN'] for res in results_per_scene)
    total_FN = sum(res['FN'] for res in results_per_scene)
    total = total_TP + total_FP + total_TN + total_FN

    precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) > 0 else 0
    recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = (total_TP + total_TN) / total if total > 0 else 0

    print("\nOverall Results:")
    print("Precision: {:.2f}%".format(precision * 100))
    print("Recall: {:.2f}%".format(recall * 100))
    print("F1-Score: {:.2f}%".format(f1_score * 100))
    print("Accuracy: {:.2f}%".format(accuracy * 100))

In [21]:
def draw_bounding_box(results, image_path):
    """Draw one labelled bounding box per allowed class onto an image.

    The image is re-read from disk so the boxes are drawn on the original
    pixels. Only the first box of each class in ``allowed_classes`` is
    drawn; further boxes of the same class are skipped.

    Args:
        results: Detection results as returned by calling ``model`` on the
            image; only ``results[0].boxes`` is read.
        image_path (str): Path of the image the detections belong to.

    Returns:
        numpy array: The image as loaded by ``cv2.imread`` with the boxes
        and class names drawn on it.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot load ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None;
        # fail here with a clear message instead of inside a drawing call.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 2.5      # large text so labels stay readable on big photos
    text_thickness = 3
    box_thickness = 12    # thick outline for the same reason
    label_gap = 15        # pixels between the box edge and the text baseline

    # Set to keep track of already drawn classes
    drawn_classes = set()

    for box in results[0].boxes:
        name = model.names[int(box.cls.item())]

        # Only process allowed classes, and each class at most once.
        if name not in allowed_classes or name in drawn_classes:
            continue

        x1, y1, x2, y2 = map(int, box.xyxy[0])  # Bounding box coordinates
        color = box_colors[name]  # Predefined color for this class

        cv2.rectangle(image, (x1, y1), (x2, y2), color, box_thickness)

        # Put the name above the box; if the text would be cut off by the
        # top image border, place it just inside the box instead.
        (_, text_height), _ = cv2.getTextSize(name, font, font_scale, text_thickness)
        label_y = y1 - label_gap
        if label_y - text_height < 0:
            label_y = y1 + text_height + label_gap
        cv2.putText(image, name, (x1, label_y), font, font_scale, color, text_thickness)

        drawn_classes.add(name)

    return image

In [22]:
# Start from an empty log file; calculate_results() appends one entry per scene.
with open('../testing.txt', 'w'):
    pass

# cv2.imwrite does not create missing directories (it just returns False),
# so make sure the output folder exists before the loop.
output_dir = "../Detected Objects"
os.makedirs(output_dir, exist_ok=True)

# Iterate over the annotated scenes themselves. The number of scenes is
# independent of the number of allowed classes, so it must not be derived
# from len(allowed_classes). The keys of ground_truth are in the order
# S1_front, S1_left, S1_right, S2_front, ...
for scene_name in ground_truth:
    image_path = f"../Scenes/{scene_name}.jpg"

    # Perform object detection
    results = model(image_path)

    # Record the detections for the accuracy computation, then save an
    # annotated copy of the image.
    calculate_results(results, scene_name)
    image = draw_bounding_box(results, image_path)

    output_path = f"{output_dir}/{scene_name}_bb.jpg"
    if not cv2.imwrite(output_path, image):
        print(f"Warning: could not write {output_path}")


image 1/1 /Users/saifal-dinali/Desktop/Project/Code/../Scenes/S1_front.jpg: 640x480 1 cell phone, 306.5ms
Speed: 6.7ms preprocess, 306.5ms inference, 8.7ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/saifal-dinali/Desktop/Project/Code/../Scenes/S1_left.jpg: 640x480 1 cell phone, 258.1ms
Speed: 1.4ms preprocess, 258.1ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/saifal-dinali/Desktop/Project/Code/../Scenes/S1_right.jpg: 640x480 1 cell phone, 255.1ms
Speed: 1.4ms preprocess, 255.1ms inference, 0.3ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 /Users/saifal-dinali/Desktop/Project/Code/../Scenes/S2_front.jpg: 480x640 1 fork, 1 spoon, 1 cell phone, 258.4ms
Speed: 1.4ms preprocess, 258.4ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /Users/saifal-dinali/Desktop/Project/Code/../Scenes/S2_left.jpg: 480x640 1 spoon, 1 cell phone, 255.0ms
Speed: 1.3ms preprocess, 255.0ms inference, 0.5ms

In [23]:
# Print overall precision, recall, F1-score and accuracy, computed from the
# detections stored in resulting_accuracy against ground_truth.
get_accuracy()


Overall Results:
Precision: 99.30%
Recall: 86.06%
F1-Score: 92.21%
Accuracy: 92.00%


In [None]:
import numpy as np
import cv2


class ImageStitching:
    """Utilities required to stitch two overlapping images into a panorama.

    The methods are meant to be used in sequence: grayscale conversion,
    SIFT keypoint detection, brute-force descriptor matching, RANSAC
    homography estimation and finally mask-based blending.
    """

    def __init__(self, query_photo, train_photo, smoothing_window_percent=0.10):
        """Derive the width of the blending window from the image widths.

        Args:
            query_photo (numpy array): left image; only ``shape[1]`` is read.
            train_photo (numpy array): right image; only ``shape[1]`` is read.
            smoothing_window_percent (float): fraction of the narrower
                image's width used as blending window, expected in
                [0.00, 1.00]. Defaults to 0.10.
        """
        super().__init__()
        width_query_photo = query_photo.shape[1]
        width_train_photo = train_photo.shape[1]
        lowest_width = min(width_query_photo, width_train_photo)
        # Window width in pixels, clamped to the range [100, 1000].
        self.smoothing_window_size = max(
            100, min(smoothing_window_percent * lowest_width, 1000)
        )

    def convert_grayscale(self, image):
        """Return the image together with its grayscale version.

        Args:
            image (numpy array): 3-channel color image

        Returns:
            (image, photo_gray): the unchanged input and the grayscale image
        """
        # NOTE(review): COLOR_RGB2GRAY assumes RGB channel order. Images that
        # come straight from cv2.imread are BGR, for which COLOR_BGR2GRAY
        # would be the matching code - confirm what callers pass in.
        photo_gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

        return image, photo_gray

    @staticmethod
    def _sift_detector(image):
        """Applies SIFT algorithm to the given image

        Args:
            image (numpy array): input image

        Returns:
            keypoints, features
        """
        descriptor = cv2.SIFT_create()
        keypoints, features = descriptor.detectAndCompute(image, None)

        return keypoints, features

    def create_and_match_keypoints(self, features_train_image, features_query_image):
        """Creates and Matches keypoints from the SIFT features using Brute Force matching
        by checking the L2 norm of the feature vector

        Args:
            features_train_image: SIFT features of train image
            features_query_image: SIFT features of query image

        Returns:
            matches (List): cross-checked matches sorted by ascending
            distance; empty when either image has no features.
        """
        # detectAndCompute yields None descriptors for an image without
        # keypoints; the matcher cannot handle that, so report "no matches".
        if features_train_image is None or features_query_image is None:
            return []

        bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)

        best_matches = bf.match(features_train_image, features_query_image)
        raw_matches = sorted(best_matches, key=lambda x: x.distance)

        return raw_matches

    def compute_homography(
        self, keypoints_train_image, keypoints_query_image, matches, reprojThresh
    ):
        """Computes the Homography to map images to a single plane,
        uses RANSAC algorithm to find the best matches iteratively.

        Args:
            keypoints_train_image: keypoints found using SIFT in train image
            keypoints_query_image: keypoints found using SIFT in query image
            matches: matches found using Brute Force
            reprojThresh: threshold for error

        Returns:
            M (Tuple): (matches, Homography matrix, status), or None when
            there are fewer than 4 matches or no homography could be
            estimated.
        """
        # A homography has 8 degrees of freedom: at least 4 point pairs.
        if len(matches) < 4:
            print("Minimum match count not satisfied cannot get homopgrahy")
            return None

        # Matches were computed as match(train, query), so queryIdx indexes
        # the train keypoints and trainIdx indexes the query keypoints.
        points_train = np.float32(
            [keypoints_train_image[m.queryIdx].pt for m in matches]
        )
        points_query = np.float32(
            [keypoints_query_image[m.trainIdx].pt for m in matches]
        )

        H, status = cv2.findHomography(
            points_train, points_query, cv2.RANSAC, reprojThresh
        )

        if H is None:
            # findHomography returns no matrix when estimation fails; signal
            # that the same way as "too few matches" so callers need a
            # single check.
            print("Homography estimation failed")
            return None

        return (matches, H, status)

    def create_mask(self, query_image, train_image, version):
        """Creates the mask using query and train images for blending the images.

        The mask is a linear ramp between 0 and 1 over the smoothing window,
        which ends at the right edge of the query image.

        Args:
            query_image (numpy array)
            train_image (numpy array)
            version (str) == 'left_image' or 'right_image'

        Returns:
            mask (numpy array): float array of shape
            (query height, query width + train width, 3); the three channels
            are identical.
        """
        height_panorama = query_image.shape[0]
        width_query_photo = query_image.shape[1]
        width_panorama = width_query_photo + train_image.shape[1]

        # Half window width. Clamped so the window never starts left of
        # column 0 when the query image is narrower than the window (a
        # negative slice start would wrap around and break the assignment).
        offset = min(int(self.smoothing_window_size / 2), width_query_photo // 2)
        barrier = width_query_photo - offset
        ramp_start = barrier - offset
        ramp_end = barrier + offset

        mask = np.zeros((height_panorama, width_panorama))
        if version == "left_image":
            # Fully opaque left of the window, fading out across it.
            mask[:, ramp_start:ramp_end] = np.linspace(1, 0, 2 * offset)
            mask[:, :ramp_start] = 1
        else:
            # Fading in across the window, fully opaque right of it.
            mask[:, ramp_start:ramp_end] = np.linspace(0, 1, 2 * offset)
            mask[:, ramp_end:] = 1
        return np.dstack((mask, mask, mask))

    def blending_smoothing(self, query_image, train_image, homography_matrix):
        """blends both query and train image via the homography matrix,
        and ensures proper blending and smoothing using masks created in create_mask()
        to give a seamless panorama.

        Args:
            query_image (numpy array)
            train_image (numpy array)
            homography_matrix (numpy array): Homography to map images to a single plane

        Returns:
            panoramic image (numpy array), cropped to the bounding box of
            its non-black pixels

        Raises:
            ValueError: if homography_matrix is None
        """
        if homography_matrix is None:
            raise ValueError("homography_matrix is None; cannot warp the train image")

        height_panorama = query_image.shape[0]
        width_panorama = query_image.shape[1] + train_image.shape[1]

        # Left part: the query image on a black canvas, faded out on its
        # right side.
        panorama1 = np.zeros((height_panorama, width_panorama, 3))
        mask1 = self.create_mask(query_image, train_image, version="left_image")
        panorama1[0 : query_image.shape[0], 0 : query_image.shape[1], :] = query_image
        panorama1 *= mask1

        # Right part: the train image warped into the query image's plane,
        # faded in over the same window.
        mask2 = self.create_mask(query_image, train_image, version="right_image")
        panorama2 = (
            cv2.warpPerspective(
                train_image, homography_matrix, (width_panorama, height_panorama)
            )
            * mask2
        )
        result = panorama1 + panorama2

        # Remove extra blackspace. A pixel counts as content when ANY
        # channel is non-zero; testing one channel only would crop away
        # real pixels whose value in that channel happens to be 0.
        content = np.any(result != 0, axis=2)
        rows = np.where(content.any(axis=1))[0]
        cols = np.where(content.any(axis=0))[0]
        if rows.size == 0:
            # Completely black result: nothing to crop to.
            return result

        final_result = result[rows[0] : rows[-1] + 1, cols[0] : cols[-1] + 1, :]

        return final_result

In [None]:
import cv2
import numpy as np


def forward(query_photo, train_photo):
    """Runs a forward pass of the stitching pipeline using ImageStitching.

    Takes in a query image and train image, detects and matches SIFT
    features, estimates the homography and blends both images into a
    panoramic image.

    Args:
        query_photo (numpy array): query image (left image)
        train_photo (numpy array): train image (right image)

    Returns:
        Tuple of two float32 numpy arrays, each passed through
        ``cv2.COLOR_BGR2RGB``:
            result_rgb: the blended panorama
            mapped_feature_image_rgb: visualisation of the 100 best matches

    Raises:
        ValueError: if no homography could be computed (message
            "Error cannot stitch images"). Callers unpack the return value
            into two names, so failure is raised instead of returning a
            message string in place of the tuple.
    """
    image_stitching = ImageStitching(query_photo, train_photo)
    _, query_photo_gray = image_stitching.convert_grayscale(query_photo)  # left image
    _, train_photo_gray = image_stitching.convert_grayscale(train_photo)  # right image

    keypoints_train_image, features_train_image = image_stitching._sift_detector(
        train_photo_gray
    )
    keypoints_query_image, features_query_image = image_stitching._sift_detector(
        query_photo_gray
    )

    matches = image_stitching.create_and_match_keypoints(
        features_train_image, features_query_image
    )

    M = image_stitching.compute_homography(
        keypoints_train_image, keypoints_query_image, matches, reprojThresh=4
    )

    # M is None when there are too few matches; M[1] is None when the
    # homography estimation itself produced no matrix.
    if M is None or M[1] is None:
        raise ValueError("Error cannot stitch images")

    matches, homography_matrix, _status = M

    # Drawn only after the homography check so no work is wasted on failure.
    mapped_feature_image = cv2.drawMatches(
        train_photo,
        keypoints_train_image,
        query_photo,
        keypoints_query_image,
        matches[:100],
        None,
        flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
    )

    result = image_stitching.blending_smoothing(
        query_photo, train_photo, homography_matrix
    )

    # cvtColor does not accept the float64 blend result, hence the float32 casts.
    mapped_float_32 = np.float32(mapped_feature_image)
    result_float32 = np.float32(result)
    result_rgb = cv2.cvtColor(result_float32, cv2.COLOR_BGR2RGB)
    mapped_feature_image_rgb = cv2.cvtColor(mapped_float_32, cv2.COLOR_BGR2RGB)

    return result_rgb, mapped_feature_image_rgb


In [26]:
import numpy as np
import cv2


def stich_images(image_list, no_of_images):
    """Stitch the last two of the first ``no_of_images`` images.

    Args:
        image_list (list): images; element ``no_of_images - 2`` is used as
            query photo and element ``no_of_images - 1`` as train photo.
        no_of_images (int): number of images of ``image_list`` to consider.

    Returns:
        Tuple ``(result, mapped_image)``: the outputs of ``forward`` cast to
        uint8 and passed through ``cv2.COLOR_BGR2RGB``.
    """
    left_photo = image_list[no_of_images - 2]
    right_photo = image_list[no_of_images - 1]
    panorama, match_preview = forward(query_photo=left_photo, train_photo=right_photo)

    def _as_uint8_channel_swapped(picture):
        # Cast to uint8 first, then swap the first and third channels.
        return cv2.cvtColor(np.uint8(picture), cv2.COLOR_BGR2RGB)

    match_preview_out = _as_uint8_channel_swapped(match_preview)
    panorama_out = _as_uint8_channel_swapped(panorama)
    return panorama_out, match_preview_out

In [None]:
import os
import cv2


def panorama_main(image_list):
    """Stitch a sequence of images into one panorama and save it.

    The images are stitched left to right: the running panorama is used as
    query image and the next list element as train image. The final image
    is written to ``../Panorama/Panorama.jpg`` (the folder is created when
    missing).

    Args:
        image_list (list): images as numpy arrays, ordered left to right.
            A single image is saved unchanged.

    Raises:
        ValueError: if ``image_list`` is empty.
        OSError: if the panorama could not be written.
    """
    if len(image_list) == 0:
        raise ValueError("image_list must contain at least one image")

    output_dir = "../Panorama"
    output_path = f"{output_dir}/Panorama.jpg"

    result = image_list[0]
    for next_image in image_list[1:]:
        # Extend the running panorama by the next image; the match
        # visualisation returned alongside it is not needed here.
        result, _mapped_image = stich_images([result, next_image], 2)

    # Save the result to the output folder
    os.makedirs(output_dir, exist_ok=True)
    if not cv2.imwrite(output_path, result):
        raise OSError(f"Failed to write panorama to {output_path}")

    # Report the path the file was actually written to.
    print(f"Panoramic image saved at: {output_path}")

In [None]:
# Collect S1.jpg ... S10.jpg from image_dir in numeric order and stitch
# whatever could be loaded into one panorama.
image_dir = "../Temp3"
image_list = []
for i in range(10):
    scene_name = f"S{i + 1}"
    image_path = f"{image_dir}/{scene_name}.jpg"
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for a file it cannot load; skip it.
        continue
    image_list.append(img)

panorama_main(image_list)

Panoramic image saved at: outputs/panorama_image.jpg


In [41]:
# Run the detector on the stitched panorama written by panorama_main().
image_path = f"../Panorama/Panorama.jpg"

# Perform object detection
results = model(image_path)

# Draw one labelled box per allowed class onto the panorama
# (draw_bounding_box re-reads the image from image_path).
image = draw_bounding_box(results, image_path)

# Save the annotated panorama next to the original. As the last expression of
# the cell, the boolean returned by cv2.imwrite is shown as the cell output.
output_path = f"../Panorama/Panorama_bb.jpg"
cv2.imwrite(output_path, image)


image 1/1 /Users/saifal-dinali/Desktop/Project/Code/../Panorama/Panorama.jpg: 352x640 1 cup, 2 forks, 1 knife, 1 spoon, 1 remote, 1 cell phone, 1 book, 1 scissors, 1 toothbrush, 738.8ms
Speed: 12.8ms preprocess, 738.8ms inference, 11.7ms postprocess per image at shape (1, 3, 352, 640)


True