In [None]:
import torch
import requests
from pathlib import Path
from PIL import Image

#Import RTDETR
from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor

token = "Insert Hugging Face Token Here"

#Load pretrained image processor and model
image_processor = RTDetrImageProcessor.from_pretrained(
    "PekingU/rtdetr_v2_r50vd",
    token=token
)

model = RTDetrV2ForObjectDetection.from_pretrained(
    "PekingU/rtdetr_v2_r50vd",
    token=token
)

In [22]:
#Inference function that takes in an image, image processor, and model and returns the results
def inference(imgs, img_proc, model):
    test_imgs = imgs
    #model.eval()
    with torch.no_grad():
        #The image processor packs the images into a dictionary
        trans_img = img_proc(images=imgs, return_tensors='pt')
        
        #The images can be unpacked via the ** operator
        results = model(**trans_img)

        return results
       


In [23]:
#Import/Get dataset to run inference on (Must be PIL Images,Tensors, or Arrays)
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp'}

#Get images from a image folder
def get_images_from_folder(folder_path):
    folder = Path(folder_path)
    
    if not folder.exists():
        raise FileNotFoundError(f"Folder not found: {folder_path}")
    
    # Collect all image files
    image_files = []
    for file_path in folder.iterdir():
        if file_path.suffix.lower() in IMAGE_EXTENSIONS:
            image_files.append(file_path)
    
    return sorted(image_files)  # Sort for consistent order

folder_paths = get_images_from_folder(r"C:\Users\Bo_jr\Documents\Datasets\SampleDataset")

for path in folder_paths:
    image = Image.open(path).convert('RGB')
    


In [25]:
#Test model quality

#Get images from sample dataset and run inference
outputs = []

folder_paths = get_images_from_folder(r"C:\Users\Bo_jr\Documents\Datasets\SampleDataset")
for path in folder_paths:
    image = Image.open(path).convert('RGB')
    output = inference(image, image_processor, model)
    #Converts the raw output into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) and append the output to the outputs list
    outputs.append(image_processor.post_process_object_detection(output, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.5))

#Unpack the outputs and print the results
for result in outputs:
    for res in result:
        #Simultaneously stores each detection result into their respective variables
        for score, label, box in zip(res["scores"], res["labels"], res["boxes"]):
            if box.numel() != 0:
                #Convert the tensor values to scalars
                score, label = score.item() * 100, label.item()
                
                #Convert box values to floats
                box = [round(i, 2) for i in box.tolist()]
                print(f"{model.config.id2label[label]}: {score:.2f}% {box}")
        
   

vase: 63.55% [15.18, 0.61, 40.28, 63.5]
broccoli: 70.25% [0.02, 10.58, 64.02, 56.08]
diningtable: 52.46% [-0.0, 0.1, 64.0, 63.71]
person: 64.26% [8.69, 16.39, 27.23, 47.81]
orange: 62.54% [6.22, 36.39, 24.7, 49.43]
apple: 60.62% [31.09, 48.29, 45.0, 61.66]
orange: 57.01% [22.42, 31.72, 33.13, 42.16]
orange: 55.38% [-0.0, 20.42, 33.34, 52.17]
orange: 53.97% [0.01, 22.79, 9.07, 36.14]
orange: 51.59% [4.27, 28.99, 18.28, 42.06]
orange: 50.90% [-0.0, 35.93, 5.1, 47.92]
orange: 50.45% [5.18, 28.94, 18.26, 38.99]
person: 75.53% [11.45, 5.31, 30.49, 63.82]
vase: 53.16% [5.38, 8.69, 56.69, 60.76]
keyboard: 78.52% [0.03, 29.24, 64.03, 49.71]
keyboard: 55.41% [-0.0, 0.08, 64.0, 28.69]
person: 76.21% [0.07, 0.06, 61.3, 63.91]
sports ball: 62.50% [32.83, 25.87, 49.06, 50.14]
person: 78.03% [-0.03, 0.05, 63.97, 63.8]
person: 82.41% [17.1, 14.96, 33.42, 63.95]
person: 81.12% [8.82, 18.94, 21.39, 63.93]
person: 69.76% [0.03, 17.58, 9.82, 63.8]
person: 63.36% [53.1, 17.86, 64.0, 63.91]
person: 57.00% 

In [28]:

import cv2

#Open webcam
rtv = cv2.VideoCapture(0)

#Check if webcam is opened
if not rtv.isOpened():
    print("Error: Could not open webcam")
    exit()

#Read frames from webcam
while True:
    ret, frame = rtv.read()
        

    #Run inference with the model
    output = inference(frame, image_processor, model)

    #Converts the raw output into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format
    results = image_processor.post_process_object_detection(output, target_sizes=torch.tensor([(frame.shape[1], frame.shape[0])]), threshold=0.5)


    for result in results:
        #Unpack the each list simultaneously and pair each result with one another
        for score, label, box in zip(result['scores'], result['labels'], result['boxes']):
            #Convert each tensor value into a scalar
            score, label = score.item() * 100, label.item()
            box = [round(i, 2) for i in box.int().tolist()]
            #Print label: Confidence score, and Bounding Box Coordinates
            print(f"{model.config.id2label[label]}: {score:.2f} {box}")

            #Add Bounding Boxes to live video feed
            label = model.config.id2label[label]
            threshold = 0.5
    
            for i in range(len(result["boxes"])):
                #If the confidence score is above the threshold, draw the bounding box
                if score > threshold:
                    x1, y1, x2, y2 = box

                #Draw rectangles around objects
                cv2.rectangle(frame, (x1,y1), (x2,y2), (0,255,0), 2)
                cv2.putText(frame, label, (x1, y1 - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)
                
                #Display camera feed
                cv2.imshow('Webcam', frame)

    #Press 'q' to quit
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

#Release webcam
#Always release the webcam after opening it
rtv.release()
cv2.destroyAllWindows()


person: 97.28 [0, 63, 480, 633]
sofa: 75.28 [0, 239, 307, 580]
person: 96.69 [0, 113, 480, 634]
sofa: 64.81 [0, 275, 281, 555]
