In [None]:
import os
from pathlib import Path

import requests
import torch
from PIL import Image

#Import RTDETR
from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor

#Read the Hugging Face token from the HF_TOKEN environment variable instead of
#hardcoding it in the notebook. An unset or empty variable becomes None, which
#means anonymous access.
token = os.environ.get("HF_TOKEN") or None

#Single source of truth for the checkpoint so processor and model cannot drift apart
MODEL_ID = "PekingU/rtdetr_v2_r50vd"

#Load pretrained image processor and model
image_processor = RTDetrImageProcessor.from_pretrained(
    MODEL_ID,
    token=token
)

model = RTDetrV2ForObjectDetection.from_pretrained(
    MODEL_ID,
    token=token
)

In [22]:
#Find image and load image as an input to our image processor
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'

#A timeout keeps the cell from hanging forever on a stalled connection, and
#raise_for_status() surfaces HTTP errors before PIL tries to parse the body.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    #convert() forces the pixel data to be read while the connection is still open
    image = Image.open(response.raw).convert('RGB')

inputs = image_processor(images=image, return_tensors='pt')


In [11]:
#Import/Get dataset to run inference on (Must be PIL Images,Tensors, or Arrays)
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif', '.webp'}

#Get images from a image folder
def get_images_from_folder(folder_path):
    """Return the image files directly inside ``folder_path``, sorted by path.

    A file counts as an image when its suffix, compared case-insensitively,
    is in ``IMAGE_EXTENSIONS``. Sub-folders are not searched, and directories
    whose name merely ends in an image suffix are skipped.

    Args:
        folder_path: Folder to scan (str or path-like).

    Returns:
        A sorted list of ``pathlib.Path`` objects, one per image file.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        NotADirectoryError: If ``folder_path`` exists but is not a folder.
    """
    folder = Path(folder_path)

    if not folder.exists():
        raise FileNotFoundError(f"Folder not found: {folder_path}")
    if not folder.is_dir():
        raise NotADirectoryError(f"Not a folder: {folder_path}")

    # Sort for consistent order; is_file() keeps out directories named like images
    return sorted(
        file_path
        for file_path in folder.iterdir()
        if file_path.is_file() and file_path.suffix.lower() in IMAGE_EXTENSIONS
    )

#Collect the sample dataset image paths
folder_paths = get_images_from_folder(r"C:\Users\Bo_jr\Documents\Datasets\SampleDataset")

#Sanity check: make sure every file can be opened and converted to RGB.
#A dedicated name is used so the global `image` (the URL sample image that a
#later cell reads for its target size) is not overwritten when the notebook
#is run top to bottom.
for path in folder_paths:
    with Image.open(path) as sample_file:
        sample_image = sample_file.convert('RGB')
    


In [12]:
#Inference function that takes in an image, image processor, and model and returns the results
def inference(imgs, img_proc, model):
    """Preprocess ``imgs`` and run them through ``model`` without gradients.

    Args:
        imgs: Image or images, passed to the processor as ``images=``.
        img_proc: Callable image processor; called with ``images=imgs`` and
            ``return_tensors='pt'`` and expected to return a mapping of
            keyword arguments for the model.
        model: Callable model; receives the processor output unpacked with ``**``.

    Returns:
        Whatever ``model`` returns (the raw, un-post-processed output).
    """
    with torch.no_grad():
        #The image processor packs the images into a dictionary
        trans_img = img_proc(images=imgs, return_tensors='pt')

        #The images can be unpacked via the ** operator
        return model(**trans_img)
       


In [None]:
#Get images from sample dataset and run inference
outputs = []

folder_paths = get_images_from_folder(r"C:\Users\Bo_jr\Documents\Datasets\SampleDataset")
for path in folder_paths:
    #Use a dedicated name so the global `image` (the URL sample image that a
    #later cell reads for its target size) is not overwritten by the dataset loop
    with Image.open(path) as sample_file:
        sample_image = sample_file.convert('RGB')
    output = inference(sample_image, image_processor, model)
    #The post-processor is given the original size as (height, width) to rescale the boxes
    target_sizes = torch.tensor([(sample_image.height, sample_image.width)])
    #Converts the raw output into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) and append the output to the outputs list
    outputs.append(image_processor.post_process_object_detection(output, target_sizes=target_sizes, threshold=0.5))

#Unpack the outputs and print the results
for result in outputs:
    #Each entry is the list returned by the post-processor for one call
    for res in result:
        #Simultaneously stores each detection result into their respective variables.
        #With no detections the zip is empty, so no extra emptiness check is needed.
        for score, label, box in zip(res["scores"], res["labels"], res["boxes"]):
            #Convert the tensor values to scalars
            score, label = score.item(), label.item()
            #Convert box values to floats
            box = [round(i, 2) for i in box.tolist()]
            print(f"{model.config.id2label[label]}: {score:.2f} {box}")
        
   

vase: 0.64 [15.18, 0.61, 40.28, 63.5]
broccoli: 0.70 [0.02, 10.58, 64.02, 56.08]
diningtable: 0.52 [-0.0, 0.1, 64.0, 63.71]
person: 0.64 [8.69, 16.39, 27.23, 47.81]
orange: 0.63 [6.22, 36.39, 24.7, 49.43]
apple: 0.61 [31.09, 48.29, 45.0, 61.66]
orange: 0.57 [22.42, 31.72, 33.13, 42.16]
orange: 0.55 [-0.0, 20.42, 33.34, 52.17]
orange: 0.54 [0.01, 22.79, 9.07, 36.14]
orange: 0.52 [4.27, 28.99, 18.28, 42.06]
orange: 0.51 [-0.0, 35.93, 5.1, 47.92]
orange: 0.50 [5.18, 28.94, 18.26, 38.99]
person: 0.76 [11.45, 5.31, 30.49, 63.82]
vase: 0.53 [5.38, 8.69, 56.69, 60.76]
keyboard: 0.79 [0.03, 29.24, 64.03, 49.71]
keyboard: 0.55 [-0.0, 0.08, 64.0, 28.69]
person: 0.76 [0.07, 0.06, 61.3, 63.91]
sports ball: 0.63 [32.83, 25.87, 49.06, 50.14]
person: 0.78 [-0.03, 0.05, 63.97, 63.8]
person: 0.82 [17.1, 14.96, 33.42, 63.95]
person: 0.81 [8.82, 18.94, 21.39, 63.93]
person: 0.70 [0.03, 17.58, 9.82, 63.8]
person: 0.63 [53.1, 17.86, 64.0, 63.91]
person: 0.57 [35.0, 14.76, 53.67, 63.79]
person: 0.93 [0.11, 

In [23]:
#Forward pass on the preprocessed sample image, then post-process and print the detections
from typing import Any


#Gradients are not needed for inference
with torch.no_grad():
    outputs = model(**inputs)

#The post-processor takes the original image size as (height, width) and
#returns boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format
target_sizes = torch.tensor([(image.height, image.width)])
results = image_processor.post_process_object_detection(
    outputs,
    target_sizes=target_sizes,
    threshold=0.5,
)

for result in results:
    detections = zip(result["scores"], result["labels"], result["boxes"])
    for score, label_id, box in detections:
        #Tensor scalars -> plain Python numbers
        score = score.item()
        label = label_id.item()
        #Box coordinates rounded to two decimals for display
        box = [round(i, 2) for i in box.tolist()]
        print(f"{model.config.id2label[label]}: {score:.2f} {box}")


cat: 0.96 [13.71, 54.12, 317.53, 472.65]
cat: 0.95 [343.73, 23.68, 640.28, 373.05]
sofa: 0.94 [0.2, 1.32, 640.17, 474.38]
remote: 0.93 [40.6, 73.21, 175.74, 118.33]
remote: 0.89 [333.51, 76.79, 370.17, 188.13]


In [26]:
#Draw Bounding Boxes using OpenCV
import cv2
import urllib.request
import numpy as np

#Read img Url
with urllib.request.urlopen(url) as urlresp:
    image_data = np.asarray(bytearray(urlresp.read()), dtype='uint8')
img = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
#imdecode returns None (no exception) when the bytes are not a decodable image
if img is None:
    raise ValueError(f"Could not decode image downloaded from {url}")

#Detections for the single sample image. The scores are already final
#per-detection confidences, so no softmax/max is applied, and the labels are
#class ids that map to names through the model config.
detections = results[0]
boxes = detections['boxes']
scores = detections['scores']
labels = detections['labels']

threshold = 0.5

for box, conf, cls in zip(boxes, scores, labels):
    conf = conf.item()
    #Skip detections below the drawing threshold
    if conf <= threshold:
        continue

    x1, y1, x2, y2 = box.int().tolist()

    label = f"{model.config.id2label[cls.item()]}: {conf:.2f}"

    #Draw rectangles around objects
    cv2.rectangle(img, (x1,y1), (x2,y2), (0,255,0), 2)
    #Keep the caption inside the image when the box touches the top edge
    cv2.putText(img, label, (x1, max(y1 - 5, 15)),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)

cv2.imwrite('prediction_results.jpg', img)
print('Results have been saved')


Results have been saved


In [None]:
#Open webcam
rtv = cv2.VideoCapture(0)

#Check if webcam is opened
if not rtv.isOpened():
    #Raise instead of calling exit(), which is not appropriate inside a notebook kernel
    raise RuntimeError("Error: Could not open webcam")

#try/finally guarantees the webcam is released even if inference raises
try:
    #Read frames from webcam
    while True:
        ret, frame = rtv.read()
        #read() reports failure through ret; stop instead of passing a bad frame on
        if not ret:
            print("Error: Could not read frame from webcam")
            break

        #OpenCV delivers BGR frames; convert to RGB before handing them to the model
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        #Run inference with the model
        output = inference(rgb_frame, image_processor, model)

        #Converts the raw output into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
        #target_sizes is (height, width), i.e. (frame.shape[0], frame.shape[1]).
        results = image_processor.post_process_object_detection(output, target_sizes=torch.tensor([(frame.shape[0], frame.shape[1])]), threshold=0.5)

        for result in results:
            #Unpack the each list simultaneously and pair each result with one another
            for score, label, box in zip(result['scores'], result['labels'], result['boxes']):
                #Convert each tensor value into a scalar
                score, label = score.item(), label.item()
                x1, y1, x2, y2 = box.int().tolist()

                #Draw the detection on the frame that is about to be displayed
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
                cv2.putText(frame, f"{model.config.id2label[label]}: {score:.2f}", (x1, max(y1 - 5, 15)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2)

        #Display frames (after drawing, so the boxes are visible)
        cv2.imshow('Webcam', frame)

        #Press 'q' to quit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    #Release webcam
    #Always release the webcam after opening it
    rtv.release()
    cv2.destroyAllWindows()


0.9545465111732483 0
0.7257509231567383 71
0.9597740173339844 0
0.7301617860794067 71
0.5973457098007202 39
0.5579811930656433 27
0.5157375335693359 56
0.9659205079078674 0
0.6817017793655396 71
0.5172863006591797 79
0.9665103554725647 0
0.6910728216171265 71
0.6284246444702148 56
0.96346515417099 0
0.6163796782493591 71
0.5151272416114807 39
0.9647098183631897 0
0.7187235951423645 71
0.9568052291870117 0
0.5888808369636536 39
0.5303278565406799 71


In [8]:
#Disabled draft: the helper below sits inside a string literal, so it is never
#defined or called; the cell only displays the string itself.
#NOTE(review): if this is re-enabled, the result of boxes.int() is discarded
#(it is not assigned back), and the return inside the loop exits after the
#first result.
'''
#Function that returns the box coordinates and the scores from the results
def get_boxes_n_scores(results):
    for result in results:
        boxes = result['boxes']
        boxes.int()
        scores = result['scores']
        return boxes, scores

get_boxes_n_scores(results)
#Convert boxes and scores to int values
'''

"\n#Function that returns the box coordinates and the scores from the results\ndef get_boxes_n_scores(results):\n    for result in results:\n        boxes = result['boxes']\n        boxes.int()\n        scores = result['scores']\n        return boxes, scores\n\nget_boxes_n_scores(results)\n#Convert boxes and scores to int values\n"

In [None]:
   #Working example code
   
    #NOTE(review): this snippet is indented as if it belongs inside another
    #block; presumably it is meant for the body of the webcam loop, since it
    #draws on `frame` — confirm before running it as a standalone cell.
    #Print every detection as "<class name>: <score> [x1, y1, x2, y2]"
    for result in results:
        for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
            if box.numel() != 0:
                score, label = score.item(), label.item()
                box = [round(i, 2) for i in box.tolist()]
                print(f"{model.config.id2label[label]}: {score:.2f} {box}")
    
    #Draw bounding boxes
    for result in results:
        for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
            if box.numel() != 0:
                #NOTE(review): con2xy is not defined in the visible part of this
                #notebook — confirm where it comes from and what it returns
                box = con2xy(box)
                #Integer pixel coordinates for OpenCV drawing
                x1, y1, x2, y2 = box.int().tolist()
                cv2.rectangle(frame, (x1,y1), (x2,y2), (0,255,0), 2)