In [2]:
import os
import cv2
from ultralytics import YOLO

In [3]:
# Dataset locations used by the cells below.
# rgb_source: directory whose image files are listed and annotated.
# depth_source: not referenced by any cell visible in this notebook.
# labels_destination: directory that receives one "<number>.txt" label file per image.
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
rgb_source = "/home/hassaan/Downloads/rgb_split/"
depth_source = "/home/hassaan/Downloads/dataset/images/"
labels_destination = "/home/hassaan/Downloads/dataset/labels/"

In [4]:
# Gather the file names found in the RGB directory and order them numerically
# by the part of the name before the first "." (so "10.png" follows "9.png").
rgb_list = sorted(
    os.listdir(rgb_source),
    key=lambda file_name: int(file_name.split(".")[0]),
)
rgb_list

['0.png', '1.png']

In [5]:
# Load the two YOLO checkpoints used by the helper functions below:
# pose_model supplies keypoints (human_annotations), object_model supplies
# bounding boxes (test_object_detection).
# The recorded output of this cell shows the weights being downloaded to these paths.
pose_model = YOLO("models/yolov8m-pose.pt")
object_model = YOLO("models/yolov8m.pt")

Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8m-pose.pt to 'models/yolov8m-pose.pt'...


100%|██████████| 50.8M/50.8M [00:03<00:00, 16.7MB/s]


Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8m.pt to 'models/yolov8m.pt'...


100%|██████████| 49.7M/49.7M [00:01<00:00, 27.3MB/s]


In [6]:
def test_object_detection(frame, height, width):

    bbox_results = object_model(frame)
    bboxes = []
    # Render the results
    for result in bbox_results:
        if result.boxes is not None:
            for box in result.boxes:
                label = object_model.names[int(box.cls[0])]
                confidence = box.conf[0]
                if label == "person" and confidence > 0.4:
                    bbox = box.xyxy[0].numpy().astype(int)

                    bboxes.append(bbox)
                    x1, y1, x2, y2 = bbox
                    while False:
                        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                        cv2.putText(
                            frame,
                            f"{label} {confidence:.2f}",
                            (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.5,
                            (0, 255, 0),
                            2,
                        )
                    x_center, y_center, box_width, box_height = (
                        (x1 + x2) / 2,
                        (y1 + y2) / 2,
                        x1 + x2,
                        y1 + y2,
                    )
                    return [
                        x_center / width,
                        y_center / height,
                        box_width / width,
                        box_height / height,
                    ], frame

In [7]:
def human_annotations(path, debug_image_path="./output.png"):
    """Build one flat annotation row (class, box, keypoints) for an image.

    Reads the image at ``path``, asks ``test_object_detection`` for a
    normalised person box, then runs ``pose_model`` and appends
    ``x / width, y / height, confidence`` for every keypoint of the first
    pose instance. Keypoints with confidence > 0 are also drawn on a debug
    image.

    Args:
        path: Path of the image to annotate.
        debug_image_path: Where the debug image with drawn keypoints is
            written; pass ``None`` to skip writing it.

    Returns:
        A list of plain Python numbers: ``[0, xc, yc, w, h, kx, ky, kconf, ...]``.
        An empty list is returned when no person box is found, so callers that
        serialise the list end up with an empty label.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``path``.
    """
    frame = cv2.imread(path)
    if frame is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"could not read image: {path}")
    height, width = frame.shape[:2]

    detection = test_object_detection(frame, height, width)
    # Accept both "no result" shapes: a bare None or a (None, frame) pair.
    bbox_results = detection[0] if detection is not None else None
    if bbox_results is None:
        return []
    frame = detection[1]

    pose_results = pose_model.predict(frame)
    keypoints = pose_results[0].keypoints

    # Plain floats keep the returned list independent of NumPy's scalar repr.
    annotations = [0] + [float(value) for value in bbox_results]

    # NOTE(review): the first pose instance is assumed to be the same person as
    # the detector's box; nothing here matches the two models' outputs — confirm.
    if keypoints is not None and len(keypoints) > 0:
        points = keypoints[0].cpu().numpy().data[0]
        for index, (x, y, score) in enumerate(points):
            x, y, score = float(x), float(y), float(score)
            if score > 0:
                cv2.circle(frame, (int(x), int(y)), 2, (0, 255, 0), -1)
                cv2.putText(
                    frame,
                    f"{index}-{int(score*100)}",
                    (int(x), int(y) - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (255, 0, 0),
                    1,
                )
            annotations += [x / width, y / height, score]

    # The frame comes from cv2.imread and goes to cv2.imwrite, both BGR, so no
    # colour conversion is applied (the former RGB2BGR step swapped red/blue).
    if debug_image_path is not None:
        cv2.imwrite(debug_image_path, frame)

    return annotations

In [8]:
# Try the annotation helper on a single image; the cell output is the
# returned annotation list.
path = "/home/hassaan/Downloads/SCREENSHOTS/nightCrawler.jpg"
human_annotations(path)


0: 544x640 1 person, 1 tie, 2351.1ms
Speed: 9.2ms preprocess, 2351.1ms inference, 206.2ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 1 person, 2125.6ms
Speed: 4.5ms preprocess, 2125.6ms inference, 6.0ms postprocess per image at shape (1, 3, 544, 640)


[0,
 0.49362244897959184,
 0.5182926829268293,
 0.9872448979591837,
 1.0365853658536586,
 0.45090138182348133,
 0.3075299146698742,
 0.99155056,
 0.47692143187230945,
 0.24344851331013004,
 0.98072803,
 0.3964913815868144,
 0.26409858610571885,
 0.9802058,
 0.5214869440818319,
 0.2500436131547137,
 0.859938,
 0.3270313496492347,
 0.3048902604638076,
 0.88930154,
 0.6835650229940609,
 0.5639311627643865,
 0.98636,
 0.2443479226560009,
 0.6433977731844274,
 0.9712768,
 0.0,
 0.0,
 0.19218221,
 0.0,
 0.0,
 0.108061604,
 0.0,
 0.0,
 0.06869289,
 0.0,
 0.0,
 0.04261578,
 0.0,
 0.0,
 0.0082016215,
 0.0,
 0.0,
 0.006917274,
 0.0,
 0.0,
 0.0019635519,
 0.0,
 0.0,
 0.0016172073,
 0.0,
 0.0,
 0.00062964304,
 0.0,
 0.0,
 0.0005830445]

In [10]:
# Write one label file per RGB frame: "<number>.txt" in labels_destination,
# containing the annotation values separated by single spaces.
os.makedirs(labels_destination, exist_ok=True)

for image in rgb_list:
    path = os.path.join(rgb_source, image)
    number = int(image.split(".")[0])

    annotations = human_annotations(path)
    # Join each value's str() instead of stripping characters from the list's
    # repr: the repr of NumPy scalars is not guaranteed to be a bare number.
    text = " ".join(str(value) for value in annotations)

    # Build the path once so the file written and the path reported agree.
    label_path = os.path.join(labels_destination, f"{number}.txt")
    with open(label_path, "w") as file:
        file.write(text)
    print(f"saved file : {label_path}")


0: 640x416 1 person, 1 potted plant, 1754.1ms
Speed: 3.4ms preprocess, 1754.1ms inference, 23.9ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1626.4ms
Speed: 7.7ms preprocess, 1626.4ms inference, 2.4ms postprocess per image at shape (1, 3, 640, 416)
saved file : /home/hassaan/Downloads/dataset/labels/0.txt

0: 576x640 1 person, 2 handbags, 3 chairs, 2011.6ms
Speed: 5.6ms preprocess, 2011.6ms inference, 2.0ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 2034.1ms
Speed: 7.0ms preprocess, 2034.1ms inference, 2.6ms postprocess per image at shape (1, 3, 576, 640)
saved file : /home/hassaan/Downloads/dataset/labels/1.txt


'0 0.09795673076923077 0.4361111111111111 0.19591346153846154 0.8722222222222222 0.0 0.0 0.42677712 0.0 0.0 0.1960839 0.0 0.0 0.27187887 0.0 0.0 0.4001799 0.07273994500820453 0.21386241912841797 0.6183357 0.03756919961709242 0.28499234517415367 0.91871655 0.1186533707838792 0.2850828594631619 0.9771234 0.0034462629029384027 0.39101732042100695 0.670641 0.1671757147862361 0.3979143778483073 0.9120515 0.0 0.0 0.4546735 0.1345757246017456 0.37044330173068574 0.7553778 0.07511217319048367 0.45458780924479164 0.9609252 0.13105473151573768 0.45579274495442706 0.9788629 0.10324607445643498 0.5775990804036458 0.9355841 0.16154109514676607 0.5598100026448568 0.96521175 0.11838032649113582 0.682875484890408 0.79988086 0.18702829801119292 0.6609213087293837 0.8649284'

51