In [1]:
pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.59-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.13-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.59-py3-none-any.whl (906 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.8/906.8 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.13-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.59 ultralytics-thop-2.0.13


In [2]:
import cv2
import numpy as np
import json
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [3]:
# Load the 'yolov8n.pt' weights once; detect_objects and detect_sub_objects
# both call this shared module-level `model`.
model = YOLO('yolov8n.pt')

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 187MB/s]


In [4]:
# Running id counters shared by every processed frame: detect_objects advances
# the first one, detect_sub_objects the second. Both start at 1.
object_id_counter = subobject_id_counter = 1

In [5]:
def detect_objects(frame):
    """Run the shared model on one frame and describe every detection.

    Each row of the first result's box data is unpacked into four corner
    coordinates, a confidence and a class index. The class index is mapped to
    a name through ``model.names``; detections named "person" or "car" are
    additionally handed to ``detect_sub_objects`` together with their box.

    Args:
        frame: Image passed unchanged to ``model`` and ``detect_sub_objects``.

    Returns:
        A list with one dict per detection containing "object" (class name),
        "id" (value of the global ``object_id_counter``, which is advanced by
        one per detection), "bbox" (``[x1, y1, x2, y2]`` truncated to ints)
        and "subobject" (the sub-detection list, or ``None`` when empty).
    """
    global object_id_counter

    classes_with_parts = ("person", "car")
    rows = model(frame)[0].boxes.data.cpu().numpy()
    described = []

    for row in rows:
        left, top, right, bottom, _score, label_idx = row
        label = model.names[int(label_idx)]
        bbox = [int(left), int(top), int(right), int(bottom)]

        # Only the selected parent classes get a second detection pass.
        parts = detect_sub_objects(frame, list(bbox)) if label in classes_with_parts else []

        described.append({
            "object": label,
            "id": object_id_counter,
            "bbox": bbox,
            "subobject": parts or None,
        })
        object_id_counter += 1

    return described

In [6]:
def detect_sub_objects(frame, parent_bbox):
    """Detect objects inside a parent bounding box of ``frame``.

    The parent box is clamped to the frame, the corresponding region is
    cropped and passed to the shared ``model``, and each resulting box is
    shifted by the crop origin so it is expressed in full-frame coordinates.

    Args:
        frame: Image array indexed as ``frame[rows, cols]``.
        parent_bbox: ``[x1, y1, x2, y2]`` integer corners of the parent box.

    Returns:
        A list of dicts with "object" (class name from ``model.names``),
        "id" (value of the global ``subobject_id_counter``, advanced by one
        per sub-detection) and "bbox" (``[x1, y1, x2, y2]`` ints in frame
        coordinates). Returns ``[]`` without running the model when the
        clamped box has no area.
    """
    global subobject_id_counter

    frame_height, frame_width = frame.shape[:2]
    x1, y1, x2, y2 = parent_bbox

    # Clamp to the frame: negative slice bounds would wrap around in numpy
    # and silently crop the wrong region.
    x1 = min(max(x1, 0), frame_width)
    x2 = min(max(x2, 0), frame_width)
    y1 = min(max(y1, 0), frame_height)
    y2 = min(max(y2, 0), frame_height)

    # A degenerate box yields an empty crop, which must not reach the model.
    if x2 <= x1 or y2 <= y1:
        return []

    cropped_frame = frame[y1:y2, x1:x2]
    sub_results = model(cropped_frame)
    sub_detections = sub_results[0].boxes.data.cpu().numpy()

    sub_object_list = []

    for sub_detection in sub_detections:
        sx1, sy1, sx2, sy2, s_confidence, s_class_id = sub_detection
        sub_class_name = model.names[int(s_class_id)]

        sub_object_data = {
            "object": sub_class_name,
            "id": subobject_id_counter,
            # Offsets use the clamped origin, i.e. the actual crop corner.
            "bbox": [int(sx1 + x1), int(sy1 + y1), int(sx2 + x1), int(sy2 + y1)],
        }
        sub_object_list.append(sub_object_data)
        subobject_id_counter += 1

    return sub_object_list

In [7]:
def _draw_labeled_box(frame, label, bbox, color):
    """Draw the rectangle ``bbox`` on ``frame`` with ``label`` 10 px above its top-left corner."""
    x1, y1, x2, y2 = bbox
    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
    cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)


def process_video(video_path, output_json_path, output_video_path, default_fps=30.0):
    """Detect objects in every frame of a video, then save annotations and an annotated copy.

    Every frame is passed to ``detect_objects``; the returned boxes (and any
    sub-object boxes) are drawn on the frame, which is appended to the output
    video. All detections from all frames are collected into one flat list
    and dumped as JSON once the video has been read to the end.

    Args:
        video_path: Path of the input video.
        output_json_path: Path of the JSON file to write.
        output_video_path: Path of the annotated video to write ('mp4v' fourcc).
        default_fps: Frame rate used for the output when the input does not
            report a positive one.

    Returns:
        None. Prints an error and returns early when the input cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Unable to open video file {video_path}")
        return

    out = None
    video_ok = False
    output_data = []

    try:
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes
        # the playback speed of the output. Fall back when it is unknown (0/NaN).
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = default_fps

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
        video_ok = out.isOpened()
        if not video_ok:
            # Best effort: detections are still collected and saved as JSON.
            print(f"Warning: Unable to open video writer for {output_video_path}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            detections = detect_objects(frame)
            output_data.extend(detections)

            for obj in detections:
                _draw_labeled_box(frame, obj["object"], obj["bbox"], (0, 255, 0))

                if obj["subobject"]:
                    for sub_obj in obj["subobject"]:
                        _draw_labeled_box(frame, sub_obj["object"], sub_obj["bbox"], (255, 0, 0))

            if video_ok:
                out.write(frame)
    finally:
        # Release the handles even if detection raises part-way through.
        # No HighGUI window is opened here, so there is nothing to destroy.
        cap.release()
        if out is not None:
            out.release()

    with open(output_json_path, 'w') as json_file:
        json.dump(output_data, json_file, indent=4)

    if video_ok:
        print(f"Output saved to {output_json_path} and {output_video_path}")
    else:
        print(f"Output saved to {output_json_path}; no video was written")

In [8]:
if __name__ == "__main__":
    # Input clip plus the two artefacts produced by process_video.
    video_path, output_json_path, output_video_path = (
        "/content/newtest.mp4",
        "output.json",
        "output_video.mp4",
    )
    process_video(
        video_path=video_path,
        output_json_path=output_json_path,
        output_video_path=output_video_path,
    )


0: 384x640 5 persons, 1 chair, 1 potted plant, 1 tv, 1 clock, 336.1ms
Speed: 11.9ms preprocess, 336.1ms inference, 43.7ms postprocess per image at shape (1, 3, 384, 640)

0: 640x320 1 person, 151.5ms
Speed: 11.7ms preprocess, 151.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 320)

0: 640x480 2 persons, 221.6ms
Speed: 4.1ms preprocess, 221.6ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 480)

0: 640x352 1 person, 240.7ms
Speed: 3.0ms preprocess, 240.7ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 352)

0: 640x256 2 persons, 179.2ms
Speed: 1.8ms preprocess, 179.2ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 256)

0: 640x480 (no detections), 296.0ms
Speed: 3.0ms preprocess, 296.0ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 480)

0: 384x640 4 persons, 1 chair, 1 potted plant, 1 tv, 1 clock, 244.7ms
Speed: 5.2ms preprocess, 244.7ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 640x32