In [2]:
import os

from ultralytics import YOLO
import cv2


VIDEOS_DIR = os.path.join('.', 'videos')

video_path = os.path.join(VIDEOS_DIR, 'video.mp4')
video_path_out = '{}_out.mp4'.format('./video/')

cap = cv2.VideoCapture('./video/video.mp4')
ret, frame = cap.read()
H, W, _ = frame.shape
out = cv2.VideoWriter(video_path_out, cv2.VideoWriter_fourcc(*'MP4V'), int(cap.get(cv2.CAP_PROP_FPS)), (W, H))



# Load a model
model = YOLO('./runs/segment/train32/weights/best.pt')  # load a custom model

threshold = 0.5

while ret:

    results = model(frame)[0]

    for result in results.boxes.data.tolist():
        x1, y1, x2, y2, score, class_id = result

        if score > threshold:
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 4)
            cv2.putText(frame, results.names[int(class_id)].upper(), (int(x1), int(y1 - 10)),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 255, 0), 3, cv2.LINE_AA)

    out.write(frame)
    ret, frame = cap.read()

cap.release()
out.release()
cv2.destroyAllWindows()


0: 640x352 1 dog, 136.7ms
Speed: 6.0ms preprocess, 136.7ms inference, 320.9ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 1 dog, 15.0ms
Speed: 2.0ms preprocess, 15.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 1 cat, 1 dog, 12.1ms
Speed: 1.0ms preprocess, 12.1ms inference, 2.9ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 1 dog, 7.0ms
Speed: 2.0ms preprocess, 7.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 1 dog, 7.0ms
Speed: 1.3ms preprocess, 7.0ms inference, 2.5ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 1 dog, 9.5ms
Speed: 2.0ms preprocess, 9.5ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 1 dog, 7.0ms
Speed: 1.5ms preprocess, 7.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 1 cat, 10.0ms
Speed: 1.0ms preprocess, 10.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 352)

0: 640x352 1

In [3]:
import cv2
from torchvision.transforms import functional as F

# Load your model
model = YOLO('./runs/segment/train32/weights/best.pt')  # load a custom model

# Open video capture
cap = cv2.VideoCapture('./video/video.mp4')  # Replace with your video file

# Set up video writer
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter('output_combined.avi', fourcc, 20.0, (640, 480))  # Adjust parameters as needed

# Get video frame dimensions
W = int(cap.get(3))
H = int(cap.get(4))

threshold_detection = 0.5
threshold_segmentation = 0.5

while True:
    ret, frame = cap.read()

    if not ret:
        break

    # Object detection
    results = model(frame)

    if results is not None and len(results) > 0:
        print("Results:", results)  # Print the entire results object for inspection
        for result in results.xyxy[0]:  # Assuming 'xyxy' contains bounding box information
            print("Result:", result)  # Print each individual result for inspection
            # Extract detection results
            box = result[:4].tolist()
            score = result[4].item()
            class_id = int(result[5].item())

            x1, y1, x2, y2 = map(int, box)

            if score > threshold_detection:
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 4)
                cv2.putText(frame, model.names[class_id].upper(), (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 255, 0), 3, cv2.LINE_AA)

            # Semantic segmentation
            masks = results.pred[0]['masks']  # Adjust the key based on the actual structure
            if masks is not None:
                for i, mask in enumerate(masks):  # Use enumerate to get both index and value
                    mask_cpu = mask.cpu().numpy() * 255
                    mask_resized = cv2.resize(mask_cpu, (W, H))

                    # Apply threshold for segmentation
                    _, mask_thresholded = cv2.threshold(mask_resized, threshold_segmentation * 255, 255, cv2.THRESH_BINARY)

                    # Apply segmentation mask to the frame
                    segmented_frame = cv2.bitwise_and(frame, frame, mask=mask_thresholded.astype('uint8'))

                    # Save segmented frame (optional)
                    cv2.imwrite(f'./output_segmented_{i}.png', segmented_frame)

        # Write to the output video
        out.write(frame)

# Release resources
cap.release()
out.release()
cv2.destroyAllWindows()



0: 640x352 1 dog, 43.8ms
Speed: 3.0ms preprocess, 43.8ms inference, 2.9ms postprocess per image at shape (1, 3, 640, 352)


Results: [ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: ultralytics.engine.results.Masks object
names: {0: 'cat', 1: 'dog'}
orig_img: array([[[252, 241, 238],
        [252, 241, 238],
        [252, 241, 238],
        ...,
        [245, 237, 225],
        [245, 237, 225],
        [245, 237, 225]],

       [[252, 241, 238],
        [252, 241, 238],
        [252, 241, 238],
        ...,
        [245, 237, 225],
        [245, 237, 225],
        [245, 237, 225]],

       [[252, 241, 238],
        [252, 241, 238],
        [252, 241, 238],
        ...,
        [245, 237, 225],
        [245, 237, 225],
        [245, 237, 225]],

       ...,

       [[ 66, 101, 112],
        [ 66, 101, 112],
        [ 67, 102, 113],
        ...,
        [ 20,  62,  65],
        [ 18,  60,  63],
        [ 16,  58,  61]],

       [[ 55,  90, 101],
        [ 55,  90, 101],
        [ 56,  91, 102],
        ...,
        [ 22,  64,  6

AttributeError: 'list' object has no attribute 'xyxy'