In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import time
import numpy as np
import matplotlib.pyplot as plt
import torch
import cv2
from PIL import Image
from ultralytics import YOLO

# ---------------------------------------------------------------------------
# Paths. All are empty strings here, so os.path.join() on them resolves
# relative to the current working directory.
# ---------------------------------------------------------------------------
inpDir = ''    # root folder holding the input data
outDir = ''    # folder that receives generated outputs
subDir = ''    # sub-folder (under inpDir) that holds the images / videos
modelDir = ''
altName = 'yolo'

# ---------------------------------------------------------------------------
# Run constants.
# ---------------------------------------------------------------------------
# Seed for initialization ----- REMEMBER: to remove at the time of promotion to production
RANDOM_STATE = 24
EPOCHS = 100      # number of cycles to run
THRESHOLD = 0.6   # minimum confidence for a detection to be kept

# ---------------------------------------------------------------------------
# Matplotlib look-and-feel.
# ---------------------------------------------------------------------------
CMAP = plt.cm.brg

params = {
    'legend.fontsize': 'large',
    'figure.figsize': (15, 12),
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'large',
    'ytick.labelsize': 'large',
}

# Apply the overrides above to the global rcParams.
plt.rcParams.update(params)

# Helper Functions

# 91-slot COCO category table: index 0 is the background slot and 'N/A'
# marks unused ids. In this file the table is only used to size COLORS;
# the class names drawn on frames come from the model's own result.
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__',
    # people and vehicles
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    # street furniture
    'traffic light', 'fire hydrant', 'N/A', 'stop sign', 'parking meter', 'bench',
    # animals
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    # accessories
    'N/A', 'backpack', 'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase',
    # sports
    'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    # kitchenware
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    # food
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake',
    # furniture
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A', 'toilet',
    # electronics
    'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    # appliances
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    # indoor objects
    'N/A', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

# One random 3-channel color (values in [0, 255)) per table slot, so that
# every class id is always drawn with the same color within a run.
COLORS = np.random.uniform(
    low=0,
    high=255,
    size=(len(COCO_INSTANCE_CATEGORY_NAMES), 3),
)

# Load YOLOv8s model (small variant for better accuracy)
def get_yolo_model(device='cpu'):
    """Build the YOLOv8-small detector and place it on ``device``.

    Args:
        device: Target device string handed to ``.to()`` (default ``'cpu'``).

    Returns:
        The ``YOLO`` object created from the ``'yolov8s.pt'`` weights, after
        ``eval()`` and ``to(device)`` have been called on it.
    """
    weights_file = 'yolov8s.pt'
    detector = YOLO(weights_file)
    # Switch to evaluation mode, then move to the requested device.
    detector.eval().to(device)
    return detector

# YOLOv8 Object Detection Prediction
def predict_yolo(image, model, device, threshold=0.5):
    """Run the detector on one image and keep the confident detections.

    Args:
        image: Input passed unchanged to ``model``.
        model: Callable returning a sequence of results; only the first
            result is used. It must expose ``boxes`` (with ``xywh``, ``conf``
            and ``cls`` tensors) and a ``names`` mapping from class id to name.
        device: Accepted for interface compatibility; not used in this function.
        threshold: Detections with confidence below this value are dropped.

    Returns:
        A 3-tuple ``(boxes, class_names, labels)`` where ``boxes`` is an
        int32 array of ``[x_center, y_center, width, height]`` rows (values
        truncated to integers), ``class_names`` is the list of names for the
        kept detections and ``labels`` is the array of their integer class ids.
    """
    first_result = model(image)[0]  # only the first image's result is used
    detected = first_result.boxes

    # Boolean mask selecting detections at or above the confidence threshold.
    scores = detected.conf.cpu().numpy()
    keep = scores >= threshold

    kept_boxes = detected.xywh.cpu().numpy()[keep].astype(np.int32)
    kept_labels = detected.cls.cpu().numpy().astype(int)[keep]

    # Translate the numeric class ids into their names.
    name_lookup = first_result.names
    kept_names = [name_lookup[int(class_id)] for class_id in kept_labels]

    return kept_boxes, kept_names, kept_labels


# Draw bounding boxes on the image
def draw_boxes_yolo(boxes, classes, labels, image):
    """Draw every detection's rectangle and class name onto ``image``.

    Args:
        boxes: Sequence of ``[x_center, y_center, width, height]`` boxes.
        classes: Class name for each box, index-aligned with ``boxes``.
        labels: Numeric class id for each box; used to pick the row of
            ``COLORS`` to draw with.
        image: Array the OpenCV drawing calls are applied to.

    Returns:
        The same ``image`` object that was passed in.
    """
    # Stroke widths grow with the image size but never drop below 2.
    line_width = max(round(sum(image.shape) / 2 * 0.001), 2)
    text_thickness = max(line_width - 1, 2)

    for idx, box in enumerate(boxes):
        # Per-class color with its channel order reversed.
        draw_color = COLORS[int(labels[idx])][::-1]

        # Convert center/size to integer corner coordinates.
        cx, cy, width, height = box[0], box[1], box[2], box[3]
        left = int(cx - width / 2)
        top = int(cy - height / 2)
        right = int(cx + width / 2)
        bottom = int(cy + height / 2)

        cv2.rectangle(
            img=image,
            pt1=(left, top),
            pt2=(right, bottom),
            color=draw_color,
            thickness=line_width,
        )

        # Class name sits 5 px above the top-left corner of the box.
        cv2.putText(
            img=image,
            text=classes[idx],
            org=(left, top - 5),
            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
            fontScale=line_width / 2,
            color=draw_color,
            thickness=text_thickness,
            lineType=cv2.LINE_AA,
        )

    return image

# Load the video and process it
def process_video_yolo(video_path, model, device, threshold=0.5):
    """Detect objects in every frame of a video, then save and display the result.

    Each frame is resized to 1280x720 for inference, the detections returned
    by ``predict_yolo`` are scaled back to the source resolution, drawn with
    ``draw_boxes_yolo``, stamped with the per-frame FPS, written to
    ``<outDir>/<video stem>_yolo_<threshold without dot>.mp4`` and shown in a
    window titled 'YOLO Video'. Pressing 'q' stops processing early.

    Frames stay in OpenCV's native BGR channel order from decode to encode,
    so the written video and the preview keep their original colors.

    Args:
        video_path: Path of the video file to read.
        model: Detector forwarded to ``predict_yolo``.
        device: Device string forwarded to ``predict_yolo``.
        threshold: Minimum confidence forwarded to ``predict_yolo``.

    Returns:
        None. Prints an error if the video cannot be opened, otherwise prints
        the average FPS (or a notice when no frame could be read).
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print('Error while trying to read video. Please check path again')
        return

    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Keep the source frame rate so playback speed is preserved; fall back to
    # 30 when the container does not report a usable value.
    source_fps = cap.get(cv2.CAP_PROP_FPS)
    output_fps = source_fps if source_fps > 0 else 30

    # splitext/basename keep dots inside the file name and handle the
    # platform's path separator.
    video_stem = os.path.splitext(os.path.basename(video_path))[0]
    threshold_tag = str(threshold).replace('.', '')
    save_name = f"{video_stem}_yolo_{threshold_tag}"
    out = cv2.VideoWriter(os.path.join(outDir, f"{save_name}.mp4"),
                          cv2.VideoWriter_fourcc(*'mp4v'), output_fps,
                          (frame_width, frame_height))
    if not out.isOpened():
        print('Warning: could not open the output video for writing')

    # Inference resolution and the factors that map boxes back to the source.
    infer_width, infer_height = 1280, 720
    scale_x = frame_width / infer_width
    scale_y = frame_height / infer_height
    box_scale = np.array([scale_x, scale_y, scale_x, scale_y])

    frame_count = 0
    total_fps = 0

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Resize the frame for detection (still BGR).
            frame_resized = cv2.resize(frame, (infer_width, infer_height))

            # Start timer (monotonic, high resolution)
            start_time = time.perf_counter()

            # Get YOLO predictions
            boxes, classes, labels = predict_yolo(frame_resized, model, device, threshold)

            # Scale the boxes back to the original frame dimensions
            boxes = boxes * box_scale

            # Draw boxes on the original frame
            frame = draw_boxes_yolo(boxes, classes, labels, frame)

            # Calculate FPS, guarding against a zero-length interval
            elapsed = time.perf_counter() - start_time
            fps = 1 / elapsed if elapsed > 0 else 0.0
            total_fps += fps
            frame_count += 1

            # Add FPS on frame
            cv2.putText(
                img=frame,
                text=f"{fps:.3f} FPS",
                org=(15, 30),
                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=1,
                color=(0, 255, 0),
                thickness=2,
                lineType=cv2.LINE_AA
            )

            # Write processed frame to video output
            out.write(frame)

            # Display the frame
            cv2.imshow('YOLO Video', frame)

            # Exit if 'q' is pressed
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the capture, writer and windows, even on error.
        cap.release()
        out.release()
        cv2.destroyAllWindows()

    # Print average FPS
    if frame_count == 0:
        print('No frames were processed')
        return
    avg_fps = total_fps / frame_count
    print(f"Average FPS: {avg_fps:.3f}")

# Main function to run the video processing
def main():
    """Select the inference device, load the detector and process one video."""
    # Use the GPU when torch reports that CUDA is available.
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    print(f"Using {device} device for inference")

    # Load YOLOv8s model
    detector = get_yolo_model(device)

    # Video file name, resolved under inpDir/subDir.
    video_name = 'VID_20240320164919_F (online-video-cutter.com).mp4'  # Specify your video file path here
    video_file_path = os.path.join(inpDir, subDir, video_name)

    # Process video
    process_video_yolo(video_file_path, detector, device, threshold=THRESHOLD)

# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


Using cuda device for inference

0: 384x640 15 persons, 1 motorcycle, 2 trucks, 61.8ms
Speed: 2.8ms preprocess, 61.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 4 trucks, 19.3ms
Speed: 0.0ms preprocess, 19.3ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 2 trucks, 12.1ms
Speed: 1.2ms preprocess, 12.1ms inference, 5.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 3 trucks, 14.8ms
Speed: 2.2ms preprocess, 14.8ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 2 trucks, 1 umbrella, 7.4ms
Speed: 1.0ms preprocess, 7.4ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 2 trucks, 14.0ms
Speed: 0.0ms preprocess, 14.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 1 car, 3 trucks, 11.1ms
Speed: 2.3ms preprocess, 11.1ms inference, 2.0ms postprocess per ima