In [6]:

import torch
import cv2
import numpy as np
import random

from ultralytics import YOLO

from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

from sort.sort import *


In [7]:

# Object detector loaded from the 'yolov8s.pt' weights file.
yolo_detector = YOLO('yolov8s.pt')

# SAM2 weights and model configuration used to build the image predictor.
# NOTE(review): model_cfg is an absolute, machine-specific path while the
# checkpoint path is relative to the working directory.
checkpoint = "./sam2_model/sam2.1_hiera_small.pt"
model_cfg = "C:/Users/Gerardo/Documents/SAM2/sam2_model/sam2.1_hiera_s.yaml"
sam2_model = build_sam2(model_cfg, checkpoint)
sam_segment = SAM2ImagePredictor(sam2_model)

# SORT tracker created with its default arguments.
mot_tracker = Sort()


In [8]:


# ---------------------------------------------------------------------------
# Detect vehicles with YOLO, track them with SORT, segment every tracked box
# with SAM2, and write a side-by-side video:
#     original frame | 20 px grey band | frame with coloured masks and labels
# ---------------------------------------------------------------------------

# Class ids kept from the detector output, and the label drawn for each one.
vehicle_ids = [2, 3, 5, 7]
vehicle_names = {2: "car", 3: "bike", 5: "bus", 7: "truck"}


def _box_iou(box_a, box_b):
    """Return the intersection-over-union of two (x1, y1, x2, y2) boxes."""
    inter_w = min(box_a[2], box_b[2]) - max(box_a[0], box_b[0])
    inter_h = min(box_a[3], box_b[3]) - max(box_a[1], box_b[1])
    if inter_w <= 0 or inter_h <= 0:
        return 0.0
    inter = inter_w * inter_h
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0


def _class_for_track(track_box, detections):
    """Return the class id of the detection overlapping track_box the most.

    detections is a list of (x1, y1, x2, y2, score, class_id) tuples.
    Returns None when no detection overlaps the track box at all.
    """
    best_cid, best_overlap = None, 0.0
    for dx1, dy1, dx2, dy2, _score, cid in detections:
        overlap = _box_iou(track_box, (dx1, dy1, dx2, dy2))
        if overlap > best_overlap:
            best_cid, best_overlap = cid, overlap
    return best_cid


video = cv2.VideoCapture('./video/cars.mp4')
if not video.isOpened():
    video.release()
    raise IOError("Could not open input video './video/cars.mp4'")

fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Specify the codec
fps = video.get(cv2.CAP_PROP_FPS)
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Output is two frames side by side with a 20 px separator between them.
width = 20 + 2 * int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
out = cv2.VideoWriter('./video/cars_processed.mp4', fourcc, fps, (width, height))

colors = {}        # track id -> {"r", "g", "b"} overlay colour
vehicle_type = {}  # track id -> label string from vehicle_names
frame_num = -1
ret = True
alpha = 0.5        # weight of the coloured mask when blended onto the frame

# The separator band is identical for every frame, so build it once.
band = np.full((height, 20, 3), 128, dtype=np.uint8)

try:
    while ret:
        frame_num += 1
        print(frame_num)
        ret, frame = video.read()

        if not ret:
            break

        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_w = frame_rgb.shape[1]

        # Ultralytics documents numpy-array input as BGR (OpenCV convention),
        # so the detector gets the original frame, not the RGB copy.
        vehicles = yolo_detector(frame, device=0)[0]

        # Keep only vehicle detections: (x1, y1, x2, y2, score, class_id).
        detections = []
        for x1, y1, x2, y2, score, class_id in vehicles.boxes.data.tolist():
            class_id = int(class_id)
            if class_id in vehicle_ids:
                detections.append((int(x1), int(y1), int(x2), int(y2), score, class_id))

        # SORT must be called on every frame and needs a (0, 5) array when
        # there are no detections; the reshape guarantees that shape.
        vehicle_boxes = [det[:5] for det in detections]
        track_ids = mot_tracker.update(
            np.asarray(vehicle_boxes, dtype=float).reshape(-1, 5)
        )

        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
            # SAM2 is given the untouched RGB frame before any overlay is drawn.
            sam_segment.set_image(frame_rgb)

            for track_id in track_ids:
                x1, y1, x2, y2, id_num = (int(value) for value in track_id)

                if id_num not in colors:
                    colors[id_num] = {
                        "r": random.randint(0, 255),
                        "g": random.randint(0, 255),
                        "b": random.randint(0, 255),
                    }

                # Tracker boxes are filtered estimates and rarely equal a
                # detection box exactly, so pick the class by best overlap and
                # keep retrying on later frames until a label is found.
                if id_num not in vehicle_type:
                    matched_cid = _class_for_track((x1, y1, x2, y2), detections)
                    if matched_cid in vehicle_names:
                        vehicle_type[id_num] = vehicle_names[matched_cid]

                mask_color = [colors[id_num]["r"], colors[id_num]["g"], colors[id_num]["b"]]

                input_box = np.array([x1, y1, x2, y2])
                mask, _, _ = sam_segment.predict(box=input_box, multimask_output=False)

                # Same pixel test as int(mask[0, i, j]) == 1, done in one shot.
                mask_pixels = mask[0].astype(np.int64) == 1
                if not mask_pixels.any():
                    # Nothing segmented: no overlay and no label for this track.
                    continue

                colored_mask = np.zeros_like(frame_rgb)
                colored_mask[mask_pixels] = mask_color
                frame_rgb = cv2.addWeighted(frame_rgb, 1, colored_mask, alpha, 0)

                # Top-left corner of the mask's bounding box anchors the label.
                mask_rows, mask_cols = np.nonzero(mask_pixels)
                ymin = int(mask_rows.min())
                xmin = int(mask_cols.min())

                # Draw the 110x25 label only if it fits inside this frame
                # (the frame's own width, not the side-by-side output width).
                if (ymin - 25) >= 0 and (xmin + 110) < frame_w:
                    label = f"{id_num} - {vehicle_type.get(id_num, 'vehicle')}"
                    cv2.rectangle(frame_rgb, (xmin, ymin - 25), (xmin + 110, ymin), (255, 255, 255), thickness=-1)
                    cv2.putText(frame_rgb, label, (xmin, ymin - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)

        overlay_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
        final_image = np.hstack((frame, band, overlay_bgr))
        out.write(final_image)
finally:
    # Release the writer and capture even if a frame fails mid-run.
    out.release()
    video.release()

print("Complete")


0

0: 384x640 1 person, 14 cars, 1 bus, 2 trucks, 59.8ms
Speed: 4.3ms preprocess, 59.8ms inference, 81.2ms postprocess per image at shape (1, 3, 384, 640)
Complete
