In [1]:
import os
# Permit more than one OpenMP runtime in this process. The notebook sets this
# in its first cell, ahead of the cells that import torch, to avoid a crash.
_omp_flag = "KMP_DUPLICATE_LIB_OK"
os.environ[_omp_flag] = "TRUE"
print("Set KMP_DUPLICATE_LIB_OK =", os.environ[_omp_flag])


Set KMP_DUPLICATE_LIB_OK = TRUE


In [5]:
import os
import torch
from ultralytics import YOLO

# Input: the trained checkpoint. Output: a half-precision state_dict on disk.
FP32_CKPT  = 'outputs/cars_cosine_run2/weights/best.pt'
OUT_DIR    = 'quantized_model'
FP16_STATE = os.path.join(OUT_DIR, 'best_fp16.pth')

# The destination folder has to exist before torch.save writes into it.
os.makedirs(OUT_DIR, exist_ok=True)

print(f"Loading FP32 checkpoint from {FP32_CKPT}...")
model = YOLO(FP32_CKPT)

print("Casting model to FP16...")
# The return value is not reassigned: this relies on .half() converting the
# wrapped network in place.
net = model.model
net.half()

print(f"Saving FP16 state_dict to {FP16_STATE}...")
fp16_weights = net.state_dict()
torch.save(fp16_weights, FP16_STATE)

print("✅ Finished.  You can now load with model.model.load_state_dict(...) and model.model.half().")


Loading FP32 checkpoint from outputs/cars_cosine_run2/weights/best.pt...
Casting model to FP16...
Saving FP16 state_dict to quantized_model\best_fp16.pth...
✅ Finished.  You can now load with model.model.load_state_dict(...) and model.model.half().


In [10]:
import os
import cv2
import time
import psutil
import torch
import numpy as np
from ultralytics import YOLO
import glob

# ─── CONFIG ─────────────────────────────────────────────────────────────────────────────
# Every path is resolved against the current working directory.
BASE_DIR     = os.getcwd()
_outputs_dir = os.path.join(BASE_DIR, 'outputs')

VIDEO_IN     = os.path.join(BASE_DIR, 'dataset', 'test.mp4')          # source clip
VIDEO_OUT    = os.path.join(_outputs_dir, 'detected_cars_fp16.mp4')   # annotated stream
COMPILED_OUT = os.path.join(_outputs_dir, 'compiled_video.mp4')       # video rebuilt from PNGs
FRAMES_DIR   = os.path.join(BASE_DIR, 'FRAMES')                       # per-frame PNG dump
CHECKPT      = os.path.join(_outputs_dir, 'cars_cosine_run2', 'weights', 'best.pt')

# Create every output location up front so later writes cannot fail on a
# missing folder.
for _folder in (os.path.dirname(VIDEO_OUT), os.path.dirname(COMPILED_OUT), FRAMES_DIR):
    os.makedirs(_folder, exist_ok=True)

# ─── 1) LOAD & CAST TO FP16 ─────────────────────────────────────────────────────────────
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Running on {device}")

# Load the checkpoint, move it to the chosen device, cast it to half precision,
# then put the underlying network into eval mode.
yolo = YOLO(CHECKPT).to(device).half()
yolo.model.eval()

# ─── 2) VIDEO I/O ───────────────────────────────────────────────────────────────────────
# Open the source clip and fail early if it cannot be read.
cap = cv2.VideoCapture(VIDEO_IN)
if not cap.isOpened():
    raise FileNotFoundError(f"Cannot open {VIDEO_IN}")

# Fall back to 20 FPS when the capture reports a falsy frame rate (e.g. 0).
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps:
    fps = 20

# Every frame is resized to this fixed size before inference and writing.
w = 640
h = 320

fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
out    = cv2.VideoWriter(VIDEO_OUT, fourcc, fps, (w, h))

# ─── 3) ENTRY/EXIT LINES ─────────────────────────────────────────────────────────────────
# Both counting lines are horizontal segments placed relative to a base row at
# 75% of the frame height; each has its own vertical offset.
line_y = int(h * 0.75)
red_off, blue_off = 20, 0

_red_y  = line_y + red_off
_blue_y = line_y + blue_off
red_line  = [(int(0.55 * w), _red_y),  (int(0.95 * w), _red_y)]    # right part of the frame
blue_line = [(int(0.10 * w), _blue_y), (int(0.50 * w), _blue_y)]   # left part of the frame

# ─── 4) TRACKERS & COUNTERS ──────────────────────────────────────────────────────────────
# Running totals, the next free track id per direction, and the per-direction
# track tables (id -> {'prev', 'curr', 'counted'}).
entry_count = exit_count = 0
next_e_id = next_x_id = 0
e_tracker, x_tracker = {}, {}

# Detection centres from the previous frame, used to pair detections across frames.
prev_centers = []
prev_centers  = []

# ─── 5) MONITORING SETUP ─────────────────────────────────────────────────────────────────
# Handle to this process for host-side memory readings.
proc        = psutil.Process(os.getpid())
# The notebook falls back to CPU when CUDA is unavailable; the CUDA memory
# statistics API fails in that case, so only reset the peak counters when a
# GPU is actually present.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
# Wall-clock start and frame counter used by the end-of-run throughput summary.
start_time  = time.time()
frame_idx   = 0

# ─── 6) MAIN LOOP ─────────────────────────────────────────────────────────────────────────
MATCH_RADIUS_PX = 50    # max centre-to-centre distance (pixels) to treat two points as the same car
CONF_THRESHOLD  = 0.3   # minimum detection confidence passed to the model


def _moved_across(prev_y, curr_y, line_row, downward):
    """Return True if a centre crossed the horizontal row `line_row`.

    downward=True  -> prev_y <  line_row <= curr_y  (y increasing)
    downward=False -> prev_y >  line_row >= curr_y  (y decreasing)
    """
    if downward:
        return prev_y < line_row <= curr_y
    return prev_y > line_row >= curr_y


def _update_line_counter(dets, tracker, next_id, line_row, downward):
    """Update one direction's tracks from this frame's detections and count crossings.

    dets     : list of {'box', 'curr', 'prev'} dicts built for the current frame.
    tracker  : dict id -> {'prev', 'curr', 'counted'}; mutated in place.
    next_id  : next unused track id for this tracker.
    line_row : y coordinate of the counting line.
    downward : crossing direction that counts (see _moved_across).

    Returns (next_id, crossings): the possibly advanced id counter and the
    number of tracks that were counted for the first time in this frame.

    NOTE(review): tracks are never removed, so a stale track can absorb a later
    car that appears within MATCH_RADIUS_PX of its last position.
    """
    crossings = 0
    for d in dets:
        cx, cy = d['curr']
        pc = d['prev']
        # First existing track whose last position is close to this detection.
        tid = next(
            (i for i, td in tracker.items()
             if np.hypot(cx - td['curr'][0], cy - td['curr'][1]) < MATCH_RADIUS_PX),
            None
        )
        if tid is None and pc and _moved_across(pc[1], cy, line_row, downward):
            # No matching track: open one only at the moment the line is crossed.
            tid = next_id
            next_id += 1
            tracker[tid] = {'prev': pc, 'curr': (cx, cy), 'counted': False}
        elif tid is not None:
            tracker[tid].update(prev=tracker[tid]['curr'], curr=(cx, cy))
        if tid is not None:
            td = tracker[tid]
            if not td['counted'] and _moved_across(td['prev'][1], td['curr'][1], line_row, downward):
                crossings += 1
                td['counted'] = True
    return next_id, crossings


# try/finally guarantees the capture and writer are released even if inference
# or drawing raises part-way through the video.
try:
    while True:
        ret, frame0 = cap.read()
        if not ret:
            break

        frame = cv2.resize(frame0, (w, h))

        # inference (prints per-frame timing)
        # Ultralytics' predictor casts the weights according to its own `half`
        # argument (off by default), so FP16 inference must be requested here;
        # it is only enabled on CUDA.
        # NOTE(review): confirm against the installed Ultralytics version.
        results = yolo(frame, conf=CONF_THRESHOLD, device=device,
                       half=(device == 'cuda'))[0]

        # Keep class-0 boxes only and pair each with a nearby centre from the
        # previous frame (None when nothing is within MATCH_RADIUS_PX).
        dets = []
        for box in results.boxes:
            if int(box.cls) != 0:
                continue
            x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
            cx, cy        = (x1 + x2)//2, (y1 + y2)//2
            prev          = next(
                (pc for pc in prev_centers
                 if np.hypot(cx-pc[0], cy-pc[1]) < MATCH_RADIUS_PX),
                None
            )
            dets.append({'box':(x1,y1,x2,y2),'curr':(cx,cy),'prev':prev})

        # entry logic: centre moves down across the red line
        next_e_id, crossed = _update_line_counter(
            dets, e_tracker, next_e_id, red_line[0][1], downward=True)
        entry_count += crossed

        # exit logic: centre moves up across the blue line
        next_x_id, crossed = _update_line_counter(
            dets, x_tracker, next_x_id, blue_line[0][1], downward=False)
        exit_count += crossed

        # draw
        for d in dets:
            x1,y1,x2,y2 = d['box']
            cv2.rectangle(frame, (x1,y1), (x2,y2), (0,255,0), 1)
        cv2.line(frame, *red_line,  (0,0,255), 2)
        cv2.line(frame, *blue_line, (255,0,0), 2)
        cv2.putText(frame, f"Cars: {len(dets)}",         (10,30),
                    cv2.FONT_HERSHEY_SIMPLEX,0.7,(255,255,255),2)
        cv2.putText(frame, f"Entry(red): {entry_count}", (10,60),
                    cv2.FONT_HERSHEY_SIMPLEX,0.7,(0,0,255),2)
        cv2.putText(frame, f"Exit(blue): {exit_count}",  (10,90),
                    cv2.FONT_HERSHEY_SIMPLEX,0.7,(255,0,0),2)

        # save frame & write to output
        cv2.imwrite(os.path.join(FRAMES_DIR, f'frame_{frame_idx:05d}.png'), frame)
        out.write(frame)

        prev_centers = [d['curr'] for d in dets]
        frame_idx   += 1
finally:
    # ─── 7) CLEANUP & SUMMARY ───────────────────────────────────────────────────────────
    cap.release()
    out.release()

total_time = time.time() - start_time
# Guard the division: an empty video with a coarse clock can yield 0 elapsed seconds.
avg_fps = frame_idx / total_time if total_time > 0 else 0.0
print(f"✅ Done: {frame_idx} frames in {total_time:.1f}s → {avg_fps:.1f} FPS")
print(f"Final counts → Entry: {entry_count}, Exit: {exit_count}")
# Report the measurements the monitoring section prepared (process handle and
# reset CUDA peak counters).
print(f"Host RSS: {proc.memory_info().rss / 2**20:.0f} MiB")
if device == 'cuda':
    print(f"Peak GPU memory allocated: {torch.cuda.max_memory_allocated() / 2**20:.0f} MiB")

# ─── 8) COMPILE FRAMES INTO VIDEO ─────────────────────────────────────────────────────────
# Use only the frames written by this run (indices 0 .. frame_idx-1, same
# naming as the main loop). Globbing the whole folder would also pick up
# leftover PNGs from earlier runs and append them to the video.
frame_files = [
    os.path.join(FRAMES_DIR, f'frame_{i:05d}.png') for i in range(frame_idx)
]
fourcc_cmp  = cv2.VideoWriter_fourcc(*'mp4v')
writer_cmp  = cv2.VideoWriter(COMPILED_OUT, fourcc_cmp, fps, (w, h))
skipped = 0
try:
    for f in frame_files:
        img = cv2.imread(f)
        # imread signals a missing or unreadable file by returning None;
        # passing that to write() would raise, so skip and report instead.
        if img is None:
            skipped += 1
            continue
        writer_cmp.write(img)
finally:
    # Always finalise the container, even if a write fails.
    writer_cmp.release()
if skipped:
    print(f"⚠️ Skipped {skipped} unreadable frame(s) while compiling")
print(f"🎥 Compiled video saved to: {COMPILED_OUT}")


Running on cuda

0: 320x640 1 item, 11.5ms
Speed: 2.1ms preprocess, 11.5ms inference, 1.2ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 item, 10.9ms
Speed: 1.2ms preprocess, 10.9ms inference, 2.0ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 (no detections), 5.1ms
Speed: 1.1ms preprocess, 5.1ms inference, 0.5ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 (no detections), 10.6ms
Speed: 1.1ms preprocess, 10.6ms inference, 0.7ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 item, 4.9ms
Speed: 1.3ms preprocess, 4.9ms inference, 1.1ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 item, 5.5ms
Speed: 1.1ms preprocess, 5.5ms inference, 1.2ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 item, 6.8ms
Speed: 1.1ms preprocess, 6.8ms inference, 5.6ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 (no detections), 11.0ms
Speed: 1.4ms preprocess, 11.0ms inference, 1.1ms postprocess per image