# YOLO-World frame inference
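Notebook walkthrough for running `predict_yoloworld.py` on extracted frames, saving the predictions as a submission JSON and the per-video processing time as a timing CSV.
Set the values in the configuration cell, then run all cells from top to bottom.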


In [None]:
import os
import json
import random
import time
import logging
from pathlib import Path

import cv2
import numpy as np
from tqdm import tqdm
from ultralytics import YOLOWorld

from predict_yoloworld import (
    YoloWorldConfig,
    YoloWorldPredictor,
    list_video_dirs,
    process_video,
    setup_logger,
)


## Extract data

In [None]:
# IPython shell escape: runs /code/extract_frame.sh with bash.
# NOTE(review): presumably this produces the frame folders read from
# FRAMES_ROOT in the later cells -- confirm against the script itself.
!bash /code/extract_frame.sh

## Set seed

In [None]:
def seed_everything(seed: int = 42):
    """Seed the global Python and NumPy random number generators.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``; its
            string form is also stored in the ``PYTHONHASHSEED`` environment
            variable.

    NOTE(review): only ``random`` and NumPy are seeded here; no other
    library's generator is touched. Python reads ``PYTHONHASHSEED`` at
    interpreter start-up, so the assignment is expected to matter only for
    processes launched after this call -- confirm if hash determinism in
    this kernel is required.
    """
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)


# Apply the default seed as soon as the cell runs.
seed_everything()


## Configure paths and thresholds

In [None]:
# --- Input / output locations ---------------------------------------------
WEIGHTS = '/code/checkpoint/yoloworld_no_bg_v8s_20eps_bs32_exp2/weights/best.pt'
FRAMES_ROOT = Path('/data/extracted_frames')
OUT_DIR = Path('/result')

# --- Values forwarded to YoloWorldConfig ------------------------------------
IMG_SIZE = 640
CONF = 0.001
IOU = 0.7
FILTER_BOX = 0.015
DEVICE = '0'  # set to 'cpu' to disable GPU
SAVE_VIS = False
USE_TRACKING = False
TRACK_ALPHA = 0.6
TRACK_MAX_AGE = 5
TRACK_CONF_DECAY = 0.90

# The output directory is always created; the visualization sub-directory
# only exists (and VIS_ROOT is only a path) when SAVE_VIS is enabled.
OUT_DIR.mkdir(parents=True, exist_ok=True)
if SAVE_VIS:
    VIS_ROOT = OUT_DIR / 'visualize'
    VIS_ROOT.mkdir(parents=True, exist_ok=True)
else:
    VIS_ROOT = None

logger = setup_logger()
logger.info('weights: %s', WEIGHTS)
logger.info('frames_root: %s', FRAMES_ROOT)

# Collect the keyword arguments first so the full configuration can be read
# (or printed) in one place before the predictor is built.
_predictor_settings = {
    'weights': WEIGHTS,
    'imgsz': IMG_SIZE,
    'conf': CONF,
    'iou': IOU,
    'filter_box': FILTER_BOX,
    'device': DEVICE,
    'save_vis': SAVE_VIS,
    'use_tracking': USE_TRACKING,
    'track_alpha': TRACK_ALPHA,
    'track_max_age': TRACK_MAX_AGE,
    'track_conf_decay': TRACK_CONF_DECAY,
}
cfg = YoloWorldConfig(**_predictor_settings)
predictor = YoloWorldPredictor(cfg, logger)


## Inspect data

In [None]:
# Discover the per-video frame folders under FRAMES_ROOT and report how many
# were found, listing the names of up to the first five as a sanity check.
videos = list_video_dirs(FRAMES_ROOT)
print(f'Found {len(videos)} video folders under {FRAMES_ROOT}')
if videos:
    preview_names = [folder.name for folder in videos[:5]]
    print('Sample IDs:', preview_names)


## Run inference with timing

In [None]:
# submission:          one `process_video` result per video folder, in order.
# all_predicted_time:  (video folder name, elapsed whole milliseconds) pairs,
#                      appended in lockstep with `submission`.
submission = []
all_predicted_time = []

for vdir in tqdm(videos, desc='Processing videos'):
    # Use perf_counter rather than time.time(): it is monotonic and
    # high-resolution, so the measured interval cannot be distorted by
    # wall-clock adjustments, and it avoids scaling large epoch values
    # before subtracting.
    started = time.perf_counter()
    result = process_video(vdir, predictor, VIS_ROOT, logger)
    elapsed_s = time.perf_counter() - started

    predicted_time = int(elapsed_s * 1000)  # truncate to whole milliseconds
    all_predicted_time.append((vdir.name, predicted_time))
    submission.append(result)

print(f'Completed {len(submission)} items')
print('Timing sample:', all_predicted_time[:3])


## Save submission and timing CSV

In [None]:
import csv

# Full prediction results as pretty-printed JSON.
out_json = OUT_DIR / 'jupyter_submission.json'
with out_json.open('w', encoding='utf-8') as f:
    json.dump(submission, f, ensure_ascii=False, indent=2)

# Index results by video id once instead of rescanning `submission` for every
# timing row. `setdefault` keeps the first entry per id, matching a
# first-match linear search.
first_result_by_id = {}
for item in submission:
    first_result_by_id.setdefault(item['video_id'], item)

# Timing CSV with columns id, answer, time (milliseconds).
# csv.writer terminates every row with a newline and quotes the JSON-encoded
# `answer` field, which contains commas and double quotes; writing the fields
# with plain string formatting would put all rows on a single line and break
# the column layout.
time_csv = OUT_DIR / 'time_submission.csv'
with time_csv.open('w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['id', 'answer', 'time'])
    for vid, t_ms in all_predicted_time:
        matched = first_result_by_id.get(vid)
        # Videos without a matching result get an empty detection list.
        detections = matched['detections'] if matched is not None else []
        writer.writerow([vid, json.dumps(detections), t_ms])

print(f'Wrote results → {out_json}')
print(f'Wrote timing → {time_csv}')
if submission:
    # Show only the first 500 characters of the first result as a preview.
    print('Preview:', json.dumps(submission[0], ensure_ascii=False, indent=2)[:500])


## (Optional) Inspect a visualization frame

In [None]:
# Report the first exported visualization image, if visualization is enabled.
if not (SAVE_VIS and VIS_ROOT):
    print('Set SAVE_VIS=True in the config cell to export visualization frames.')
else:
    # Look one directory level below VIS_ROOT for any .jpg file.
    sample = next(VIS_ROOT.glob('*/*.jpg'), None)
    if not sample:
        print('No visualization files written yet.')
    else:
        print(f'Sample visualization: {sample}')
