In [1]:
import cv2
import os   # build in python lib to interact with the operating system

In [2]:
def extract_frames(video_path,output_folder, frame_rate = 1):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    cap = cv2.VideoCapture(video_path) # loads your video into opencv
    
    video_fps = int(cap.get(cv2.CAP_PROP_FPS)) # gets the video frame rate
    print(f"Video FPS: {video_fps}")
    
    frame_interval = int(video_fps / frame_rate) # tells us how often to save a frame
    frame_count = 0  # total number of frames processed
    saved_count = 0  # number of frames actually saved as images
    
    while cap.isOpened(): # checks that the video file is not closed or corrupted
        ret, frame = cap.read() # cap.read() returns two things (ret is a boolean for whether the frame was read successfully or not, frame is the actual image data in numpy array form)
        if not ret:   # if frame is not ready successfuly (frames end)
            break
            
        if frame_count % frame_interval == 0 :  # saves only every nth frame 
            filename = os.path.join(output_folder, f"frame_{saved_count:04d}.jpg") # frames are saved as frame_0001.jpg,frame_0002,jpg and so on
            cv2.imwrite(filename, frame)
            saved_count += 1
            
        frame_count +=1
        
    cap.release()   # closes the video file
    print(f"Total frames saved: {saved_count}")

In [3]:
video_path = "2025-05-27_13-46-16_UTC.mp4"
output_folder = "VIDEO FRAMES"
extract_frames(video_path, output_folder, frame_rate = 1)

Video FPS: 30
Total frames saved: 15


In [4]:
from ultralytics import YOLO
import torch
from torch.serialization import add_safe_globals
from ultralytics.nn.tasks import DetectionModel
import json # JSON (Javascript Object Notation) is a standard format to store structured data
# json module used to save python data to a .json file(dump) and vice versa(load)

add_safe_globals([DetectionModel])

frames_path = "VIDEO FRAMES" # folder where extracted frames are stored
output_json = 'detections.json' # this is a filename string will be created later

model = YOLO("yolov8n.pt")

Creating new Ultralytics Settings v0.0.6 file  
View Ultralytics Settings with 'yolo settings' or at 'C:\Users\hp\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [12]:
from transformers import YolosImageProcessor, YolosForObjectDetection
from PIL import Image
import torch
import os
import json

# Load model and processor
processor = YolosImageProcessor.from_pretrained("valentinafeve/yolos-fashionpedia")
model = YolosForObjectDetection.from_pretrained(
    "valentinafeve/yolos-fashionpedia",
    trust_remote_code=True,
    use_safetensors=True
)

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [13]:
detections = [] # will store all frame wise detections

frame_files = sorted(os.listdir(frames_path)) # sort frames numerically (for ease)

for i, file in enumerate(frame_files): # enumerate gives you the index(0,1,2) and the filename(frame_0001.jpg)
    frame_number = i
    img_path = os.path.join(frames_path, file) # to build the full path of a file (joins the folder and the file)
    
    image = Image.open(img_path)
    inputs = processor(images=image, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**inputs)
        
    target_sizes = torch.tensor([image.size[::-1]])  # (H, W)
    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.5)[0]
    
    
    # model(img_path) runs inference on the image
   # results = model(img_path)[0] # yolov8 returns multiple things out of which the first will be results.boxes(list of all detections with class, box coordinates and confidence)
    
    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
        xmin, ymin, xmax, ymax = box.tolist()
        w = xmax - xmin
        h = ymax - ymin
        
        detections.append({
            'frame' : frame_number,
            'class' : model.config.id2label[label.item()], #class_name
            'bbox' : [int(xmin), int(ymin), int(w), int(h)] , #[x1, y1, w, h]
            'confidence' :  round(score.item(), 3) # conf
        }) 
    
    # each box inside result.boxes contains:
    # box.xyxy -> [x1,y1,x2,y2]
    # box.conf -> confidence
    # box.cls -> class ID
"""for box in results.boxes: # result.boxes is a list of all detected objects in the current frame
        cls_id = int(box.cls[0])  # get class ID
        class_name = model.names[cls_id] # map ID to name using YOLOv8's build in list
        conf = float(box.conf[0]) # confidence score (how sure YOLO is)
        x1, y1, x2, y2 = map(int, box.xyxy[0]) # get top left and bottom right of bounding box 
    
        w = x2 - x1
        h = y2 - y1"""


    
        

"for box in results.boxes: # result.boxes is a list of all detected objects in the current frame\n        cls_id = int(box.cls[0])  # get class ID\n        class_name = model.names[cls_id] # map ID to name using YOLOv8's build in list\n        conf = float(box.conf[0]) # confidence score (how sure YOLO is)\n        x1, y1, x2, y2 = map(int, box.xyxy[0]) # get top left and bottom right of bounding box \n    \n        w = x2 - x1\n        h = y2 - y1"

In [14]:
with open(output_json,'w') as f:  # this is a python context manager
    json.dump(detections, f, indent = 2)
    
# opens the file in write mode 'w' -> (creates file if not exist, overwrites if exists)
# other modes -> 'r' -> read (error if file doesnt exist)
# 'a' -> append(creates if not exist,appends if exists)
# 'x' -> create(error if already exists)
# output_json is taken as f