In [None]:
# Video Understanding Example

This notebook demonstrates the process of extracting frames from a video, processing these frames using a vision model, and generating textual descriptions using GPT.



In [None]:
# Import necessary libraries
import os
import cv2
import torch
from PIL import Image
import clip
import openai
from dotenv import load_dotenv

# Load environment variables (python-dotenv reads them from a local .env file)
# so the OpenAI API key does not have to be hardcoded in the notebook.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is unset, so a missing key is not
# reported by this cell.
openai.api_key = os.getenv("OPENAI_API_KEY")


In [None]:
## Step 1: Extract Frames from the Video


In [None]:
def extract_frames(video_path, output_folder, interval=30):
    """Save every ``interval``-th frame of a video as a JPEG file.

    Frames are written to ``output_folder`` as ``frame_<index>.jpg``, where
    ``<index>`` is the zero-based position of the frame in the video,
    zero-padded to at least three digits.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that receives the JPEG files; created if it
            does not exist yet.
        interval: Keep one frame out of every ``interval`` frames. Must be at
            least 1 (1 keeps every frame).

    Raises:
        ValueError: If ``interval`` is smaller than 1.
        OSError: If the video cannot be opened or a frame cannot be written.
    """
    # Validate before touching the filesystem: interval == 0 would otherwise
    # fail with ZeroDivisionError in the middle of the read loop.
    if interval < 1:
        raise ValueError(f"interval must be a positive integer, got {interval!r}")

    os.makedirs(output_folder, exist_ok=True)

    vidcap = cv2.VideoCapture(video_path)
    try:
        # A missing or unreadable file does not raise in OpenCV; without this
        # check the loop would silently extract nothing.
        if not vidcap.isOpened():
            raise OSError(f"Could not open video file: {video_path}")

        count = 0
        success, image = vidcap.read()
        while success:
            if count % interval == 0:
                frame_path = os.path.join(output_folder, f"frame_{count:03d}.jpg")
                # cv2.imwrite signals failure through its return value.
                if not cv2.imwrite(frame_path, image):
                    raise OSError(f"Could not write frame to: {frame_path}")
            success, image = vidcap.read()
            count += 1
    finally:
        # Always give the capture handle back, even if a write failed.
        vidcap.release()

# Extract frames from the sample video with the default interval (every 30th
# frame). Both paths are relative to the current working directory.
video_path = '../data/raw_videos/sample_video.mp4'
output_folder = '../data/frames'
extract_frames(video_path, output_folder)


In [None]:
## Step 2: Process Frames using a Vision Model


In [None]:
def process_frames(frames_folder, output_folder, image_extensions=(".jpg", ".jpeg", ".png")):
    """Encode each frame image with CLIP (ViT-B/32) and save its features.

    For every image file ``<name>.<ext>`` in ``frames_folder`` the image
    features returned by ``model.encode_image`` are stored with ``torch.save``
    as ``<name>_features.pt`` in ``output_folder``.

    Args:
        frames_folder: Directory containing the frame images.
        output_folder: Directory that receives the ``.pt`` files; created if
            it does not exist yet.
        image_extensions: File-name suffixes (case-insensitive) that identify
            frame images. Other entries, such as sub-directories or stray
            non-image files, are skipped instead of being passed to PIL.

    Raises:
        FileNotFoundError: If ``frames_folder`` does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Accept a single suffix given as a plain string as well as a collection.
    if isinstance(image_extensions, str):
        image_extensions = (image_extensions,)
    suffixes = tuple(ext.lower() for ext in image_extensions)

    # Sorted so frames are processed in a reproducible order.
    frame_names = sorted(
        name
        for name in os.listdir(frames_folder)
        if name.lower().endswith(suffixes)
        and os.path.isfile(os.path.join(frames_folder, name))
    )
    if not frame_names:
        # Nothing to encode: skip the expensive model load entirely.
        return

    # clip.load defaults to CUDA when it is available, so the input tensor has
    # to be moved to the same device or encode_image fails on GPU machines.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)

    for frame_name in frame_names:
        frame_path = os.path.join(frames_folder, frame_name)
        # Context manager closes the underlying file once the pixels have been
        # converted into a tensor.
        with Image.open(frame_path) as image:
            image_input = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image_input)

        features_path = os.path.join(output_folder, f"{os.path.splitext(frame_name)[0]}_features.pt")
        # Store on CPU so the file can be loaded on machines without a GPU.
        torch.save(image_features.cpu(), features_path)

# Process frames: encode the images in the frames directory and store the
# resulting feature files in a separate directory.
frames_folder = '../data/frames'
# NOTE: this rebinds the notebook-level `output_folder`, which the extraction
# cell set to '../data/frames'; from here on it refers to the features path.
output_folder = '../data/processed_features'
process_frames(frames_folder, output_folder)
