In [4]:
import os
import cv2
import torch
from PIL import Image
import numpy as np
from torchvision import transforms
from transformers import CLIPProcessor, CLIPModel

# ---- Config ----
VIDEO_PATH = "vid1.mov"
FRAME_SAVE_PATH = "frames/"
FRAME_INTERVAL = 60  # every N frames
MODEL_NAME = "openai/clip-vit-base-patch32"

# ---- Create frame directory ----
os.makedirs(FRAME_SAVE_PATH, exist_ok=True)

# ---- Load CLIP model ----
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained(MODEL_NAME).to(device)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)

# ---- Step 1: Extract Frames ----
def extract_frames(video_path, interval, output_dir):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    saved_count = 0
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % interval == 0:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image_pil = Image.fromarray(frame_rgb)
            image_path = os.path.join(output_dir, f"frame_{saved_count:04d}.png")
            image_pil.save(image_path)
            saved_count += 1

        frame_count += 1
    cap.release()

# ---- Step 2: Get CLIP Embeddings ----
def get_clip_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)
    return embedding.squeeze().cpu().numpy()

# ---- Run Pipeline ----
print("Extracting frames...")
extract_frames(VIDEO_PATH, FRAME_INTERVAL, FRAME_SAVE_PATH)

print("Computing embeddings...")
embeddings = []
image_files = sorted([f for f in os.listdir(FRAME_SAVE_PATH) if f.endswith(".png")])

for filename in image_files:
    path = os.path.join(FRAME_SAVE_PATH, filename)
    emb = get_clip_embedding(path)
    embeddings.append((filename, emb))

print(f"Computed embeddings for {len(embeddings)} frames")

# # Optional: Save embeddings to file
# np.save("clip_embeddings.npy", embeddings)
# print("Saved embeddings to clip_embeddings.npy")


Extracting frames...
Computing embeddings...
Computed embeddings for 9 frames


In [8]:
import os
import cv2
import torch
from PIL import Image
import numpy as np
from torchvision import transforms
from transformers import CLIPProcessor, CLIPModel
import faiss

# ---- Config ----
VIDEO_PATH = "sample_outdoor_video.mp4"
FRAME_SAVE_PATH = "frames/"
REFERENCE_DATA_PATH = "reference_images/"
EMBEDDING_SAVE_PATH = "clip_embeddings.npy"
INDEX_SAVE_PATH = "faiss_index.bin"
FRAME_INTERVAL = 60  # every N frames
MODEL_NAME = "openai/clip-vit-base-patch32"

# ---- Create directories ----
os.makedirs(FRAME_SAVE_PATH, exist_ok=True)
os.makedirs(REFERENCE_DATA_PATH, exist_ok=True)

# ---- Load CLIP model ----
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained(MODEL_NAME).to(device)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)

# ---- Step 1: Extract Frames ----
def extract_frames(video_path, interval, output_dir):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    saved_count = 0
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % interval == 0:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image_pil = Image.fromarray(frame_rgb)
            image_path = os.path.join(output_dir, f"frame_{saved_count:04d}.png")
            image_pil.save(image_path)
            saved_count += 1

        frame_count += 1
    cap.release()

# ---- Step 2: Get CLIP Embeddings ----
def get_clip_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)
    return embedding.squeeze().cpu().numpy()


# ---- Step 3: Build FAISS Index from Reference Images ----
def build_faiss_index(reference_dir):
    reference_embeddings = []
    filenames = []
    for file in sorted(os.listdir(reference_dir)):
        if file.endswith(".png") or file.endswith(".jpg"):
            filepath = os.path.join(reference_dir, file)
            emb = get_clip_embedding(filepath)
            reference_embeddings.append(emb)
            filenames.append(file)

    reference_embeddings = np.vstack(reference_embeddings).astype("float32")
    index = faiss.IndexFlatL2(reference_embeddings.shape[1])
    index.add(reference_embeddings)
    faiss.write_index(index, INDEX_SAVE_PATH)
    np.save("reference_filenames.npy", filenames)
    print("FAISS index built and saved.")
    return index, filenames

# ---- Step 4: Query Index with Video Frame Embeddings ----
def query_index(index, query_embeddings, filenames, k=3):
    results = []
    for frame_name, emb in query_embeddings:
        emb = emb.astype("float32").reshape(1, -1)
        D, I = index.search(emb, k)
        matched = [(filenames[i], float(D[0][j])) for j, i in enumerate(I[0])]
        results.append((frame_name, matched))
    return results

# ---- Run Pipeline ----
print("Extracting frames...")
extract_frames(VIDEO_PATH, FRAME_INTERVAL, FRAME_SAVE_PATH)

print("Computing embeddings for video frames...")
embeddings = []
image_files = sorted([f for f in os.listdir(FRAME_SAVE_PATH) if f.endswith(".png")])


for filename in image_files:
    path = os.path.join(FRAME_SAVE_PATH, filename)
    emb = get_clip_embedding(path)
    embeddings.append(emb)

final_embeddings_np = np.stack(embeddings)

np.save(EMBEDDING_SAVE_PATH, final_embeddings_np)
print(f"Saved embeddings to {EMBEDDING_SAVE_PATH}")

print("Building FAISS index from reference images...")
index, ref_filenames = build_faiss_index(REFERENCE_DATA_PATH)

print("Querying index with video frame embeddings...")
results = query_index(index, final_embeddings_np, ref_filenames)

for frame, matches in results:
    print(f"\nFrame: {frame}")
    for fname, dist in matches:
        print(f"  Match: {fname} | Distance: {dist:.2f}")


Extracting frames...
Computing embeddings for video frames...
Saved embeddings to clip_embeddings.npy
Building FAISS index from reference images...


ValueError: need at least one array to concatenate

In [1]:
import os
import cv2
import json
import numpy as np

from modules.driving_side import analyze_driving_side

# Config
VIDEO_PATH = "vid1.mov"
FRAME_SAVE_PATH = "frames/"
FRAME_INTERVAL = 30

# Prepare frame directory
os.makedirs(FRAME_SAVE_PATH, exist_ok=True)

# Step 1: Extract Frames

def extract_frames(video_path, interval, output_dir):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    saved_count = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % interval == 0:
            path = os.path.join(output_dir, f"frame_{saved_count:04d}.png")
            cv2.imwrite(path, frame)
            saved_count += 1

        frame_count += 1
    cap.release()

print("Extracting frames...")
extract_frames(VIDEO_PATH, FRAME_INTERVAL, FRAME_SAVE_PATH)

Extracting frames...


In [1]:
%load_ext autoreload
%autoreload 2 
from modules.language_ocr import detect_languages

FRAME_SAVE_PATH = "frames/"
# Step 2: Run All Modules
print("Running language OCR module...")
language_results = detect_languages(FRAME_SAVE_PATH)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Running language OCR module...


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


EasyOCR reader for Traditional Chinese initialized with GPU.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


EasyOCR reader for Simplified Chinese initialized with GPU.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


EasyOCR reader for Japanese initialized with GPU.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


EasyOCR reader for Korean initialized with GPU.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


EasyOCR reader for Russian initialized with GPU.


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


EasyOCR reader for Arabic initialized with GPU.


Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% CompleteEasyOCR reader for fr, de, en initialized with GPU.

Processing file: frames/frame_0000.png
  No text detected in frame_0000.png by any reader.

Processing file: frames/frame_0001.png
  No text detected in frame_0001.png by any reader.

Processing file: frames/frame_0002.png
  No text detected in frame_0002.png by any reader.

Processing file: frames/frame_0003.png
  No text detected in frame_0003.png by any reader.

Processing file: frames/frame_0004.png
  No text detected in frame_0004.png by any reader.

Processing file: frames/frame_0005.png


[W524 15:44:15.048569221 NNPACK.cpp:57] Could not initialize NNPACK! Reason: Unsupported hardware.


: 

: 

In [None]:
print("Running driving side analysis module...")
driving_result = analyze_driving_side(VIDEO_PATH)

In [None]:
# Step 3: Inference Logic
results = {
    "language": language_results,
    "driving_side": driving_result,
    "notes": []
}

if "Traditional Chinese" in language_results:
    results["notes"].append("Detected Traditional Chinese → Possibly Hong Kong or Taiwan")

if driving_result == "right":
    results["notes"].append("Right side driving → Eliminate UK, India, Japan")

# Print Results
print(json.dumps(results, indent=2))