In [None]:
# One-cell pipeline: Run LLaVA-NeXT-Video on top 50 NLQ clips and output model answers

!pip install --upgrade -q accelerate bitsandbytes
!pip install -q av
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
import json, os, zipfile
import torch
import numpy as np
import av
from google.colab import drive
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor

In [None]:
# Attach Google Drive to the Colab filesystem at the standard mount point.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

In [None]:
# All inputs and outputs live under one project folder on the mounted Drive.
project_root = "/content/drive/MyDrive/Colab Notebooks/Egocentric Vision/Video_Query_to_Answer"

# Labeled queries to run through the model.
json_filename = f"{project_root}/inputs/labeled_top_50_queries.json"
# Folder holding the per-query video clips.
clip_folder = f"{project_root}/inputs/re_clips"
# Destination file for the collected model answers.
output_json = f"{project_root}/outputs/LLaVA_NEXT_labeled_top_50_results.json"

In [None]:
# === LOAD MODEL ===
# One checkpoint id shared by the processor and the model so they stay in sync.
model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"

# Load the weights in 4-bit with float16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

processor = LlavaNextVideoProcessor.from_pretrained(model_id)
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quant_config,
)

In [None]:
# === VIDEO READER ===
def read_video_pyav(container, indices):
    """Decode the requested frames from an opened PyAV container.

    Args:
        container: Opened container object. It is rewound with ``seek(0)`` and
            its first video stream (``video=0``) is decoded from the start.
        indices: Non-empty sequence of frame indices in ascending order. The
            first and last entries bound the decode window.

    Returns:
        np.ndarray: The selected frames, each converted with
        ``to_ndarray(format="rgb24")`` and stacked along a new first axis in
        decode order. An index that appears more than once yields one frame.

    Raises:
        ValueError: If ``indices`` is empty, or if none of the requested
            frames could be decoded (e.g. the indices lie past the end of the
            stream).
    """
    if len(indices) == 0:
        raise ValueError("indices must contain at least one frame index")

    # Set lookup is O(1) per decoded frame; plain ints avoid numpy-scalar quirks.
    wanted = {int(idx) for idx in indices}
    first, last = int(indices[0]), int(indices[-1])

    container.seek(0)
    frames = []
    for i, frame in enumerate(container.decode(video=0)):
        if i > last:
            break
        if i >= first and i in wanted:
            frames.append(frame.to_ndarray(format="rgb24"))
        if i == last:
            # Stop right away instead of decoding one more frame just to exit.
            break

    if not frames:
        # np.stack([]) would raise an opaque "need at least one array" error.
        raise ValueError(
            f"No frames could be decoded for the requested indices "
            f"(first={first}, last={last})"
        )
    return np.stack(frames)

In [None]:
# === INFERENCE FUNCTION ===
def get_model_answer(video_path, question, num_frames=8, max_new_tokens=100):
    """Ask the loaded model a question about a video clip.

    Uses the notebook-level ``processor`` and ``model`` objects and
    ``read_video_pyav`` to sample frames from the clip.

    Args:
        video_path: Path of the video file to open with PyAV.
        question: Text of the question placed in the user turn.
        num_frames: Number of evenly spaced frame indices to sample (default 8).
        max_new_tokens: Generation length limit passed to ``model.generate``
            (default 100).

    Returns:
        str: The first decoded sequence from ``processor.batch_decode`` with
        special tokens skipped.

    Raises:
        ValueError: If the video stream reports no frames, so no sampling
            indices can be built.
    """
    # The context manager closes the container even when decoding fails;
    # without it every processed clip leaves an open file handle behind.
    with av.open(video_path) as container:
        total_frames = container.streams.video[0].frames
        if total_frames <= 0:
            # linspace(0, -1, ...) would produce indices ending in -1 and the
            # reader would fail with an unrelated-looking stacking error.
            raise ValueError(
                f"Video stream in {video_path} reports {total_frames} frames; cannot sample"
            )
        indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        clip = read_video_pyav(container, indices)

    conversation = [{"role": "user", "content": [{"type": "text", "text": question}, {"type": "video"}]}]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor([prompt], videos=[clip], padding=True, return_tensors="pt").to(model.device)

    # NOTE(review): do_sample=True means answers can differ between runs unless
    # a seed is set beforehand — confirm this is intended for an evaluation.
    output = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, top_p=0.9)

    # NOTE(review): the full sequence returned by generate() is decoded, which
    # presumably includes the prompt text as well as the answer — verify
    # against what the downstream consumers of "model_answer" expect.
    return processor.batch_decode(output, skip_special_tokens=True)[0]

In [None]:
# === PROCESS ALL CLIPS ===
# Load the labeled queries here so this cell works on a fresh kernel; nothing
# earlier in the notebook defines `nlq_entries`.
# NOTE(review): assumes the file is a JSON list of dicts carrying the keys
# read below ("clip_uid", "query_idx", "question", "answer") — confirm.
with open(json_filename) as f:
    nlq_entries = json.load(f)

results = []
for entry in nlq_entries:
    clip_uid = entry["clip_uid"]
    query_idx = entry["query_idx"]
    question = entry["question"]
    gt_answer = entry["answer"]
    # `clip_folder` is an absolute path, so it is joined directly. The former
    # leading `extract_dir` component was never defined in this notebook and
    # os.path.join discards anything before an absolute component anyway.
    video_path = os.path.join(clip_folder, f"{clip_uid}_{query_idx}.mp4")

    try:
        model_answer = get_model_answer(video_path, question)
        results.append({
            "clip_uid": clip_uid,
            "query_idx": query_idx,
            "question": question,
            "gt_answer": gt_answer,
            "model_answer": model_answer
        })
        print(f"✅ {clip_uid}_{query_idx}")
    except Exception as e:
        # Best-effort run: report the failing clip and continue with the rest.
        print(f"❌ Error on {clip_uid}_{query_idx}: {e}")

In [None]:
# === SAVE OUTPUT ===
# Create the destination folder first: open(..., "w") does not create missing
# directories, and failing here would throw away every answer collected above.
output_dir = os.path.dirname(output_json)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(output_json, "w") as f:
    json.dump(results, f, indent=2)

print(f"🎉 Done. Results saved to: {output_json}")