# Minimal Cataract Video Q&A 
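This notebook loads a base vision-language model and asks it a single question about a cataract surgery video clip. ipywidgets/tqdm progress bars are disabled so that it also runs under `nbconvert` without a running frontend.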

In [1]:

# Headless-friendly setup: switch off progress bars and noisy logging before
# any Hugging Face library gets imported further down.
import os

os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "TQDM_DISABLE": "1",
        "TRANSFORMERS_VERBOSITY": "error",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

try:
    # Best effort: also disable the bars through the hub's own helper.
    from huggingface_hub.utils import disable_progress_bars
    disable_progress_bars()
except Exception:
    # Deliberately ignored (e.g. huggingface_hub is not importable); the
    # environment variables set above remain in place either way.
    pass

print("Progress bars disabled for headless execution.")


Progress bars disabled for headless execution.


## ✅ Extra — Set paths

In [2]:

import os, pathlib
from IPython.display import Video

# Both settings can be overridden through environment variables
# (VIDEO_PATH, BASE_MODEL_ID); otherwise the defaults below are used.
home = os.getenv("HOME", "~")
default_video = f"{home}/surgery-sft/datasets/cataract1k/videos/test/case_4687_Capsulorhexis_52.50_57.50.mp4"

# Expand "~" in the value that is actually stored, not only inside the
# existence check: later cells pass video_path on as-is, so a path that still
# starts with "~" (HOME unset, or a VIDEO_PATH override) would pass the check
# here but not be usable further down.
video_path = str(pathlib.Path(os.environ.get("VIDEO_PATH", default_video)).expanduser())
base_model_id = os.environ.get("BASE_MODEL_ID", "Qwen/Qwen2.5-VL-3B-Instruct")

print("Using video:   ", video_path)
print("Base model id: ", base_model_id)
# Fail early, before the expensive model load, if the clip is missing.
assert pathlib.Path(video_path).exists(), f"Video not found: {video_path}"


Using video:    /dss/dsshome1/03/ra58vim2/surgery-sft/datasets/cataract1k/videos/test/case_4687_Capsulorhexis_52.50_57.50.mp4
Base model id:  Qwen/Qwen2.5-VL-3B-Instruct


In [3]:
# Preview the clip inline; embed=True asks IPython to embed the video data in
# the cell output rather than referencing the file.
from IPython.display import Video

Video(video_path, embed=True)

## ✅ Extra — Type your question

In [4]:
# The question put to the model about the clip; edit this string to ask
# something else.
question = "Which phase of the surgery are we currently at?"
print(f"Question: {question}")

Question: Which phase of the surgery are we currently at?


In [5]:

# Load BASE model with low VRAM settings (quantized if CUDA is available)
import torch, os
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor, BitsAndBytesConfig

use_cuda = torch.cuda.is_available()
# bfloat16 on GPU, float32 on CPU.
dtype = torch.bfloat16 if use_cuda else torch.float32

# 4-bit NF4 quantization is configured only when CUDA is available; on CPU the
# model is loaded without a quantization config.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    # The option is spelled `bnb_4bit_compute_dtype`. The earlier spelling
    # `bnb_4bit_compute_type` is not a BitsAndBytesConfig option, so the
    # requested bfloat16 compute dtype was never applied.
    bnb_4bit_compute_dtype=torch.bfloat16,
) if use_cuda else None

print("Loading base:", base_model_id)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    base_model_id,
    device_map="auto" if use_cuda else None,
    torch_dtype=dtype,
    quantization_config=bnb_cfg,
).eval()  # inference only

processor = Qwen2_5_VLProcessor.from_pretrained(base_model_id, padding_side="right", use_fast=True)
print("CUDA:", use_cuda)


Loading base: Qwen/Qwen2.5-VL-3B-Instruct


CUDA: True


In [6]:

# Sample a few frames safely on CPU
import decord, torch
decord.bridge.set_bridge('torch')  # get_batch() then returns torch tensors

vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
# Use the public length instead of the private `_num_frame` attribute.
n = len(vr)
if n == 0:
    # Without this guard the index list below would be empty and the failure
    # would surface later with a much less helpful message.
    raise ValueError(f"Video contains no frames: {video_path}")
fps = max(1, int(vr.get_avg_fps()))

# Roughly one frame per second of video, but never more than 4 and never
# fewer than 1, spread evenly from the first frame onwards.
target = min(4, max(1, n // fps))  # <=4 frames
step = max(1, n // target)
idx = list(range(0, n, step))[:target]

frames = vr.get_batch(idx)  # (T, H, W, C)
if frames.shape[-1] == 4:
    # Drop a fourth (alpha) channel if one is present.
    frames = frames[..., :3]
# Channels-first uint8 layout for the processor.
frames = frames.permute(0, 3, 1, 2).contiguous().to(torch.uint8)
print(f"Frames: {tuple(frames.shape)} (T,C,H,W)")


Frames: (4, 3, 224, 224) (T,C,H,W)


In [7]:
# Build chat and generate
SYSTEM = "You analyze cataract surgery video frames and answer concisely based only on visible content."
convo = [
    {"role": "system", "content": [{"type": "text", "text": SYSTEM}]},
    {"role": "user", "content": [
        # The pixel data actually passed to the processor below is the
        # pre-sampled `frames` tensor (videos=[frames]).
        {"type": "video", "video": video_path},
        {"type": "text", "text": question},
    ]},
]
# Render the conversation to a prompt string ending with the generation prompt.
templated = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)

inputs = processor(
    text=[templated],
    videos=[frames],
    return_tensors="pt",
    padding=True,
)
# Move every tensor-like entry to the model's device; leave other values untouched.
inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}

with torch.no_grad():
    # Greedy decoding. `temperature` is a sampling-only setting, so the former
    # `temperature=0.0` contradicted do_sample=False and is dropped.
    output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)

# Decode only the generated tokens (exclude the prompt)
generated = output_ids[:, inputs["input_ids"].shape[1]:]
answer = processor.batch_decode(generated, skip_special_tokens=True)[0].strip()

print("\nQ:", question)
print("A:", answer)



Q: Which phase of the surgery are we currently at?
A: We are in the initial phase of the cataract surgery, where the incision is being made to access the eye.


In [8]:
# Bare last expression: shows the decoded answer string as this cell's output.
answer

'We are in the initial phase of the cataract surgery, where the incision is being made to access the eye.'