# Minimal Cataract Video Q&A — **Fine‑tuned (Headless‑Safe)**
Loads your fine‑tuned Qwen2.5‑VL if present (local adapters/full model or Hub).

In [None]:

# Headless-friendly setup: disable ipywidgets/tqdm progress bars
import os

# Environment flags for a quiet, non-interactive run.
os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "TQDM_DISABLE": "1",
        "TRANSFORMERS_VERBOSITY": "error",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Best effort: also ask huggingface_hub directly. Any failure (package
# missing, helper unavailable, ...) is ignored on purpose so setup never
# aborts the run.
try:
    from huggingface_hub.utils import disable_progress_bars
    disable_progress_bars()
except Exception:
    pass

print("Progress bars disabled for headless execution.")


## ✅ Extra — Set paths / IDs
- `FINETUNED_DIR`: local output of `train.py --save_dir` *(for adapters use `--save_adapter` during training)*
- `HUB_MODEL_ID`: e.g. `yourname/qwen2.5-vl-7b-instruct-cataract1k`, if you pushed the model to the Hub
- `BASE_MODEL_ID`: base model id (defaults to `Qwen/Qwen2.5-VL-7B-Instruct`)

In [None]:

import os
import pathlib

from IPython.display import Video

# Clip to analyse. The path is expanded once, up front, so that the existence
# check, the inline player and every later consumer of `video_path` see the
# same resolved location (the "~" fallback for HOME would otherwise only be
# expanded inside the check).
home = os.getenv("HOME", "~")
video_path = str(
    pathlib.Path(
        os.environ.get(
            "VIDEO_PATH",
            f"{home}/surgery-sft/datasets/cataract1k/videos/test/case_4687_Capsulorhexis_52.50_57.50.mp4",
        )
    ).expanduser()
)

# Where your fine-tuned assets live (local)
finetuned_dir = os.environ.get("FINETUNED_DIR", "./qwen2.5-vl-7b-instruct-cataract1k")

# If you pushed to the Hub during training
hub_model_id = os.environ.get("HUB_MODEL_ID", "")

# Base model to start from (if adapters are used)
base_model_id = os.environ.get("BASE_MODEL_ID", "Qwen/Qwen2.5-VL-7B-Instruct")

print("Video:         ", video_path)
print("FINETUNED_DIR: ", finetuned_dir)
print("HUB_MODEL_ID:  ", hub_model_id or "(none)")
print("BASE_MODEL_ID: ", base_model_id)

# Explicit check instead of `assert`, which is stripped under `python -O`.
if not pathlib.Path(video_path).exists():
    raise FileNotFoundError(f"Video not found: {video_path}")

# Show the video inline (harmless under nbconvert)
Video(video_path, embed=True)


## ✅ Extra — Type your question

In [None]:
# Question put to the model about the clip; echoed so it appears in the run log.
question = "Which phase of the surgery are we currently at?"
print('Question:', question)

In [None]:

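# Load the base model (4-bit on CUDA to keep VRAM low), then the fine-tuned weights: local adapters, local full model, or Hub.
# Set BASE_MODEL_ID if the default base is not the model you fine-tuned from.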
import os, json, torch
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor, BitsAndBytesConfig
from peft import PeftModel

# bf16 on GPU, fp32 on CPU.
use_cuda = torch.cuda.is_available()
dtype = torch.bfloat16 if use_cuda else torch.float32

# 4-bit NF4 quantization with double quantization, GPU only.
# NOTE: the keyword is `bnb_4bit_compute_dtype`; the previous spelling
# (`bnb_4bit_compute_type`) is not a BitsAndBytesConfig parameter, so the
# requested bf16 compute dtype was never actually set.
bnb_cfg = None
if use_cuda:
    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

print("Loading base:", base_model_id)
base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    base_model_id,
    device_map="auto" if use_cuda else None,
    dtype=dtype,
    quantization_config=bnb_cfg,
)

def try_load_peft(model_base, model_id_or_dir):
    """Load PEFT adapters onto ``model_base`` and return the merged model.

    The adapters at ``model_id_or_dir`` are loaded with
    ``PeftModel.from_pretrained`` and the result of ``merge_and_unload()``
    is returned. Any exception raised along the way is printed and turned
    into a ``None`` return, so callers can fall through to another source.
    """
    merged = None
    try:
        adapted = PeftModel.from_pretrained(model_base, model_id_or_dir)
        merged = adapted.merge_and_unload()
    except Exception as err:
        print(f"PEFT load failed for '{model_id_or_dir}':", err)
    return merged

def try_load_full(model_id_or_dir):
    """Load a complete model from ``model_id_or_dir``.

    Uses the same device map, dtype and quantization settings as the base
    model load (module-level ``use_cuda``, ``dtype`` and ``bnb_cfg``).
    Returns the loaded model, or ``None`` after printing the error if
    loading raises.
    """
    loaded = None
    try:
        load_kwargs = {
            "device_map": "auto" if use_cuda else None,
            "dtype": dtype,
            "quantization_config": bnb_cfg,
        }
        loaded = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_id_or_dir, **load_kwargs
        )
    except Exception as err:
        print(f"Direct full-model load failed for '{model_id_or_dir}':", err)
    return loaded

# Resolve the fine-tuned weights. Sources are tried in priority order and the
# first one that loads wins; `load_from` records which one that was.
model, load_from = None, None
has_local_dir = os.path.isdir(finetuned_dir)

# 1) Local adapters
if has_local_dir:
    model = try_load_peft(base, finetuned_dir)
    if model:
        load_from = f"local adapters: {finetuned_dir}"

# 2) Local full model (only when the adapter attempt produced nothing)
if has_local_dir and not model:
    model = try_load_full(finetuned_dir)
    if model:
        load_from = f"local full model: {finetuned_dir}"

# 3) Hub repo, if one was configured: adapters first, then a full model
if model is None and hub_model_id:
    model = try_load_peft(base, hub_model_id)
    if not model:
        model = try_load_full(hub_model_id)
    if model:
        load_from = f"hub: {hub_model_id}"

# Nothing loaded from any source: stop with guidance on what to check.
if model is None:
    raise RuntimeError(
        "Could not find fine‑tuned weights.\n"
        f"Checked FINETUNED_DIR='{finetuned_dir}' and HUB_MODEL_ID='{hub_model_id}'.\n"
        "If you trained with QLoRA, ensure --save_adapter was used and FINETUNED_DIR contains adapter_config.json.\n"
        "Otherwise, set HUB_MODEL_ID to your pushed repo id.\n"
    )

# The processor is loaded from the base model id (not from the fine-tuned
# location). The string quotes here were previously backslash-escaped, which
# is a syntax error in Python.
processor = Qwen2_5_VLProcessor.from_pretrained(base_model_id, padding_side="right", use_fast=True)
model.eval()  # inference only
print("✅ Loaded fine‑tuned from:", load_from)
print("CUDA:", use_cuda)


In [None]:

# Sample a few frames safely on CPU
import decord
import torch

decord.bridge.set_bridge('torch')

# Decode on the CPU context so frame sampling does not need a GPU.
vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
n = len(vr)  # public frame count (replaces the private `_num_frame` attribute)
if n == 0:
    # Without this guard the index list below would be empty and the failure
    # would surface later with a less helpful message.
    raise ValueError(f"No decodable frames in video: {video_path}")
fps = max(1, int(vr.get_avg_fps()))

# Roughly one frame per second of footage, capped at 4, taken at a fixed
# stride starting from the first frame.
target = min(4, max(1, n // fps))  # <=4 frames
step = max(1, n // target)
idx = list(range(0, n, step))[:target]

frames = vr.get_batch(idx)  # (T, H, W, C)
if frames.shape[-1] == 4:
    frames = frames[..., :3]  # keep the first three channels only
frames = frames.permute(0, 3, 1, 2).contiguous().to(torch.uint8)
print(f"Frames: {tuple(frames.shape)} (T,C,H,W)")


In [None]:

# Build chat and generate (with proper prompt slicing)
import torch

SYSTEM = "You analyze cataract surgery video frames and answer concisely based only on visible content."
convo = [
    {"role": "system", "content": [{"type": "text", "text": SYSTEM}]},
    {"role": "user", "content": [
        {"type": "video", "video": video_path},
        {"type": "text", "text": question},
    ]},
]
# Render the conversation to prompt text only; tokenization happens in the
# processor call below together with the sampled frames.
templated = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)

inputs = processor(text=[templated], videos=[frames], return_tensors="pt", padding=True)
# Move every tensor-like entry to the model's device; leave other values as-is.
inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}

# Greedy decoding: `do_sample=False` already makes the output deterministic.
# No `temperature` is passed because it only applies when sampling, and a
# value of 0.0 is not a valid sampling temperature.
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)

# Slice off the prompt to get only the new tokens
gen_only = output_ids[:, inputs["input_ids"].shape[1]:]
answer = processor.batch_decode(gen_only, skip_special_tokens=True)[0].strip()

# Blank line before the Q/A pair (previously printed a literal backslash-n).
print("\nQ:", question)
print("A:", answer)
