In [5]:
# --- ONE-CELL OFFLINE VIT-GPT2 INFERENCE (JSON TIMELINE) ---
# Set the four paths below, then run the cell. Nothing is downloaded: the model
# comes from a single .pth file and the tokenizer from a local directory.

# VisionEncoderDecoder state_dict checkpoint.
VITGPT2_PTH = r"weights\vitgpt2\vitgpt2.pth"
# Tokenizer directory; must contain vocab.json and merges.txt.
TOKENIZER_DIR = r"weights\vitgpt2\tokenizer"
# Offline video to summarize.
VIDEO_PATH = r"..\..\..\..\sample_original.mp4"
# Where the JSON summary is written.
OUTPUT_JSON_PATH = r"summary_output.json"

# ---- Minimal imports (keep it light) ----
import os, json, time, cv2, numpy as np, torch
from PIL import Image
from torchvision import transforms
from torch.serialization import add_safe_globals
from transformers import (ViTConfig, ViTModel, GPT2Config, GPT2LMHeadModel,
                          VisionEncoderDecoderModel, GPT2TokenizerFast)

# ---- Safety for torch.load in PyTorch 2.6+ (allowlist numpy scalar) ----
add_safe_globals([np.core.multiarray.scalar])  # safe to allow if you trust the file

# ---- Basic checks ----
# Explicit exceptions instead of `assert`: assertions are stripped when Python
# runs with -O, which would let a bad path through to a far less readable
# failure later on.
if not os.path.exists(VITGPT2_PTH):
    raise FileNotFoundError(f"Missing: {VITGPT2_PTH}")
for _tokenizer_file in ("vocab.json", "merges.txt"):
    if not os.path.isfile(os.path.join(TOKENIZER_DIR, _tokenizer_file)):
        raise FileNotFoundError(f"Missing tokenizer {_tokenizer_file}")
if not os.path.exists(VIDEO_PATH):
    raise FileNotFoundError(f"Missing video: {VIDEO_PATH}")

# Prefer the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Use half of the reported CPU count (at least one thread); os.cpu_count()
# may return None, in which case 4 CPUs are assumed.
torch.set_num_threads(max(1, (os.cpu_count() or 4) // 2))

# ==== 1) Build ViT-GPT2 architecture in code (no config.json / downloads) ====
# The patch size has to match the checkpoint: 16 for ViT-B/16, 32 for ViT-B/32.
IMAGE_SIZE = 224
PATCH_SIZE = 32
HIDDEN_SIZE = 768
NUM_LAYERS = 12
NUM_HEADS = 12
INTERMEDIATE_SIZE = 3072

# Vision encoder configuration.
vit_cfg = ViTConfig(
    image_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_hidden_layers=NUM_LAYERS,
    num_attention_heads=NUM_HEADS,
    intermediate_size=INTERMEDIATE_SIZE,
)

# Text decoder configuration; cross-attention lets GPT-2 attend to the
# encoder output. Token id 50256 serves as both BOS and EOS.
gpt2_cfg = GPT2Config(
    n_positions=1024,
    n_ctx=1024,
    n_embd=HIDDEN_SIZE,
    n_layer=NUM_LAYERS,
    n_head=NUM_HEADS,
    bos_token_id=50256,
    eos_token_id=50256,
    is_decoder=True,
    add_cross_attention=True,
    use_cache=True,
)

# Freshly initialised encoder/decoder pair; the weights are filled in from the
# checkpoint afterwards.
model = VisionEncoderDecoderModel(
    encoder=ViTModel(vit_cfg),
    decoder=GPT2LMHeadModel(gpt2_cfg),
)

# ==== 2) Load your .pth safely (weights_only=True) ====
# weights_only=True restricts unpickling to tensors/containers plus whatever
# globals were explicitly allowlisted via add_safe_globals.
sd = torch.load(VITGPT2_PTH, map_location="cpu", weights_only=True)
# Some checkpoints wrap the weights as {"state_dict": {...}, ...}.
if isinstance(sd, dict) and "state_dict" in sd:
    sd = sd["state_dict"]

# Strip wrapper prefixes ("module." first, then "model.") so the keys line up
# with the parameter names of `model`.
fixed = {}
for k, v in sd.items():
    nk = k
    for pref in ("module.", "model."):
        if nk.startswith(pref):
            nk = nk[len(pref):]
    fixed[nk] = v

# strict=False tolerates partial mismatches, but it would also hide a
# checkpoint that matches nothing (leaving the model on random weights), so
# report whatever was skipped instead of discarding the result.
load_report = model.load_state_dict(fixed, strict=False)
for _label, _names in (("missing", load_report.missing_keys),
                       ("unexpected", load_report.unexpected_keys)):
    if _names:
        _preview = ", ".join(_names[:5]) + (" ..." if len(_names) > 5 else "")
        print(f"WARNING: {len(_names)} {_label} key(s) when loading "
              f"{VITGPT2_PTH}: {_preview}")

# Inference only: move to the target device, switch to eval mode and disable
# gradient tracking on every parameter.
model.to(device).eval()
model.requires_grad_(False)

# ==== 3) Tokenizer from local files only (no downloads) ====
tokenizer = GPT2TokenizerFast.from_pretrained(
    TOKENIZER_DIR,
    local_files_only=True,
)

# Fall back to the EOS token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Propagate the tokenizer's special-token ids to the model config.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id

# ==== 4) Minimal preprocessing & caption helpers ====
# Normalisation uses mean = std = 0.5 per channel. If you trained with ImageNet
# stats, swap Normalize to mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225].
tfm = transforms.Compose(
    [
        transforms.Resize(
            IMAGE_SIZE,
            interpolation=transforms.InterpolationMode.BICUBIC,
        ),
        transforms.CenterCrop(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

# Keyword arguments forwarded to model.generate(): beam search with 3 beams,
# no sampling, at most 16 new tokens.
GEN = {"max_new_tokens": 16, "do_sample": False, "num_beams": 3}

@torch.inference_mode()
def caption_frame(rgb):
    """Generate a caption for one frame.

    Args:
        rgb: Image array accepted by ``PIL.Image.fromarray`` (presumably an
            H x W x 3 uint8 RGB array - verify against caller).

    Returns:
        str: The decoded caption with special tokens removed and surrounding
        whitespace stripped.
    """
    image = Image.fromarray(rgb)
    # Preprocess, add a batch dimension, and move to the inference device.
    batch = tfm(image).unsqueeze(0).to(device)
    generated = model.generate(
        pixel_values=batch,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        **GEN,
    )
    caption = tokenizer.decode(generated[0], skip_special_tokens=True)
    return caption.strip()

def video_meta(path):
    """Read basic metadata of a video via OpenCV.

    Args:
        path: Path of the video file.

    Returns:
        tuple: ``(fps, frame_count, duration_seconds)``. A property reported
        as a falsy value becomes 0, and the duration is 0.0 unless both the
        frame rate and the frame count are positive.
    """
    capture = cv2.VideoCapture(path)
    frame_rate = float(capture.get(cv2.CAP_PROP_FPS) or 0.0)
    frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
    capture.release()

    if frame_rate > 0 and frame_count > 0:
        duration = frame_count / frame_rate
    else:
        duration = 0.0
    return frame_rate, frame_count, duration

def frame_at_time(path, t):
    """Decode the frame closest to time ``t`` of a video.

    The target frame index is ``round(t * fps)``, clamped to the valid range
    ``[0, frame_count - 1]`` when the frame count is known. Without the upper
    clamp, a time just before the end of the clip can round up to
    ``frame_count`` (e.g. 29.97 fps, 300 frames, t = 10.005 s gives index 300)
    and the read fails. If the video reports no frame rate, the first frame
    is returned instead.

    Args:
        path: Path of the video file.
        t: Timestamp in seconds.

    Returns:
        The frame as returned by ``cv2.VideoCapture.read`` (OpenCV decodes to
        BGR channel order by default).

    Raises:
        RuntimeError: If no frame could be decoded.
    """
    cap = cv2.VideoCapture(path)
    try:
        fps = float(cap.get(cv2.CAP_PROP_FPS) or 0.0)
        if fps <= 0:
            # Cannot convert seconds to a frame index; use the first frame.
            ok, frm = cap.read()
            if not ok or frm is None:
                raise RuntimeError("Could not read frame")
            return frm

        idx = max(0, int(round(t * fps)))
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        if total > 0:
            # Keep the seek target inside the clip.
            idx = min(idx, total - 1)
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frm = cap.read()
    finally:
        # Release the capture on every path, including errors.
        cap.release()

    if not ok or frm is None:
        raise RuntimeError(f"Could not read frame at {t:.2f}s")
    return frm

# ==== 5) Summarize one offline video into your JSON format ====
def summarize_video_json(path, window_sec=3.0, stride_sec=1.0, max_windows=16):
    """Caption sliding windows of a video and return a JSON-serializable timeline.

    Windows start every ``stride_sec`` seconds and last ``window_sec`` seconds
    (clipped to the video duration). One frame at the midpoint of each window
    is captioned with ``caption_frame``. Progress statistics are printed once
    all windows are processed.

    Args:
        path: Path of the video file.
        window_sec: Length of each window in seconds; must be > 0.
        stride_sec: Distance between window starts in seconds; must be > 0.
        max_windows: Maximum number of windows to caption.

    Returns:
        dict: ``{"items": [[start, end, caption], ...], "duration": seconds}``
        with all times rounded to two decimals.

    Raises:
        ValueError: If ``window_sec`` or ``stride_sec`` is not positive.
        RuntimeError: Propagated from ``frame_at_time`` when a frame cannot
            be decoded.
    """
    # Fail fast with a clear message: a zero stride otherwise blows up inside
    # np.arange, a negative one silently yields no windows, and a non-positive
    # window produces intervals whose end is not after their start.
    if not stride_sec > 0:
        raise ValueError(f"stride_sec must be > 0, got {stride_sec!r}")
    if not window_sec > 0:
        raise ValueError(f"window_sec must be > 0, got {window_sec!r}")

    _, _, dur = video_meta(path)
    if dur <= 0:
        # Duration unknown (no fps / frame count reported): assume 6 seconds.
        dur = 6.0

    # The tiny epsilon keeps a window from starting exactly at the end.
    starts = np.arange(0.0, max(0.0, dur - 1e-6), stride_sec)
    items = []
    t0 = time.perf_counter()
    for n, s in enumerate(starts):
        if n >= max_windows:
            break
        start, end = float(s), float(min(s + window_sec, dur))
        center = (start + end) / 2.0
        bgr = frame_at_time(path, center)
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        text = caption_frame(rgb)
        items.append([round(start, 2), round(end, 2), text])
    print(f"Segments: {len(items)} | Elapsed: {time.perf_counter()-t0:.2f}s on {device}")
    return {"items": items, "duration": round(dur, 2)}

# ==== 6) Run & print/save JSON ====
result = summarize_video_json(
    VIDEO_PATH,
    window_sec=3.0,
    stride_sec=1.0,
    max_windows=16,
)

# dirname() is empty for a bare file name; fall back to the current directory.
os.makedirs(os.path.dirname(OUTPUT_JSON_PATH) or ".", exist_ok=True)
with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
    f.write(json.dumps(result, ensure_ascii=False, indent=2))

print("Saved JSON to:", OUTPUT_JSON_PATH)
print(json.dumps(result, indent=2))

# ---- If (and only if) safe load still fails, you can fall back to:
# sd = torch.load(VITGPT2_PTH, map_location="cpu", weights_only=False)  # ONLY for trusted files


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, do those steps only if you trust the source of the checkpoint.
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL numpy.dtype was not an allowed global by default. Please use `torch.serialization.add_safe_globals([numpy.dtype])` or the `torch.serialization.safe_globals([numpy.dtype])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.