In [None]:
!pip install git+https://github.com/huggingface/transformers -q
!pip install qwen-vl-utils -q

In [None]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import numpy as np
import cv2
import torch
import json
import time
import h5py
from tqdm import tqdm
import warnings
import os
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Read the training annotations. The encoding is passed explicitly so that non-ASCII
# text, if present, decodes the same way regardless of the platform's default locale.
with open("/kaggle/input/zaloaic25/data/train/train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Several annotation entries can reference the same video, so de-duplicate the paths.
# Sorting keeps the processing order (and the preview below) reproducible across
# kernel restarts, which iterating over a bare set() does not guarantee.
video_paths = sorted(
    {"/kaggle/input/zaloaic25/data/" + item["video_path"] for item in train_data["data"]}
)
video_paths[:5]

In [None]:
# Checkpoint identifier shared by the model weights and the matching processor.
model_id = "Qwen/Qwen3-VL-4B-Instruct"

# Load the weights, letting the library choose dtype and device placement.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_id,
    dtype="auto",
    device_map="auto",
)
# Switch to evaluation mode. Kept off the last line of the cell so the (very long)
# module repr is not echoed as cell output.
model.eval()

# Processor that accompanies the same checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
# Cache of one pooled visual embedding per training video, stored as one HDF5 dataset
# per video and keyed by the video's file name. If the cache already exists it is only
# inspected, not recomputed.
vid_emb_path = "/kaggle/working/video_features.hdf5"
if not os.path.exists(vid_emb_path):
    # Write into a scratch file and rename it only after every video succeeded.
    # Otherwise a crash mid-loop leaves a partial file behind that the existence
    # check above would treat as a finished cache on the next run.
    vid_emb_tmp_path = vid_emb_path + ".tmp"
    with h5py.File(vid_emb_tmp_path, "w") as f:
        for video_path in tqdm(video_paths):
            # A single user turn whose only content is the video itself.
            messages = [
                {"role": "user", "content": [{"type": "video", "video": f"file://{video_path}"}]}
            ]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)

            inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")

            # NOTE(review): the extra leading axis added by unsqueeze(0) is carried over
            # unchanged from the original cell; confirm get_video_features needs it.
            pixel_values_videos = inputs["pixel_values_videos"].unsqueeze(0).to(model.device)
            video_grid_thw = inputs["video_grid_thw"].to(model.device)

            # torch.autocast(device_type="cuda") is the non-deprecated spelling of
            # torch.cuda.amp.autocast().
            with torch.no_grad(), torch.autocast(device_type="cuda"):
                video_emb, _ = model.get_video_features(pixel_values_videos, video_grid_thw)

            # Pool while the data is still a torch tensor: numpy's mean() has no `dim`
            # keyword, so pooling after .numpy() raised a TypeError.
            # Assumes video_emb[0] is (num_tokens, hidden) for this video - TODO confirm.
            # Flattening every leading axis makes the token axis unambiguous.
            video_tokens = video_emb[0]
            video_tokens = video_tokens.reshape(-1, video_tokens.shape[-1])
            # Average in float32, then store as float16 like the text features.
            video_emb = video_tokens.float().mean(dim=0).to(torch.float16).cpu().numpy()

            key = os.path.basename(video_path)
            f.create_dataset(key, data=video_emb)

    # Every video was written; publish the cache under its final name.
    os.replace(vid_emb_tmp_path, vid_emb_path)
else:
    with h5py.File(vid_emb_path, "r") as f:
        stored_keys = list(f.keys())
        print(f"Number of stored videos: {len(stored_keys)}")
        print("Sample keys:", stored_keys[:5])

        # An empty file has no sample to describe; indexing the first key would raise.
        if stored_keys:
            sample_key = stored_keys[0]
            print(f"\nSample dataset: {sample_key}")
            print("shape:", f[sample_key].shape)
            print("dtype:", f[sample_key].dtype)

In [None]:
# Cache of text embeddings: one HDF5 dataset per question holding one row per answer
# choice, plus a JSON side file that maps each dataset key to its question, choices
# and the index of the correct choice.
embed_path = "/kaggle/working/text_features.hdf5"
meta_path = "/kaggle/working/text_metadata.json"

if not os.path.exists(embed_path):
    all_meta = {}
    # Embeddings go to a scratch file first. It is renamed to embed_path only after the
    # metadata has been written, so the existence check above never sees a partial
    # feature file or a feature file without its metadata.
    embed_tmp_path = embed_path + ".tmp"

    with h5py.File(embed_tmp_path, "w") as f:
        qa_counter = 0

        for item in tqdm(train_data["data"], desc="Extracting text embeddings"):
            video_id = os.path.basename(item['video_path'])
            question = item['question']
            # Each choice is embedded together with its question.
            choices = [f"{question} {c}" for c in item['choices']]
            answer_text = f"{question} {item['answer']}"
            # Raises ValueError if the answer is not literally one of the choices.
            answer_idx = choices.index(answer_text)

            inputs = processor.tokenizer(choices, return_tensors="pt", padding=True, truncation=True, max_length=128).to(model.device)

            with torch.no_grad():
                outputs = model.model.language_model(**inputs)
                hidden_states = outputs.last_hidden_state  # (batch, seq_len, dim)

                # padding=True pads shorter choices up to the longest one in the batch.
                # Weight by the attention mask so padding positions do not leak into
                # the mean; a plain mean over seq_len would average them in.
                token_mask = inputs["attention_mask"].unsqueeze(-1).to(torch.float32)
                summed = (hidden_states.float() * token_mask).sum(dim=1)
                token_counts = token_mask.sum(dim=1).clamp(min=1.0)  # avoid 0-division
                text_embeddings = (summed / token_counts).to(torch.float16).cpu().numpy()

            # The running counter keeps keys unique when a video has several questions.
            qa_key = f"{video_id}_{qa_counter:04d}"
            f.create_dataset(qa_key, data=text_embeddings)

            # Store metadata
            all_meta[qa_key] = {
                "video_id": video_id,
                "question": question,
                "choices": item["choices"],
                "answer": answer_idx
            }

            qa_counter += 1

    with open(meta_path, "w", encoding="utf-8") as f_json:
        json.dump(all_meta, f_json, ensure_ascii=False, indent=2)

    # Both outputs are complete; publish the feature file under its final name.
    os.replace(embed_tmp_path, embed_path)

    # The status markers below were mojibake (UTF-8 emoji decoded as cp1252) and are
    # restored to the intended characters.
    print(f"✅ Saved {qa_counter} QAs to {embed_path}")
    print(f"🧾 Metadata saved to {meta_path}")
else:
    print(f"📂 Found existing file: {embed_path}")