In [1]:
import tensorflow as tf
# Report the TensorFlow version, the GPUs it can see, and the CUDA/cuDNN
# versions recorded in its build info.
print('TF:', tf.__version__)
print('GPU:', tf.config.list_physical_devices('GPU'))
_build_info = tf.sysconfig.get_build_info()
print('Build CUDA/cuDNN:', _build_info.get('cuda_version'), _build_info.get('cudnn_version'))

2025-11-02 14:15:19.399379: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762092919.601178      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762092919.659884      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


TF: 2.18.0
GPU: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Build CUDA/cuDNN: 12.5.1 9


In [2]:
!pip show nvidia-cudnn-cu12

Name: nvidia-cudnn-cu12
Version: 9.3.0.75
Summary: cuDNN runtime libraries
Home-page: https://developer.nvidia.com/cuda-zone
Author: Nvidia CUDA Installer Team
Author-email: compute_installer@nvidia.com
License: NVIDIA Proprietary Software
Location: /usr/local/lib/python3.11/dist-packages
Requires: nvidia-cublas-cu12
Required-by: torch


In [3]:
# %% Cell 1: Environment setup (Training)
import os, json, numpy as np, tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import re

# Quieten TensorFlow's C++ logging and disable oneDNN custom ops unless the
# caller already exported a preference.
# NOTE(review): TensorFlow is imported before these lines run, so they may be
# read too late to affect the current kernel - TODO confirm and, if so, move
# them above the first `import tensorflow`.
os.environ.setdefault('TF_CPP_MIN_LOG_LEVEL','2')
os.environ.setdefault('TF_ENABLE_ONEDNN_OPTS','0')

# GPU memory growth (optional)
try:
  gpus = tf.config.list_physical_devices('GPU')
  if gpus:
    for gpu in gpus:
      try:
        tf.config.experimental.set_memory_growth(gpu, True)
      except Exception as growth_err:
        # Best effort: keep going, but say why growth could not be enabled
        # instead of hiding the reason.
        print(f'Could not enable memory growth on {gpu.name}:', growth_err)
    print('TensorFlow GPUs:', gpus)
  else:
    print('No GPU detected for TensorFlow, using CPU.')
except Exception as e:
  print('GPU config error (TensorFlow):', e)

# Seed the Python, NumPy and TensorFlow random generators with one value.
import random
SEED=42
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

def resolve_proc_dir():
  """Locate the directory that holds the preprocessed artifacts.

  A directory qualifies when it contains a ``splits.json`` file. Candidates
  are tried in this order and the first match wins:

  1. the ``PROC_DIR`` environment variable,
  2. ``/kaggle/input/my-video-feature/processed``,
  3. any ``/kaggle/input/*/processed`` (visited in sorted order so the choice
     is reproducible when several datasets are attached),
  4. ``/kaggle/working/processed``,
  5. ``processed`` relative to the current working directory.

  Returns:
    str: path of the first qualifying directory.

  Raises:
    FileNotFoundError: if no candidate contains ``splits.json``.
  """
  import glob

  def has_splits(path):
    return os.path.isfile(os.path.join(path, 'splits.json'))

  candidates = []
  env = os.environ.get('PROC_DIR')
  if env:
    candidates.append(env)
  # Kaggle dataset path used by this notebook
  candidates.append('/kaggle/input/my-video-feature/processed')
  # Other Kaggle inputs that ship a processed folder
  candidates.extend(sorted(glob.glob('/kaggle/input/*/processed')))
  # Kaggle working dir, then the local default
  candidates.append('/kaggle/working/processed')
  candidates.append('processed')

  for cand in candidates:
    if has_splits(cand):
      return cand
  raise FileNotFoundError("Không tìm thấy processed/splits.json. Hãy set PROC_DIR trỏ tới thư mục chứa 'splits.json'.")

# Resolve the artifact directory once and echo it so the run log records
# which location was picked.
PROC_DIR = resolve_proc_dir()
print(f"PROC_DIR: {PROC_DIR}")

TensorFlow GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
PROC_DIR: /kaggle/input/my-video-feature/processed


In [4]:
# %% Cell 2: Load processed artifacts
# JSON is UTF-8 by specification; the encoding is passed explicitly so that
# loading does not depend on the platform's default locale.

# Video ids of each split
with open(os.path.join(PROC_DIR,'splits.json'),'r',encoding='utf-8') as f:
  splits=json.load(f)
train_ids=splits['train']; val_ids=splits['val']; test_ids=splits['test']
print(f"Loaded splits -> train={len(train_ids)} | val={len(val_ids)} | test={len(test_ids)}")

# Fitted Keras tokenizer plus its metadata (vocabulary size, padded length,
# special start/end markers)
with open(os.path.join(PROC_DIR,'tokenizer','tokenizer.json'),'r',encoding='utf-8') as f:
  Tok=tokenizer_from_json(f.read())
with open(os.path.join(PROC_DIR,'tokenizer','meta.json'),'r',encoding='utf-8') as f:
  meta=json.load(f)
vocab_size=int(meta['vocab_size']); max_len=int(meta['max_len']); SPECIAL=meta['special']
print(f"Tokenizer: vocab_size={vocab_size} | max_len={max_len}")

# Load cleaned annotations (looked up by video id further down)
with open(os.path.join(PROC_DIR,'annotations.json'),'r',encoding='utf-8') as f:
  annotations=json.load(f)

# infer feature dimension
# Preferred source: the 'feat_dim' entry of PROC_DIR/meta.json.
feat_meta_path=os.path.join(PROC_DIR,'meta.json')
feat_dim=None
if os.path.exists(feat_meta_path):
  try:
    # Context manager so the handle is closed even when parsing fails
    with open(feat_meta_path,'r',encoding='utf-8') as f:
      feat_dim=int(json.load(f)['feat_dim'])
  except Exception:
    feat_dim=None
if feat_dim is None:
  # Fallback: read one saved train feature to infer dim
  for vid in train_ids:
    p=os.path.join(PROC_DIR,'features','train',f"{vid}.npy")
    if os.path.exists(p):
      try:
        # Use the flattened length: the pair-building code in this notebook
        # flattens multi-dimensional features before comparing with feat_dim.
        arr=np.load(p, mmap_mode='r'); feat_dim=int(arr.size); break
      except Exception:
        pass
  # Last resort when neither the metadata nor any feature file is usable
  if feat_dim is None: feat_dim=768
print(f"Feature dim: {feat_dim}")

Loaded splits -> train=1379 | val=295 | test=296
Tokenizer: vocab_size=10364 | max_len=47
Feature dim: 768


In [5]:
# %% Cell 3: Build training pairs from processed features
# Integer ids of the caption start/end markers. `.get` yields None when a
# marker is missing from the tokenizer vocabulary.
start_idx=Tok.word_index.get(SPECIAL['start']); end_idx=Tok.word_index.get(SPECIAL['end'])

def seq_pairs_for_split(ids):
  """Expand every caption of the given videos into next-token training pairs.

  For each video id whose feature file exists and whose flattened feature has
  ``feat_dim`` entries, every caption is wrapped in the start/end markers,
  tokenized, and turned into one sample per prefix: the input is the prefix
  (post-padded to ``max_len``) and the target is the token that follows it.

  Args:
    ids: iterable of video ids.

  Returns:
    tuple ``(Xv, Xs, y)``:
      Xv: float32 array (n_pairs, feat_dim) - video feature of each pair.
      Xs: int32 array (n_pairs, max_len) - padded prefix of each pair.
      y:  int32 array (n_pairs,) - next-token id of each pair.
    All three have zero rows when no pair could be built.
  """
  # Hoisted out of the loop: set lookups instead of scanning the id lists
  # once per video.
  train_set=set(train_ids); val_set=set(val_ids)
  Xv=[]; Xs=[]; y=[]
  for vid in tqdm(ids, desc='pairs', unit='vid'):
    split_dir = 'train' if vid in train_set else ('val' if vid in val_set else 'test')
    vpath=os.path.join(PROC_DIR,'features',split_dir,f"{vid}.npy")
    if not os.path.exists(vpath):
      continue
    vf=np.load(vpath, mmap_mode='r')
    if vf.ndim>1: vf=vf.reshape(-1)
    if vf.shape[0]!=feat_dim:
      continue
    # One in-memory float32 copy per video, shared by all of its pairs
    # (np.stack below makes the per-row copies). np.array always copies, so
    # the memory map is released as soon as `vf` is rebound.
    vf32=np.array(vf, dtype=np.float32)
    for cap in annotations.get(vid, []):
      txt=f"{SPECIAL['start']} {cap} {SPECIAL['end']}"
      seq=Tok.texts_to_sequences([txt])[0]
      if len(seq)<2:
        # No (prefix, next token) pair can be formed
        continue
      # Pad every prefix of this caption in a single call
      prefixes=pad_sequences([seq[:i] for i in range(1,len(seq))],maxlen=max_len, padding='post')
      Xs.extend(prefixes.astype(np.int32))
      Xv.extend([vf32]*len(prefixes))
      y.extend(seq[1:])
  Xv=np.stack(Xv).astype(np.float32) if len(Xv)>0 else np.zeros((0,feat_dim),dtype=np.float32)
  Xs=np.stack(Xs).astype(np.int32) if len(Xs)>0 else np.zeros((0,max_len),dtype=np.int32)
  y=np.array(y,dtype=np.int32)
  return Xv, Xs, y

# Materialize the (feature, prefix, next-token) arrays for the train and
# validation splits.
Xv_tr,Xs_tr,y_tr=seq_pairs_for_split(train_ids)
Xv_val,Xs_val,y_val=seq_pairs_for_split(val_ids)

pairs:   0%|          | 0/1379 [00:00<?, ?vid/s]

pairs:   0%|          | 0/295 [00:00<?, ?vid/s]

In [6]:
# %% Cell 4: Model (visual + text fusion)
# Visual branch: feature vector -> Dense(256, relu) -> Dropout(0.5)
vi=layers.Input(shape=(feat_dim,),name='visual')
vb=layers.Dense(256,activation='relu')(vi); vb=layers.Dropout(0.5)(vb)
# Text branch: padded token ids -> Embedding (id 0 is masked) -> LSTM(256)
si=layers.Input(shape=(max_len,),name='seq_in')
emb=layers.Embedding(input_dim=vocab_size, output_dim=256, mask_zero=True)(si)
lstm=layers.LSTM(256, unroll=True)(emb)
# Fuse the two 256-d branches by element-wise addition
fusion=layers.Add()([vb,lstm])
hd=layers.Dense(256,activation='relu')(fusion)
# No activation on the output layer: it emits one raw score per vocabulary id
out=layers.Dense(vocab_size)(hd)
model=models.Model(inputs=[vi,si],outputs=out)

# Optimizer & compile (stable settings)
# Best effort: turn the XLA JIT off; any failure is ignored.
try: tf.config.optimizer.set_jit(False)
except Exception: pass
# Best effort: force the global Keras dtype policy to float32.
try:
  from tensorflow.keras import mixed_precision
  mixed_precision.set_global_policy('float32')
except Exception: pass

# Learning Rate Schedule: WarmUp + Cosine
class WarmUpCosine(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning-rate schedule: linear warm-up followed by cosine decay.

  While ``step < warmup_steps`` the rate grows linearly from 0 towards
  ``base_lr``; afterwards it follows half a cosine wave from ``base_lr`` down
  to ``min_lr``, which is reached at ``total_steps`` and held from then on.

  Args:
    base_lr: peak learning rate reached at the end of warm-up.
    warmup_steps: number of warm-up steps (clamped to at least 1).
    total_steps: step at which the decay ends (clamped to at least
      ``warmup_steps + 1``).
    min_lr: learning rate at and after ``total_steps``.
  """
  def __init__(self, base_lr, warmup_steps, total_steps, min_lr=1e-6):
    self.base_lr = float(base_lr)
    self.min_lr = float(min_lr)
    # Guarantee at least one warm-up step and one decay step after it
    self.warmup_steps = int(max(1, warmup_steps))
    self.total_steps = int(max(self.warmup_steps + 1, total_steps))

  def __call__(self, step):
    """Return the learning rate for the given optimizer step."""
    step_f = tf.cast(step, tf.float32)
    n_warm = tf.cast(self.warmup_steps, tf.float32)
    n_total = tf.cast(self.total_steps, tf.float32)
    # Linear ramp used before n_warm
    ramp_lr = self.base_lr * (step_f / tf.maximum(1.0, n_warm))
    # Fraction of the decay phase completed, clipped to [0, 1]
    frac = tf.clip_by_value((step_f - n_warm) / tf.maximum(1.0, n_total - n_warm), 0.0, 1.0)
    half_cos = 0.5 * (1.0 + tf.cos(tf.constant(np.pi) * frac))
    decay_lr = self.min_lr + (self.base_lr - self.min_lr) * half_cos
    return tf.where(step_f < n_warm, ramp_lr, decay_lr)

  def get_config(self):
    """Return the constructor arguments as a plain dict."""
    return dict(
      base_lr=self.base_lr,
      warmup_steps=self.warmup_steps,
      total_steps=self.total_steps,
      min_lr=self.min_lr,
    )

  @classmethod
  def from_config(cls, config):
    """Rebuild the schedule from a dict produced by ``get_config``."""
    return cls(**config)

# Batch size and epoch count are needed here to size the schedule; values
# already present in the kernel namespace take precedence over the defaults.
bs = globals().get('bs', 64)
epochs = globals().get('epochs', 30)
n_tr = int(y_tr.shape[0]) if hasattr(y_tr, 'shape') else len(y_tr)
steps_per_epoch = max(1, int(np.ceil(n_tr / bs)))
_total_steps = steps_per_epoch * epochs
# Warm up for 10% of training, but never for fewer than 1000 steps
_warmup_steps = max(1000, int(0.1 * _total_steps))
schedule = WarmUpCosine(
  base_lr=3e-4,
  warmup_steps=_warmup_steps,
  total_steps=_total_steps,
  min_lr=1e-6,
)
opt=tf.keras.optimizers.Adam(learning_rate=schedule, clipnorm=1.0)

# Top-5 accuracy is tracked next to plain accuracy
topk_metric = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='sparse_top_k_categorical_accuracy')
model.compile(
  optimizer=opt,
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=['sparse_categorical_accuracy', topk_metric],
  steps_per_execution=16,
  run_eagerly=False,
)

I0000 00:00:1762092957.319310      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [7]:
# %% Cell 5: Train
bs=64; epochs=30
# Ensure pairs exist if this cell runs standalone
if 'y_tr' not in globals() or 'Xv_tr' not in globals() or 'Xs_tr' not in globals():
  Xv_tr, Xs_tr, y_tr = seq_pairs_for_split(train_ids)
if 'y_val' not in globals() or 'Xv_val' not in globals() or 'Xs_val' not in globals():
  Xv_val, Xs_val, y_val = seq_pairs_for_split(val_ids)

n_tr = int(y_tr.shape[0]) if hasattr(y_tr, 'shape') else len(y_tr)
n_val = int(y_val.shape[0]) if hasattr(y_val, 'shape') else len(y_val)
print(f"Train samples: {n_tr} | Val samples: {n_val}")
print("Train dtypes/shapes:", Xv_tr.dtype, Xs_tr.dtype, y_tr.dtype, Xv_tr.shape, Xs_tr.shape, y_tr.shape)
print("Val dtypes/shapes:", Xv_val.dtype, Xs_val.dtype, y_val.dtype, Xv_val.shape, Xs_val.shape, y_val.shape)

# Input pipelines: training is shuffled with a buffer of at most 10000
# samples; validation keeps its order and is skipped when there are no samples.
ds_train = tf.data.Dataset.from_tensor_slices(((Xv_tr, Xs_tr), y_tr)).shuffle(min(n_tr, 10000)).batch(bs).prefetch(tf.data.AUTOTUNE)
ds_val = tf.data.Dataset.from_tensor_slices(((Xv_val, Xs_val), y_val)).batch(bs).prefetch(tf.data.AUTOTUNE) if n_val>0 else None

os.makedirs('checkpoints', exist_ok=True)
# save_best_only needs a metric that is actually logged. ModelCheckpoint
# monitors 'val_loss' by default, which does not exist without validation
# data, so fall back to the training loss in that case.
_ckpt_monitor = 'val_loss' if n_val>0 else 'loss'
callbacks = [tf.keras.callbacks.ModelCheckpoint(os.path.join('checkpoints','best.h5'), monitor=_ckpt_monitor, save_best_only=True),
             tf.keras.callbacks.CSVLogger(os.path.join('checkpoints','training_log.csv'), append=False)]
class LrTracker(tf.keras.callbacks.Callback):
  """Callback that prints the current learning rate after every epoch and
  stores it in the epoch logs under the key 'lr'."""

  def _read_lr(self):
    """Return the optimizer's current learning rate as a Python float."""
    lr_source = self.model.optimizer.learning_rate
    try:
      if not isinstance(lr_source, tf.keras.optimizers.schedules.LearningRateSchedule):
        # Plain value or variable
        return float(tf.keras.backend.get_value(lr_source))
      # Schedule object: evaluate it at the optimizer's current step
      current_step = tf.cast(self.model.optimizer.iterations, tf.float32)
      return float(lr_source(current_step).numpy())
    except Exception:
      # Fall back to the optimizer's 'lr' attribute, or 0.0 if it has none
      return float(getattr(self.model.optimizer, 'lr', 0.0))

  def on_epoch_end(self, epoch, logs=None):
    lr_val = self._read_lr()
    print(f"Epoch {epoch+1}: lr={lr_val:.6f}")
    if logs is not None:
      logs['lr'] = lr_val
# Register the LR tracker ahead of CSVLogger: the tracker writes 'lr' into the
# epoch logs, and a callback listed before it would not see that entry, which
# is why appending it at the end kept 'lr' out of training_log.csv.
_csv_pos = next(
  (i for i, cb in enumerate(callbacks) if isinstance(cb, tf.keras.callbacks.CSVLogger)),
  len(callbacks),
)
callbacks.insert(_csv_pos, LrTracker())

# Early stopping watches the validation loss when validation data exists and
# the training loss otherwise.
if n_val>0:
  callbacks.insert(0, tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True))
  val_data=ds_val
else:
  callbacks.insert(0, tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, restore_best_weights=True))
  val_data=None

# NOTE: Remove ReduceLROnPlateau since we use a per-step schedule
def _print_perplexity(epoch, logs):
  """Print exp(loss), and exp(val_loss) when present, after each epoch."""
  line = f"Epoch {epoch+1}: perp={np.exp(logs['loss']):.3f}"
  if 'val_loss' in logs and logs['val_loss'] is not None:
    line += f", val_perp={np.exp(logs.get('val_loss', np.nan)):.3f}"
  print(line)

callbacks.append(tf.keras.callbacks.LambdaCallback(on_epoch_end=_print_perplexity))

if n_tr==0:
  raise ValueError("Không có mẫu huấn luyện nào (train samples = 0). Hãy chắc thư mục processed có features và annotations.")
h = model.fit(ds_train, validation_data=val_data, epochs=epochs, callbacks=callbacks, verbose=1)
model.save(os.path.join('checkpoints','final.h5'))

Train samples: 453665 | Val samples: 96329
Train dtypes/shapes: float32 int32 int32 (453665, 768) (453665, 47) (453665,)
Val dtypes/shapes: float32 int32 int32 (96329, 768) (96329, 47) (96329,)
Epoch 1/30


I0000 00:00:1762092982.440739      97 service.cc:148] XLA service 0x7ca8f0008450 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1762092982.441471      97 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1762092985.015503      97 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m   1/7089[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m67:47:58[0m 34s/step - loss: 9.2467 - sparse_categorical_accuracy: 0.0000e+00 - sparse_top_k_categorical_accuracy: 0.0000e+00

I0000 00:00:1762092997.556430      97 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m7089/7089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 6.6398 - sparse_categorical_accuracy: 0.1717 - sparse_top_k_categorical_accuracy: 0.3237Epoch 1: lr=0.000100
Epoch 1: perp=183.556, val_perp=68.553
[1m7089/7089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 13ms/step - loss: 6.6396 - sparse_categorical_accuracy: 0.1718 - sparse_top_k_categorical_accuracy: 0.3237 - val_loss: 4.2276 - val_sparse_categorical_accuracy: 0.3623 - val_sparse_top_k_categorical_accuracy: 0.5258 - lr: 1.0000e-04
Epoch 2/30
[1m7073/7089[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - loss: 3.7321 - sparse_categorical_accuracy: 0.3825 - sparse_top_k_categorical_accuracy: 0.5736Epoch 2: lr=0.000200
Epoch 2: perp=33.667, val_perp=44.791
[1m7089/7089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 9ms/step - loss: 3.7315 - sparse_categorical_accuracy: 0.3825 - sparse_top_k_categorical_accuracy: 0.5737 - val_loss: 3.8020 - val_sparse_categorical_a

In [8]:
# %% Cell 6: Save training plots
os.makedirs('training_plots', exist_ok=True)
hist=h.history

def _save_curve_figure(series, ylabel, filename):
  """Draw (values, label) pairs on one 7x4 figure and save it in training_plots/."""
  fig, ax = plt.subplots(figsize=(7,4))
  for values, label in series:
    ax.plot(values, label=label)
  ax.set_xlabel('Epoch'); ax.set_ylabel(ylabel); ax.legend()
  fig.tight_layout()
  fig.savefig(os.path.join('training_plots', filename))
  plt.close(fig)

# Loss
_loss_series = [(hist['loss'], 'loss')]
if 'val_loss' in hist:
  _loss_series.append((hist['val_loss'], 'val_loss'))
_save_curve_figure(_loss_series, 'Loss', 'loss_curves.png')

# Accuracy (top-1 and top-5); validation curves only when they were recorded
_acc_series = [(hist.get('sparse_categorical_accuracy', []), 'acc')]
for _key, _label in (
  ('val_sparse_categorical_accuracy', 'val_acc'),
  ('sparse_top_k_categorical_accuracy', 'top5_acc'),
  ('val_sparse_top_k_categorical_accuracy', 'val_top5_acc'),
):
  if _key in hist:
    _acc_series.append((hist[_key], _label))
_save_curve_figure(_acc_series, 'Accuracy', 'accuracy_curves.png')

# Perplexity = exp(loss)
_perp_series = [(np.exp(hist['loss']), 'perplexity')]
if 'val_loss' in hist:
  _perp_series.append((np.exp(hist['val_loss']), 'val_perplexity'))
_save_curve_figure(_perp_series, 'Perplexity', 'perplexity_lr.png')

In [9]:
!pip install -U "pyarrow>=21.0.0" "pydantic<2.12,>=2.0"
!pip uninstall -y libcugraph-cu12 pylibcugraph-cu12 libraft-cu12 pylibraft-cu12 rmm-cu12
!pip install -U open_clip_torch transformers

Collecting pyarrow>=21.0.0
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting pydantic<2.12,>=2.0
  Downloading pydantic-2.11.10-py3-none-any.whl.metadata (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.6/68.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic-core==2.33.2 (from pydantic<2.12,>=2.0)
  Downloading pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pydantic-2.11.10-py3-none-any.whl (444 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m444.8/444.8 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB

In [15]:
# %% Cell 7: Decode và đánh giá (BLEU/METEOR)
# Reverse vocabulary (token id -> word) used to turn decoded ids back into
# text; id 0 is labelled as the padding token.
idx2word = dict((token_id, word) for word, token_id in Tok.word_index.items())
idx2word.update({0: '<pad>'})


def clean_text(t):
    """Normalize a caption for scoring.

    Lowercases the text, replaces every run of characters other than a-z,
    0-9 and space with a single space, trims both ends and collapses repeated
    whitespace.
    """
    lowered = t.lower()
    only_alnum = re.sub(r"[^a-z0-9 ]+", " ", lowered).strip()
    return re.sub(r"\s+", " ", only_alnum)

def beam_search_decode(vf, beam_size=5, max_len=max_len):
    """Decode one caption for a video feature with beam search.

    Args:
        vf: video feature; flattened to a 1-D float32 vector before use, as
            is done when the training pairs are built.
        beam_size: number of hypotheses kept after every step.
        max_len: padded input length fed to the model; also bounds the
            hypothesis length (start token included).

    Returns:
        str: words of the chosen hypothesis joined by spaces, without the
        start/end markers. The best-scoring finished hypothesis is preferred;
        if none finished, the best-scoring unfinished one is used.
    """
    vf = np.asarray(vf, dtype=np.float32).reshape(-1)
    beams = [([start_idx], 0.0)]  # (sequence, logprob)
    for _ in range(max_len - 1):
        # Score all unfinished beams in ONE forward pass instead of one
        # model.predict call per beam.
        live = [i for i, (seq, _sc) in enumerate(beams) if seq[-1] != end_idx]
        probs_by_beam = {}
        if live:
            in_seqs = pad_sequences([beams[i][0] for i in live], maxlen=max_len, padding='post')
            feats = np.repeat(vf[np.newaxis, :], len(live), axis=0)
            logits = model.predict([feats, in_seqs], verbose=0)
            probs_by_beam = dict(zip(live, tf.nn.softmax(logits, axis=-1).numpy()))

        new_beams = []
        for i, (seq, score) in enumerate(beams):
            probs = probs_by_beam.get(i)
            if probs is None:
                # Finished hypothesis: carried over unchanged
                new_beams.append((seq, score))
                continue
            k = min(beam_size, probs.shape[0])
            # top-k ids sorted by prob desc
            topk_ids = np.argpartition(probs, -k)[-k:]
            topk_ids = topk_ids[np.argsort(probs[topk_ids])[::-1]]
            for tid in topk_ids:
                tid = int(tid)
                # 1e-9 guards against log(0)
                new_beams.append((seq + [tid], score + float(np.log(probs[tid] + 1e-9))))
        # prune
        new_beams.sort(key=lambda x: x[1], reverse=True)
        beams = new_beams[:beam_size]
        if all(s[-1] == end_idx for s, _ in beams):
            break
    # choose best ended if available
    ended = [(s, sc) for s, sc in beams if s[-1] == end_idx]
    best_seq = (ended[0][0] if ended else beams[0][0])
    tokens = []
    for tid in best_seq[1:]:
        if tid == end_idx:
            break
        tokens.append(idx2word.get(int(tid), ''))
    return " ".join(tokens).strip()

# ============== CLIP rerank (đơn giản, không thêm length/repetition/no_repeat/min_len) ==============
_clip_cache = {}

def _ensure_clip_model_for_dim(feat_dim):
    """Load (once) a CLIP text encoder and return the shared cache describing it.

    Backends are tried in order: open_clip, HuggingFace transformers, then the
    ``clip`` package from OpenAI. The ViT-L/14 336px variant is chosen when
    ``feat_dim`` is at least 768, ViT-B/32 otherwise.

    The outcome is memoized in the module-level ``_clip_cache``: a successful
    load is reused by later calls, and a failed attempt is remembered so the
    backends are not retried for every caption. Call ``_clip_cache.clear()``
    to force a new attempt.

    Args:
        feat_dim: dimensionality of the video feature the text embeddings
            will be compared with.

    Returns:
        dict: ``_clip_cache`` with the keys ``backend``, ``device``,
        ``model_name`` plus the backend-specific entries (``model`` and
        ``tokenize``, or ``hf_model`` and ``hf_tokenizer``); an empty dict
        when no backend could be loaded.
    """
    # A populated 'backend' key means an earlier call succeeded. Testing for
    # 'model' alone would miss the HuggingFace backend, which stores
    # 'hf_model', and reload it on every call.
    if _clip_cache.get('backend') is not None:
        return _clip_cache
    if _clip_cache.get('unavailable'):
        return {}

    try:
        use_large = int(feat_dim) >= 768
    except (TypeError, ValueError):
        return {}

    try:
        import torch
        if torch.cuda.is_available():
            dev = 'cuda'
        elif getattr(torch.backends, 'mps', None) and torch.backends.mps.is_available():
            dev = 'mps'
        else:
            dev = 'cpu'
    except Exception:
        dev = 'cpu'

    def _load_open_clip():
        import open_clip
        # Align with preprocessing: ViT-L-14-336 + openai weights when feat_dim is 768 or more
        model_name = 'ViT-L-14-336' if use_large else 'ViT-B-32'
        model, _, _preprocess = open_clip.create_model_and_transforms(model_name, pretrained='openai', force_quick_gelu=True)
        model = model.to(dev)
        # The model is only used for inference here
        model.eval()
        return {'model': model, 'tokenize': open_clip.get_tokenizer(model_name), 'model_name': model_name}

    def _load_hf():
        from transformers import CLIPModel, CLIPTokenizer
        cache_dir = os.path.join('checkpoints', 'hf_cache')
        os.makedirs(cache_dir, exist_ok=True)
        hf_name = 'openai/clip-vit-large-patch14-336' if use_large else 'openai/clip-vit-base-patch32'
        hf_tok = CLIPTokenizer.from_pretrained(hf_name, cache_dir=cache_dir)
        hf_mod = CLIPModel.from_pretrained(hf_name, cache_dir=cache_dir).to(dev)
        return {'hf_model': hf_mod, 'hf_tokenizer': hf_tok, 'model_name': hf_name}

    def _load_openai_clip():
        import clip as openai_clip
        # Same 336px checkpoint as the two other backends
        model_name = 'ViT-L/14@336px' if use_large else 'ViT-B/32'
        model, _ = openai_clip.load(model_name, device=dev)
        return {'model': model, 'tokenize': openai_clip.tokenize, 'model_name': model_name}

    errors = {}
    loaders = (('open_clip', _load_open_clip), ('hf', _load_hf), ('openai_clip', _load_openai_clip))
    for backend, loader in loaders:
        try:
            entries = loader()
        except Exception as err:
            # Remember why this backend was skipped, then try the next one
            errors[backend] = repr(err)
            continue
        _clip_cache.clear()
        _clip_cache.update({'backend': backend, 'device': dev})
        _clip_cache.update(entries)
        return _clip_cache

    # Every backend failed: record it so later calls return immediately.
    _clip_cache.clear()
    _clip_cache.update({'unavailable': True, 'errors': errors})
    print('CLIP reranker unavailable:', errors)
    return {}

def beam_search_candidates_simple(vf, beam_size=5, max_len=max_len, top_k=5):
    """Generate candidate captions with plain beam search (no penalties).

    Args:
        vf: video feature; flattened to a 1-D float32 vector before use, as
            is done when the training pairs are built.
        beam_size: number of hypotheses kept after every step.
        max_len: padded input length fed to the model; also bounds the
            hypothesis length (start token included).
        top_k: maximum number of candidates returned.

    Returns:
        list[tuple[str, float]]: up to ``top_k`` distinct, non-empty
        ``(caption, logprob)`` pairs ordered by descending log-probability.
    """
    vf = np.asarray(vf, dtype=np.float32).reshape(-1)
    beams = [([start_idx], 0.0)]
    for _ in range(max_len - 1):
        # Score all unfinished beams in ONE forward pass instead of one
        # model.predict call per beam.
        live = [i for i, (seq, _sc) in enumerate(beams) if seq[-1] != end_idx]
        probs_by_beam = {}
        if live:
            in_seqs = pad_sequences([beams[i][0] for i in live], maxlen=max_len, padding='post')
            feats = np.repeat(vf[np.newaxis, :], len(live), axis=0)
            logits = model.predict([feats, in_seqs], verbose=0)
            probs_by_beam = dict(zip(live, tf.nn.softmax(logits, axis=-1).numpy()))

        new_beams = []
        for i, (seq, score) in enumerate(beams):
            probs = probs_by_beam.get(i)
            if probs is None:
                # Finished hypothesis: carried over unchanged
                new_beams.append((seq, score))
                continue
            k = min(beam_size, probs.shape[0])
            topk_ids = np.argpartition(probs, -k)[-k:]
            topk_ids = topk_ids[np.argsort(probs[topk_ids])[::-1]]
            for tid in topk_ids:
                tid = int(tid)
                # 1e-9 guards against log(0)
                new_beams.append((seq + [tid], score + float(np.log(probs[tid] + 1e-9))))
        new_beams.sort(key=lambda x: x[1], reverse=True)
        beams = new_beams[:beam_size]
        if all(s[-1] == end_idx for s, _ in beams):
            break
    beams.sort(key=lambda x: x[1], reverse=True)
    cands = []
    seen = set()
    for seq, sc in beams:
        tokens = []
        for tid in seq[1:]:
            if tid == end_idx:
                break
            tokens.append(idx2word.get(int(tid), ''))
        cap = " ".join(tokens).strip()
        # Keep each distinct non-empty caption once
        if cap and cap not in seen:
            cands.append((cap, sc))
            seen.add(cap)
        if len(cands) >= top_k:
            break
    return cands

def clip_rerank_captions(vf, captions):
    """Pick the caption whose CLIP text embedding is most similar to ``vf``.

    Similarity is the cosine between the L2-normalized video feature and the
    L2-normalized text embedding of each caption.

    Args:
        vf: video feature (flattened to 1-D). Presumably a CLIP image
            embedding in the same space as the text encoder's output -
            TODO confirm against the preprocessing step.
        captions: list of candidate caption strings.

    Returns:
        str | None: the best-matching caption, or None when there are no
        captions, CLIP could not be loaded, the text and video dimensions
        differ, or any error occurs.
    """
    try:
        if not captions:
            return None
        vf = np.asarray(vf, dtype=np.float32).reshape(-1)
        cache = _ensure_clip_model_for_dim(vf.shape[0])
        if not cache:
            return None
        import torch
        dev = cache.get('device', 'cpu')
        backend = cache.get('backend')

        # Normalize the video feature
        video_vec = torch.tensor(vf, dtype=torch.float32, device=dev)
        video_vec = video_vec / (video_vec.norm() + 1e-9)

        # Only tokenization and the encoder call differ between backends.
        with torch.no_grad():
            if backend in ('open_clip', 'openai_clip'):
                tokens = cache['tokenize'](captions).to(dev)
                text_feats = cache['model'].encode_text(tokens)
            elif backend == 'hf':
                inputs = cache['hf_tokenizer'](captions, padding=True, return_tensors='pt')
                inputs = {name: tensor.to(dev) for name, tensor in inputs.items()}
                text_feats = cache['hf_model'].get_text_features(**inputs)
            else:
                return None
        # Compare in float32 whatever precision the encoder returned
        text_feats = text_feats.float()
        text_feats = text_feats / (text_feats.norm(dim=-1, keepdim=True) + 1e-9)

        # One-time diagnostic line
        if not _clip_cache.get('logged'):
            try:
                print(f"CLIP reranker init: backend={backend} model={cache.get('model_name')} device={dev} text_dim={text_feats.shape[-1]} video_dim={video_vec.shape[0]} match={text_feats.shape[-1]==video_vec.shape[0]}")
            except Exception:
                pass
            _clip_cache['logged'] = True

        # Only compare when embedding dimensions match to avoid space misalignment
        if text_feats.shape[-1] != video_vec.shape[0]:
            return None
        scores = (text_feats * video_vec.unsqueeze(0)).sum(dim=-1)
        best = int(torch.argmax(scores).item())
        return captions[best]
    except Exception:
        return None

def decode_with_clip_rerank(vf, beam_size=5, top_k=5):
    """Generate candidates with plain beam search and let CLIP pick the best.

    Falls back to ``beam_search_decode`` when no candidate was produced, and
    to the first candidate when the reranker does not return a non-empty
    string.
    """
    candidates = beam_search_candidates_simple(vf, beam_size=beam_size, top_k=max(beam_size, top_k))
    if not candidates:
        return beam_search_decode(vf, beam_size=beam_size)
    texts = [text for text, _score in candidates]
    chosen = clip_rerank_captions(vf, texts)
    if isinstance(chosen, str) and len(chosen) > 0:
        return chosen
    return texts[0]

def _score_split(video_ids, split_dir, bleu_fn, meteor_fn):
    """Decode every usable video of one split and score it against its references.

    A video is skipped when its feature file is missing, when its flattened
    feature does not have ``feat_dim`` entries, or when it has no reference
    captions. References are checked BEFORE decoding so that the reference
    and hypothesis lists always stay aligned.

    Args:
        video_ids: ids of the videos in the split.
        split_dir: sub-folder of PROC_DIR/features holding the split's .npy files.
        bleu_fn: corpus-level BLEU function, called as ``bleu_fn(refs, hyps, weights=...)``.
        meteor_fn: sentence-level METEOR function, called as ``meteor_fn(refs, hyp)``.

    Returns:
        tuple ``(bleu4, meteor, refs, hyps, meteors)``: corpus BLEU-4, mean
        METEOR, and the per-video reference / hypothesis token lists and
        METEOR scores they were computed from.
    """
    refs, hyps, meteors = [], [], []
    for vid in video_ids:
        vpath = os.path.join(PROC_DIR, 'features', split_dir, f"{vid}.npy")
        if not os.path.exists(vpath):
            continue
        ref_tokens = [clean_text(c).split() for c in annotations.get(vid, [])]
        if not ref_tokens:
            continue
        vf = np.load(vpath)
        # Same flattening and dimension check as when the training pairs are built
        if vf.ndim > 1:
            vf = vf.reshape(-1)
        if vf.shape[0] != feat_dim:
            continue
        hyp_tokens = decode_with_clip_rerank(vf, beam_size=5, top_k=5).split()
        refs.append(ref_tokens)
        hyps.append(hyp_tokens)
        meteors.append(meteor_fn(ref_tokens, hyp_tokens))
    bleu4 = bleu_fn(refs, hyps, weights=(0.25, 0.25, 0.25, 0.25)) if refs and hyps else 0.0
    meteor = float(np.mean(meteors)) if meteors else 0.0
    return bleu4, meteor, refs, hyps, meteors

try:
    import nltk
    from nltk.translate.bleu_score import corpus_bleu
    from nltk.translate.meteor_score import meteor_score
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)

    # Validation set
    bleu4_val, meteor_val, refs_val, hyps_val, meteor_vals = _score_split(val_ids, 'val', corpus_bleu, meteor_score)
    print(f"BLEU-4 (val): {bleu4_val:.4f} | METEOR (val): {meteor_val:.4f}")

    # Test set
    bleu4_test, meteor_test, refs_test, hyps_test, meteor_tests = _score_split(test_ids, 'test', corpus_bleu, meteor_score)
    print(f"BLEU-4 (test): {bleu4_test:.4f} | METEOR (test): {meteor_test:.4f}")

    # Persist the four headline numbers next to the checkpoints
    os.makedirs('checkpoints', exist_ok=True)
    with open(os.path.join('checkpoints', 'eval_metrics.json'), 'w') as f:
        json.dump({
            'bleu4_val': float(bleu4_val),
            'meteor_val': float(meteor_val),
            'bleu4_test': float(bleu4_test),
            'meteor_test': float(meteor_test)
        }, f, indent=2)
except Exception as e:
    print('BLEU/METEOR eval skipped:', e)

CLIP reranker init: backend=open_clip model=ViT-L-14-336 device=cuda text_dim=768 video_dim=768 match=True
BLEU-4 (val): 0.4724 | METEOR (val): 0.7018
BLEU-4 (test): 0.5163 | METEOR (test): 0.7029


In [16]:
# %% Cell 8: Sample predictions on TEST (hiển thị video)
import random as _rnd

def _resolve_video_path(vid):
    """Try to find the .avi file that belongs to a video id.

    Search order:
      0. Kaggle input, if /kaggle/input exists: the exact file under
         msvd-clips/YouTubeClips, then any ``<vid>*.avi`` below msvd-clips,
         then any ``<vid>*.avi`` below any attached dataset;
      1. ``data/videos/<vid>.avi``, then ``data/videos/<vid>*.avi``;
      2. ``data/Test_Videos_New/**/<vid>*.avi`` (recursive).

    The id is glob-escaped so characters such as ``[`` or ``*`` match
    literally, and glob results are sorted so the same file is picked on
    every run.

    Args:
        vid: video id (file name without extension).

    Returns:
        str | None: path of the first match, or None when nothing was found.
    """
    import glob as _glob
    pattern = f"{_glob.escape(str(vid))}*.avi"

    def _first_match(*parts, recursive=False):
        hits = sorted(_glob.glob(os.path.join(*parts, pattern), recursive=recursive))
        return hits[0] if hits else None

    # 0) Kaggle input
    try:
        kaggle_root = '/kaggle/input'
        if os.path.exists(kaggle_root):
            # commonly used dataset layout: msvd-clips/YouTubeClips
            p0 = os.path.join(kaggle_root, 'msvd-clips', 'YouTubeClips', f"{vid}.avi")
            if os.path.exists(p0):
                return p0
            # then anywhere inside msvd-clips, then any dataset under /kaggle/input
            hit = (_first_match(kaggle_root, 'msvd-clips', '**', recursive=True)
                   or _first_match(kaggle_root, '**', recursive=True))
            if hit:
                return hit
    except Exception:
        pass
    # 1) data/videos: exact name first
    p1 = os.path.join('data', 'videos', f"{vid}.avi")
    if os.path.exists(p1):
        return p1
    # 1b) any suffix, then 2) Test_Videos_New (recursive search)
    return (_first_match('data', 'videos')
            or _first_match('data', 'Test_Videos_New', '**', recursive=True))

def _ensure_playable_video(vpath):
    """Return a path that can be played inline, converting to .mp4 if needed.

    Files ending in .mp4, .webm, .ogg or .mov are returned unchanged. Anything
    else is converted to H.264/AAC .mp4 inside ``converted_videos`` (under
    /kaggle/working when it exists, otherwise under ``checkpoints``), using
    the ffmpeg executable when available and moviepy as a fallback. A
    previously converted, non-empty file is reused. An output left behind by a
    failed conversion is deleted so it is never mistaken for a playable file.

    Args:
        vpath: path of the source video.

    Returns:
        str | None: a playable path, or None when conversion was not possible.
    """
    ext = os.path.splitext(vpath)[1].lower()
    if ext in ['.mp4', '.webm', '.ogg', '.mov']:
        return vpath
    # Convert to .mp4 when the file is .avi or another unsupported format
    try:
        import subprocess, shutil
        base = os.path.splitext(os.path.basename(vpath))[0] + '.mp4'
        work_root = '/kaggle/working' if os.path.exists('/kaggle/working') else 'checkpoints'
        out_dir = os.path.join(work_root, 'converted_videos')
        os.makedirs(out_dir, exist_ok=True)
        out_path = os.path.join(out_dir, base)

        def _converted_ok():
            return os.path.exists(out_path) and os.path.getsize(out_path) > 0

        def _discard_partial():
            # Remove whatever a failed conversion left behind
            try:
                if os.path.exists(out_path):
                    os.remove(out_path)
            except OSError:
                pass

        if _converted_ok():
            return out_path
        # Prefer the ffmpeg executable when it is installed
        ffmpeg_path = shutil.which('ffmpeg')
        if ffmpeg_path:
            cmd = [
                ffmpeg_path, '-y', '-i', vpath,
                '-c:v', 'libx264', '-preset', 'veryfast', '-crf', '28',
                '-pix_fmt', 'yuv420p',
                '-c:a', 'aac', '-movflags', '+faststart', out_path
            ]
            try:
                subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                if _converted_ok():
                    return out_path
            except Exception:
                pass
            _discard_partial()
        # Fallback: moviepy (needs an ffmpeg backend)
        try:
            from moviepy.editor import VideoFileClip
            clip = VideoFileClip(vpath)
            try:
                clip.write_videofile(out_path, codec='libx264', audio_codec='aac', preset='ultrafast', verbose=False, logger=None)
            finally:
                # Release the reader even when writing fails
                clip.close()
            if _converted_ok():
                return out_path
        except Exception:
            pass
        _discard_partial()
        return None
    except Exception:
        return None

def _display_video_or_frames(vpath, max_frames=9):
    """Show a video inline when possible, otherwise show a grid of its frames.

    The video is first passed through ``_ensure_playable_video``; if that
    yields a path and IPython is available, the clip is embedded in the
    output. Otherwise up to ``max_frames`` evenly spaced frames are read
    (imageio first, OpenCV as a fallback) and drawn as a matplotlib grid.

    Args:
        vpath: Path to the video file.
        max_frames: Maximum number of frames to draw in the fallback grid.

    Returns:
        True if either the video or a frame grid was displayed, else False.
    """
    try:
        from IPython.display import Video, display
    except Exception:
        Video = None
        display = None
    # Get a browser-playable path (converted if necessary).
    playable = _ensure_playable_video(vpath)
    if Video is not None and playable:
        try:
            display(Video(playable, embed=True))
            return True
        except Exception as e:
            # Report why inline playback failed, then fall back to frames.
            print(f"Error display video: {e}")

    # Try each backend in turn; a backend that is missing, raises, or yields
    # no frames hands over to the next one.
    frames = []
    for read_frames in (_read_frames_imageio, _read_frames_cv2):
        try:
            frames = read_frames(vpath, max_frames)
        except Exception:
            frames = []
        if frames:
            break
    if len(frames) == 0:
        print("⚠️ Không đọc được khung hình để hiển thị.")
        return False
    try:
        _plot_frame_grid(frames, os.path.basename(vpath))
        return True
    except Exception as e:
        print(f"⚠️ Không thể hiển thị khung hình: {e}")
        return False


def _sample_frame_indices(total, max_frames):
    """Return at most ``max_frames`` distinct, evenly spaced indices in [0, total)."""
    if total <= 0 or max_frames <= 0:
        return []
    if total <= max_frames:
        # Short clip: use every frame once instead of repeating indices.
        return list(range(total))
    return [int(i) for i in np.linspace(0, total - 1, num=max_frames, dtype=int)]


def _read_frames_imageio(vpath, max_frames):
    """Read up to ``max_frames`` frames with imageio; raises if it cannot open the file."""
    import imageio.v2 as imageio
    reader = imageio.get_reader(vpath)
    frames = []
    try:
        n = int(reader.count_frames())
        if n <= 0:
            # Frame count unknown: read sequentially up to the limit.
            for frame in reader:
                if len(frames) >= max_frames:
                    break
                frames.append(frame)
        else:
            for idx in _sample_frame_indices(n, max_frames):
                try:
                    frames.append(reader.get_data(idx))
                except Exception:
                    # Skip individual frames that cannot be decoded.
                    pass
    finally:
        reader.close()
    return frames


def _read_frames_cv2(vpath, max_frames):
    """Read up to ``max_frames`` frames with OpenCV, converted from BGR to RGB."""
    import cv2
    cap = cv2.VideoCapture(vpath)
    frames = []
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total > 0:
            for idx in _sample_frame_indices(total, max_frames):
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ok, frame = cap.read()
                if ok and frame is not None:
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        else:
            # Frame count unavailable: read sequentially.
            while len(frames) < max_frames:
                ok, frame = cap.read()
                if not ok or frame is None:
                    break
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        cap.release()
    return frames


def _plot_frame_grid(frames, title):
    """Draw ``frames`` in a near-square grid with ``title`` as the figure title."""
    import matplotlib.pyplot as plt
    cols = int(np.ceil(np.sqrt(len(frames))))
    rows = int(np.ceil(len(frames) / cols))
    fig = plt.figure(figsize=(cols * 3, rows * 3))
    for i, frame in enumerate(frames):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.imshow(frame)
        ax.axis('off')
    fig.suptitle(title)
    fig.tight_layout()
    plt.show()
    # This runs once per sample in a loop; close the figure so they do not pile up.
    plt.close(fig)

def show_test_samples(num_samples=5):
    """Decode and display a few random test-set examples: video, caption, BLEU-4/METEOR.

    Uses names defined elsewhere in the notebook: ``PROC_DIR``, ``test_ids``,
    ``annotations``, ``clean_text``, ``decode_with_clip_rerank``,
    ``_resolve_video_path``, ``_display_video_or_frames``, ``_rnd`` and,
    optionally, ``meteor_score``.

    Only test videos whose feature file exists under
    ``PROC_DIR/features/test`` are considered. The candidates are shuffled with
    ``_rnd`` and the first ``num_samples`` are decoded. A sample that raises is
    reported and skipped.

    Args:
        num_samples: Maximum number of test videos to show.

    Returns:
        A list of dicts with keys ``video_id``, ``path``, ``pred``, ``refs``,
        ``bleu4`` and ``meteor`` (a metric is None when it is unavailable or
        the video has no reference captions). The same list is written to
        ``checkpoints/test_samples.json``, except when no test features exist,
        in which case an empty list is returned and nothing is written.
    """
    try:
        from nltk.translate.bleu_score import sentence_bleu
    except Exception:
        sentence_bleu = None
    # METEOR is optional: use it only if the notebook defined it.
    meteor_fn = globals().get('meteor_score')

    features_dir = os.path.join(PROC_DIR, 'features', 'test')

    def _feature_path(vid):
        return os.path.join(features_dir, f"{vid}.npy")

    # Map video_id -> path when metadata is available.
    video_paths_map = {}
    meta_path = os.path.join(PROC_DIR, 'test_metadata.json')
    if os.path.exists(meta_path):
        try:
            with open(meta_path) as fh:
                tm = json.load(fh)
            video_paths_map = dict(zip(tm.get('video_ids', []), tm.get('video_paths', [])))
        except Exception as e:
            print('⚠️  Không đọc được test_metadata.json:', e)

    # Keep only videos whose features were extracted.
    candidates = [vid for vid in test_ids if os.path.exists(_feature_path(vid))]
    if len(candidates) == 0:
        print("❌ Không có features cho test để hiển thị mẫu.")
        return []
    _rnd.shuffle(candidates)
    # Clamp so a negative count selects nothing instead of slicing from the end.
    picked = candidates[:max(0, num_samples)]
    results = []
    print("\n=== Sample predictions on TEST ===")
    for vid in picked:
        try:
            vf = np.load(_feature_path(vid))
            hyp = decode_with_clip_rerank(vf, beam_size=5, top_k=5)
            ref_list = annotations.get(vid, [])
            ref_tokens = [clean_text(c).split() for c in ref_list]
            hyp_tokens = hyp.split()
            # Metrics need at least one reference; without this guard the NLTK
            # call raises and the whole sample would be skipped.
            # NOTE(review): BLEU is computed without a smoothing function, so a
            # hypothesis with no matching higher-order n-gram scores ~0 -
            # confirm this is the intended per-sentence metric.
            bleu = None
            meteor = None
            if ref_tokens:
                if sentence_bleu:
                    bleu = sentence_bleu(ref_tokens, hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25))
                if meteor_fn is not None:
                    meteor = meteor_fn(ref_tokens, hyp_tokens)
            print(f"- Video: {vid}")

            # Prefer the metadata path, then try to resolve it ourselves.
            vpath = video_paths_map.get(vid)
            if not vpath or not os.path.exists(vpath):
                vpath = _resolve_video_path(vid)

            if vpath and os.path.exists(vpath):
                print(f"  Path: {vpath}")
                _display_video_or_frames(vpath)
            else:
                print("  ⚠️ Không tìm thấy file video để hiển thị.")

            print(f"  Pred: {hyp}")
            if ref_list:
                print(f"  Ref[0]: {ref_list[0]}")
            if bleu is not None:
                print(f"  BLEU-4: {bleu:.4f}")
            if meteor is not None:
                print(f"  METEOR: {meteor:.4f}")
            results.append({
                'video_id': vid,
                'path': vpath,
                'pred': hyp,
                'refs': ref_list,
                'bleu4': float(bleu) if bleu is not None else None,
                'meteor': float(meteor) if meteor is not None else None
            })
        except Exception as e:
            print(f"  ⏭️  Bỏ qua {vid} do lỗi: {e}")
    os.makedirs('checkpoints', exist_ok=True)
    with open(os.path.join('checkpoints', 'test_samples.json'), 'w') as f:
        json.dump(results, f, indent=2)
    return results

# Run automatically to display a few samples. The result is bound to a named
# variable rather than `_`, because assigning `_` shadows IPython's
# last-output variable for the rest of the session.
NUM_TEST_SAMPLES_TO_SHOW = 10
test_sample_results = show_test_samples(num_samples=NUM_TEST_SAMPLES_TO_SHOW)


=== Sample predictions on TEST ===
- Video: m1NR0uNNs5Y_123_129
  Path: /kaggle/input/msvd-clips/YouTubeClips/m1NR0uNNs5Y_123_129.avi


  Pred: a woman is slicing a potato
  Ref[0]: someone is dicing an onion with a large knife
  BLEU-4: 0.5623
  METEOR: 0.7353
- Video: toE0QYZzJKE_1_8
  Path: /kaggle/input/msvd-clips/YouTubeClips/toE0QYZzJKE_1_8.avi


  Pred: a baby is crawling on a floor
  Ref[0]: a baby is cleaning
  BLEU-4: 0.0000
  METEOR: 0.5943
- Video: Z8rTzW9Gb6Y_116_123
  Path: /kaggle/input/msvd-clips/YouTubeClips/Z8rTzW9Gb6Y_116_123.avi


  Pred: a woman is putting on a cigarette
  Ref[0]: a woman is eating a piece of fried chicken
  BLEU-4: 0.0000
  METEOR: 0.6148
- Video: rQuNYxNmA6M_0_4
  Path: /kaggle/input/msvd-clips/YouTubeClips/rQuNYxNmA6M_0_4.avi


  Pred: two men are playing rugby
  Ref[0]: a baseball player throwing ball
  BLEU-4: 0.6687
  METEOR: 0.7938
- Video: 5CS4nLI2ZX8_50_59
  Path: /kaggle/input/msvd-clips/YouTubeClips/5CS4nLI2ZX8_50_59.avi


  Pred: a man is driving a car
  Ref[0]: two elephants appear to be pushing a convertible with a woman in the driver s seat and a man either sitting on the top of the backseat or standing on the passenger side in the front holding onto the windshield
  BLEU-4: 0.0000
  METEOR: 0.8535
- Video: cnsjm3fNEec_4_10
  Path: /kaggle/input/msvd-clips/YouTubeClips/cnsjm3fNEec_4_10.avi


  Pred: a man is eating something
  Ref[0]: a man is stuffing all kinds of food into his mouth
  BLEU-4: 0.6687
  METEOR: 0.9680
- Video: m1NR0uNNs5Y_88_94
  Path: /kaggle/input/msvd-clips/YouTubeClips/m1NR0uNNs5Y_88_94.avi


  Pred: a woman is slicing a potato
  Ref[0]: a woman thinly slices an onion with a large knife
  BLEU-4: 0.5623
  METEOR: 0.7353
- Video: 05gNigkqfNU_11_23
  Path: /kaggle/input/msvd-clips/YouTubeClips/05gNigkqfNU_11_23.avi


  Pred: a person is slicing a potato
  Ref[0]: a person pealing a potato
  BLEU-4: 0.7071
  METEOR: 0.8067
- Video: MJNStEgDKXU_3_10
  Path: /kaggle/input/msvd-clips/YouTubeClips/MJNStEgDKXU_3_10.avi


  Pred: a dog is walking
  Ref[0]: the dog sat down and barked a welcome
  BLEU-4: 1.0000
  METEOR: 0.9922
- Video: jTaLGh_MKCM_5_20
  Path: /kaggle/input/msvd-clips/YouTubeClips/jTaLGh_MKCM_5_20.avi


  Pred: a man is singing into a microphone
  Ref[0]: a soccer player tells tips about his sport
  BLEU-4: 0.0000
  METEOR: 0.8721
