In [1]:
%env CUDA_VISIBLE_DEVICES 0

import json
import os
import random

from datasets import load_from_disk
import evaluate
import numpy as np
import torch
from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback

device = "cuda:0" if torch.cuda.is_available() else "cpu"
if device == "cuda:0":
    torch.backends.cuda.matmul.allow_tf32 = True
    n_gpus = torch.cuda.device_count()
    print("GPUs:")
    for i in range(n_gpus):
        print(torch.cuda.get_device_name(i))

env: CUDA_VISIBLE_DEVICES=0
GPUs:
NVIDIA RTX A6000


In [2]:
# TimeSformer
encoder = "facebook/timesformer-base-finetuned-k600"
decoder = "gpt2"

image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained(decoder)
tokenizer.pad_token = tokenizer.eos_token

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder, decoder).to(device)
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length = 50
model.config.num_beams = 4
model.config.early_stopping = True

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of the model checkpoint at facebook/timesformer-base-finetuned-k600 were not used when initializing TimesformerModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TimesformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TimesformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.2.ln_cross_attn.weight', 'h.11.crossattention.masked_bias', 'h.4.crossa

In [3]:
dataset = load_from_disk("dataset/processed/k600_16frames")
dataset.set_format("torch")

dataset["train"] = dataset["train"].select(np.arange(6))
dataset["validation"] = dataset["validation"].select(np.arange(6))

dataset

DatasetDict({
    train: Dataset({
        features: ['video_id', 'pixel_values', 'labels'],
        num_rows: 6
    })
    validation: Dataset({
        features: ['video_id', 'pixel_values', 'labels'],
        num_rows: 6
    })
})

In [4]:
output_dir="training/random_frames"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    tf32=True,
    predict_with_generate=True,
    load_best_model_at_end=True,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    dataloader_num_workers=8,
    num_train_epochs=100,
    learning_rate=5e-7,
)

def collator(examples):
    pixel_values, labels = [], []
    for example in examples:
        # train
        if len(example["pixel_values"]) == 16:
            frame_idxs = []
            for i in range(0, 16, 2):
                frame_idxs.append(i + random.randint(0, 1))
            pixel_values.append(example["pixel_values"][frame_idxs])
        # val
        else:
            pixel_values.append(example["pixel_values"])
        labels.append(example["labels"])

    pixel_values = torch.stack(pixel_values)
    labels = torch.stack(labels)
    return {"pixel_values": pixel_values, "labels": labels}

bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")
rouge = evaluate.load("rouge")
val_output = {}
with open("dataset/longestCaption_videoID.json") as file:
    longestCaption_videoID = json.load(file)
with open("dataset/videoID_captions.json") as file:
    videoID_captions = json.load(file)
    
def metrics(eval_predictions):
    predictions, labels = eval_predictions
    predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    references = []
    for i, label in enumerate(labels):
        video_id = longestCaption_videoID[label]
        references.append(videoID_captions[video_id])
        if video_id in val_output:
            val_output[video_id].append(predictions[i])
        else:
            val_output[video_id] = [predictions[i]]
            
    for reference in references:
        print(reference)
            
    bleu_scores = bleu.compute(predictions=predictions, references=references, smooth=True)
    meteor_scores = meteor.compute(predictions=predictions, references=references)
    rouge_scores = rouge.compute(predictions=predictions, references=references, rouge_types=['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)    
    return {"bleu": bleu_scores["bleu"], "meteor": meteor_scores["meteor"], "rouge1": rouge_scores["rouge1"], "rouge2": rouge_scores["rouge2"], "rougeL": rouge_scores["rougeL"]}

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=collator,
    compute_metrics=metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()

with open(os.path.join(output_dir, "val_output.json"), "w") as file:
    file.write(json.dumps(val_output))

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/922201615/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/922201615/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/922201615/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
The following columns in the training set don't have a corresponding argument in `VisionEncoderDecoderModel.forward` and have been ignored: video_id. If video_id are not expected by `VisionEncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6
  Num Epochs = 1
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 1
  Number of trainable parameters = 274065408


Epoch,Training Loss,Validation Loss,Bleu,Meteor,Rouge1,Rouge2,Rougel
1,7.0524,7.36459,0.010654,0.048474,0.005848,0.0,0.005848


The following columns in the evaluation set don't have a corresponding argument in `VisionEncoderDecoderModel.forward` and have been ignored: video_id. If video_id are not expected by `VisionEncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 6
  Batch size = 6
Generate config GenerationConfig {
  "decoder_start_token_id": 50256,
  "early_stopping": true,
  "max_length": 50,
  "num_beams": 4,
  "pad_token_id": 50256,
  "transformers_version": "4.26.1"
}



['A girl staring at the camera while she takes her hand and places her index finger on her nose', 'A woman is pressing her nose with her finger and then she smiles a few times.', 'A young woman stares into the camera smiling and pressing down her nose with one finger.', 'A girl is smiling and playing with her nose as music plays in the background.', 'A young girl smiles and presses the end of her nose with her index finger.', 'A girl holds her finger to her nose then puts it down as she smiles.', 'A girl listening to music and touching her nose and smiling.', 'Girl listens to music, show reaction, covers nostril, smile, uncovers nostril', 'A selfie video of young person sitting and listening to music.', 'A young asian girl is making faces into her phone.']
['A person with a faint moustache and thick black hair is fiddling with the quiff of his hair at the front while staring into a camera.', 'A young man is taking a selfie video of him playing with his black hair.', 'A man plays with h

Saving model checkpoint to training/random_frames/checkpoint-1
Configuration saved in training/random_frames/checkpoint-1/config.json
Configuration saved in training/random_frames/checkpoint-1/generation_config.json
Model weights saved in training/random_frames/checkpoint-1/pytorch_model.bin
tokenizer config file saved in training/random_frames/checkpoint-1/tokenizer_config.json
Special tokens file saved in training/random_frames/checkpoint-1/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from training/random_frames/checkpoint-1 (score: 7.364589691162109).
