In [1]:
import json
import os
import sys

from datasets import load_from_disk
import evaluate
import numpy as np
import torch
#from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator, EarlyStoppingCallback

sys.path.append("../transformers/src")
from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator, EarlyStoppingCallback

# Select the first CUDA device when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"

    # Allow TF32 for CUDA matrix multiplications.
    torch.backends.cuda.matmul.allow_tf32 = True

    # List every visible GPU, one name per line, under a "GPUs:" header.
    n_gpus = torch.cuda.device_count()
    print("GPUs:", *(torch.cuda.get_device_name(gpu_index) for gpu_index in range(n_gpus)), sep="\n")
else:
    device = "cpu"

GPUs:
NVIDIA RTX A6000
NVIDIA GeForce RTX 2080 Ti
NVIDIA GeForce RTX 2080 Ti
NVIDIA GeForce RTX 2080 Ti
NVIDIA GeForce RTX 2080 Ti


In [2]:
# TimeSformer encoder checkpoint combined with a GPT-2 decoder checkpoint.
encoder, decoder = "facebook/timesformer-base-finetuned-k600", "gpt2"

# The image processor is loaded from the VideoMAE checkpoint, not from `encoder`.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")

# Tokenizer for the decoder; the end-of-sequence token doubles as the pad token.
tokenizer = AutoTokenizer.from_pretrained(decoder)
tokenizer.pad_token = tokenizer.eos_token

# Extra keyword arguments forwarded to from_encoder_decoder_pretrained.
kwargs = dict(encoder_hidden_dropout_prob=0.25)

# Build the encoder-decoder model and move it to the selected device.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder, decoder, **kwargs)
model = model.to(device)

# Special-token ids taken from the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id

# Generation settings stored on the model config.
model.config.num_beams = 4
model.config.max_length = 50
model.config.early_stopping = True

# VideoMAE
# encoder = "MCG-NJU/videomae-base"
# decoder = "gpt2"

# image_processor = AutoImageProcessor.from_pretrained(encoder)
# tokenizer = AutoTokenizer.from_pretrained(decoder)
# model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder, decoder).to(device)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of the model checkpoint at facebook/timesformer-base-finetuned-k600 were not used when initializing TimesformerModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TimesformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TimesformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.11.crossattention.masked_bias', 'h.10.ln_cross_attn.weight', 'h.7.cross

In [3]:
dataset = load_from_disk("dataset/processed/k600")

# Keep only the first five examples of the train and validation splits.
first_five = np.arange(5)
for split_name in ("train", "validation"):
    dataset[split_name] = dataset[split_name].select(first_five)

dataset

DatasetDict({
    train: Dataset({
        features: ['videoID', 'pixel_values', 'labels'],
        num_rows: 5
    })
    validation: Dataset({
        features: ['videoID', 'pixel_values', 'labels'],
        num_rows: 5
    })
})

In [4]:
# Output containers: `train_output` is handed to the trainer and `val_output`
# is filled by `metrics`; both are written to JSON after training.
train_output = {}
val_output = {}
output_dir = "training/test"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # Log, evaluate and save once per epoch.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Run generation during evaluation so `metrics` receives generated token ids.
    predict_with_generate=True,
    # Batching / data loading.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    dataloader_num_workers=8,
    # Schedule and numerics.
    num_train_epochs=1,
    tf32=True,
)

# ROUGE scorer used by `metrics`.
metric = evaluate.load("rouge")
def metrics(eval_preds):
    """Decode predicted and reference token ids and score them with `metric`.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        The result of ``metric.compute`` on the decoded predictions and
        decoded references.

    Side effects:
        Appends every decoded prediction to the module-level ``val_output``
        dict, keyed by its decoded reference text.
    """
    preds, labels = eval_preds

    # The Hugging Face Trainer uses -100 as the ignore index for labels and as
    # the padding value when it gathers variable-length generations. -100 is
    # not a valid token id, so swap it for the pad token before decoding;
    # `skip_special_tokens=True` then drops it from the decoded text.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Group the generated texts by their reference text.
    for label, pred in zip(labels, preds):
        val_output.setdefault(label, []).append(pred)

    return metric.compute(predictions=preds, references=labels)

# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# NOTE(review): `train_output` is passed positionally while `model` is also
# given by keyword. This presumably relies on the patched transformers checkout
# added to sys.path in the first cell, whose trainer accepts the output dict as
# its first parameter - confirm before running against a stock install.
trainer = Seq2SeqTrainer(
    train_output,
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=metrics,
    callbacks=[early_stopping],
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `VisionEncoderDecoderModel.forward` and have been ignored: videoID. If videoID are not expected by `VisionEncoderDecoderModel.forward`,  you can safely ignore this message.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
***** Running training *****
  Num examples = 5
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 5
  Gradient Accumulation steps = 1
  Total optimization steps = 1
  Number of trainable parameters = 274065408
Generate config GenerationConfig {
  "decoder_start_token_id": 50256,
  "early_stopping": true,
  "max_length": 50,
  "num_beams": 4,
  "pad_token_id": 50256,
  "transformers_version": "4

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,8.286,5.555237,0.0,0.0,0.0,0.0


The following columns in the evaluation set don't have a corresponding argument in `VisionEncoderDecoderModel.forward` and have been ignored: videoID. If videoID are not expected by `VisionEncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5
  Batch size = 5
Generate config GenerationConfig {
  "decoder_start_token_id": 50256,
  "early_stopping": true,
  "max_length": 50,
  "num_beams": 4,
  "pad_token_id": 50256,
  "transformers_version": "4.26.1"
}

Saving model checkpoint to training/test/checkpoint-1
Configuration saved in training/test/checkpoint-1/config.json
Configuration saved in training/test/checkpoint-1/generation_config.json
Model weights saved in training/test/checkpoint-1/pytorch_model.bin
tokenizer config file saved in training/test/checkpoint-1/tokenizer_config.json
Special tokens file saved in training/test/checkpoint-1/special_tokens_map.json


Training completed. Do not forget to share your model on hugg

TrainOutput(global_step=1, training_loss=8.286035537719727, metrics={'train_runtime': 24.8875, 'train_samples_per_second': 0.201, 'train_steps_per_second': 0.04, 'total_flos': 8478276452352000.0, 'train_loss': 8.286035537719727, 'epoch': 1.0})

In [5]:
# Write the collected training and validation outputs as JSON files in output_dir.
for filename, payload in (
    ("train_output.json", train_output),
    ("val_output.json", val_output),
):
    with open(os.path.join(output_dir, filename), "w") as handle:
        json.dump(payload, handle)