In [1]:
import json
import os
import random

from datasets import load_dataset
import evaluate
import numpy as np
import torch
from torchvision import transforms as T
from torchvision.io import VideoReader
from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator

torch.backends.cuda.matmul.allow_tf32 = True
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device: %s" % device)

FRAMES_PER_VIDEO = 8

Device: cuda


In [2]:
# TimeSformer
encoder = "facebook/timesformer-base-finetuned-k600"
decoder = "gpt2"

image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained(decoder)
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder, decoder).to(device)

tokenizer.pad_token = tokenizer.eos_token
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

print(model.num_parameters())

# VideoMAE
# encoder = "MCG-NJU/videomae-base"
# decoder = "gpt2"

# image_processor = AutoImageProcessor.from_pretrained(encoder)
# tokenizer = AutoTokenizer.from_pretrained(decoder)
# model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder, decoder).to(device)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of the model checkpoint at facebook/timesformer-base-finetuned-k600 were not used when initializing TimesformerModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TimesformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TimesformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.2.ln_cross_attn.weight', 'h.11.crossattention.c_attn.weight', 'h.4.cros

274065408


In [18]:
def preprocess(example):
    video_id = example["videoID"]
    captions = example["enCap"]
    
    videos_path = "dataset/videos"
    video_path = os.path.join(videos_path, "%s.mp4" % video_id)
    if not os.path.isfile(video_path):
        video_path = os.path.join(videos_path, "%s.webm" % video_id)
    
    reader = VideoReader(video_path)
    
    try:
        duration = reader.get_metadata()["video"]["duration"][0]
    except:
        print(video_path)
        
    step = duration / FRAMES_PER_VIDEO

    frames = []
    for i in range(FRAMES_PER_VIDEO):
        # fixed intervals
        timestamp = i * step
        
        # sample intervals
        #timestamp = random.uniform(i * step, (i + 1) * step) 
        
        reader.seek(timestamp)
        frames.append(next(reader)["data"])
        
    pixel_values = image_processor(frames, return_tensors="pt").pixel_values
    
    # first caption
    #caption = captions[0]
    
    # random caption
    caption = captions[random.randint(0, 9)]
    #labels = torch.LongTensor(tokenizer(caption, padding="max_length").input_ids)
    labels = tokenizer(caption, padding="max_length").input_ids
    
    return {"pixel_values": pixel_values[0], "labels": labels}


data_files = {"train": "dataset/vatex_train_captions.json", "validation": "dataset/vatex_val_captions.json"}
dataset = load_dataset("json", data_files=data_files)
dataset = dataset.with_format("torch", device=device)
dataset = dataset.remove_columns("chCap")

dataset["train"] = dataset["train"].select([x for x in range(50)])
dataset["validation"] = dataset["validation"].select([x for x in range(5)])

dataset = dataset.map(function=preprocess, remove_columns=dataset["train"].column_names)
dataset

Using custom data configuration default-d253243028bf9773
Found cached dataset json (/home/922201615/.cache/huggingface/datasets/json/default-d253243028bf9773/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?ex/s]

  0%|          | 0/5 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['pixel_values', 'labels'],
        num_rows: 50
    })
    validation: Dataset({
        features: ['pixel_values', 'labels'],
        num_rows: 5
    })
})

In [19]:
dataset.save_to_disk("small")
# dataset = load_dataset("small.hf")
# dataset

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]

In [17]:
class VatexDataset(torch.utils.data.Dataset):
    def __init__(self, videos_path, json_path, num_frames, transforms=None):
        self.video_names = []
        for _, _, files in os.walk(videos_path):
            for file in files:
                self.video_names.append(file)
        
        self.video_captions = {}
        with open(json_path) as json_file:
            json_data = json.load(json_file)            
            for data in json_data:
                video_id = data["videoID"]
                captions = data["enCap"]
                self.video_captions[video_id] = captions
                
        self.videos_path = videos_path
        self.num_frames = num_frames
        self.transforms = transforms
        

    def __len__(self):
        return len(self.video_names) * 10
    
    
    def __getitem__(self, idx):        
        video_idx = idx % len(self.video_names)
        caption_idx = idx // len(self.video_names)
        
        video_name = self.video_names[video_idx]
        video_id = video_name.split(".")[0]
        captions = self.video_captions[video_id]
        caption = captions[caption_idx]
        
        video_path = os.path.join(self.videos_path, video_name)
        reader = VideoReader(video_path, "video")
        
        duration = reader.get_metadata()["video"]["duration"][0]
        step = duration / self.num_frames
        
        frames = []
        for i in range(self.num_frames):
            #timestamp = i * step # fixed intervals
            timestamp = random.uniform(i * step, (i + 1) * step) # sample intervals
            reader.seek(timestamp)
            frames.append(next(reader)["data"])

        return (frames, caption)

In [None]:
num_frames = 8

transforms_train = T.Compose([
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    T.RandomPerspective(),
    T.RandomRotation(15),
    T.RandomAdjustSharpness(2),
    T.RandomAutocontrast(),
    T.ToTensor(),
    T.Normalize(image_processor.image_mean, image_processor.image_std)
])

transforms_val = T.Compose([
    T.ToTensor(),
    T.Normalize(image_processor.image_mean, image_processor.image_std),
])

data_train = VatexDataset("dataset/vatex_train_videos", "dataset/vatex_train_captions.json", num_frames, transforms_train)
data_val = VatexDataset("dataset/vatex_val_videos", "dataset/vatex_val_captions.json", num_frames, transforms_val)

data_train = torch.utils.data.Subset(data_train, list(range(100)))
data_val = torch.utils.data.Subset(data_val, list(range(10)))

print(len(data_train))
print(len(data_val))

In [21]:
def collate_fn(examples):
    pixel_values, labels = [], []
    for frames, caption in examples:
        pixel_values.append(image_processor(frames, return_tensors="pt").pixel_values)
        labels.append(torch.LongTensor(tokenizer(caption, padding="max_length").input_ids))
    
    pixel_values = torch.cat(pixel_values)
    labels = torch.stack(labels)
    
    return {"pixel_values": pixel_values, "labels": labels}

metric = evaluate.load("rouge")
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return metric.compute(predictions=preds, references=labels)

training_args = Seq2SeqTrainingArguments(
    output_dir="training",
    tf32=True,
    dataloader_pin_memory=False,
    predict_with_generate=True,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=image_processor,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=default_data_collator,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 50
  Num Epochs = 5
  Instantaneous batch size per device = 5
  Total train batch size (w. parallel, distributed & accumulation) = 5
  Gradient Accumulation steps = 1
  Total optimization steps = 50
  Number of trainable parameters = 274065408


Epoch,Training Loss,Validation Loss
1,2.0235,0.079734
2,0.0985,0.06747
3,0.0619,0.062521
4,0.0538,0.060338
5,0.0502,0.059808


***** Running Evaluation *****
  Num examples = 5
  Batch size = 5
Saving model checkpoint to trainer/checkpoint-10
Configuration saved in trainer/checkpoint-10/config.json
Configuration saved in trainer/checkpoint-10/generation_config.json
Model weights saved in trainer/checkpoint-10/pytorch_model.bin
Image processor saved in trainer/checkpoint-10/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 5
  Batch size = 5
Saving model checkpoint to trainer/checkpoint-20
Configuration saved in trainer/checkpoint-20/config.json
Configuration saved in trainer/checkpoint-20/generation_config.json
Model weights saved in trainer/checkpoint-20/pytorch_model.bin
Image processor saved in trainer/checkpoint-20/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 5
  Batch size = 5
Saving model checkpoint to trainer/checkpoint-30
Configuration saved in trainer/checkpoint-30/config.json
Configuration saved in trainer/checkpoint-30/generation_config.json
Model w

TrainOutput(global_step=50, training_loss=0.45756843090057375, metrics={'train_runtime': 106.3321, 'train_samples_per_second': 2.351, 'train_steps_per_second': 0.47, 'total_flos': 4.239138226176e+17, 'train_loss': 0.45756843090057375, 'epoch': 5.0})

In [28]:
image_processor = AutoImageProcessor.from_pretrained("training/checkpoint-50")
model = VisionEncoderDecoderModel.from_pretrained("training/checkpoint-50")

data = dataset["train"][0]
pixel_values = data["pixel_values"]
pixel_values = torch.unsqueeze(pixel_values, 0).to("cpu")
labels = data["labels"]

label_text = tokenizer.decode(labels, skip_special_tokens=True)
print(label_text)

generated_ids = model.generate(pixel_values)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

loading configuration file training/checkpoint-50/preprocessor_config.json
Image processor VideoMAEImageProcessor {
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "VideoMAEImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}

loading configuration file training/checkpoint-50/config.json
Model config VisionEncoderDecoderConfig {
  "_commit_hash": null,
  "architectures": [
    "VisionEncoderDecoderModel"
  ],
  "decoder": {
    "_name_or_path": "gpt2",
    "activation_function": "gelu_new",
    "add_cross_attention": true,
    "architectures": [
      "GPT2LMHeadModel"
    ],
    "attn_pdrop": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 50256,
  

A man is strapping a woman into a harness and she climbs a wall.




A man is standing on a rope and holding a rope.
