In [1]:
import datasets
from transformers import (
    VideoMAEImageProcessor,
    VideoMAEForVideoClassification,
    VideoMAEConfig,
    VideoMAEModel
)

from transformers import (
    VideoMAEImageProcessor,
    VideoMAEForVideoClassification,
    VideoMAEPreTrainedModel,
)
from transformers.modeling_outputs import ImageClassifierOutput
from pathlib import Path
import os
import torch
from torch import nn
import random
from typing import *
os.environ['WANDB_DISABLED']='true'

In [2]:
import imageio
import numpy as np
from IPython.display import Image

mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]


def unnormalize_img(img):
    """Un-normalizes the image pixels."""
    img = (img * std) + mean
    img = (img * 255).astype("uint8")
    return img.clip(0, 255)


def create_gif(video_tensor, filename="sample.gif"):
    """Prepares a GIF from a video tensor.

    The video tensor is expected to have the following shape:
    (num_frames, num_channels, height, width).
    """
    frames = []
    for video_frame in video_tensor:
        frame_unnormalized = unnormalize_img(video_frame.permute(1, 2, 0).numpy())
        frames.append(frame_unnormalized)
    kargs = {"duration": 0.25}
    imageio.mimsave(filename, frames, "GIF", **kargs)
    return filename


def display_gif(video_tensor, gif_name="sample.gif"):
    """Prepares and displays a GIF from a video tensor."""
    video_tensor = video_tensor.permute(1, 0, 2, 3)
    gif_filename = create_gif(video_tensor, gif_name)
    return Image(filename=gif_filename)


In [3]:
class VideoMAEForMultilabelVideoClassification(VideoMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.videomae = VideoMAEModel(config)

        self.fc_norm = (
            nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
        )
        self.classifier = (
            nn.Linear(config.hidden_size, config.num_labels)
            if config.num_labels > 0
            else nn.Identity()
        )
        self.criterion = nn.BCEWithLogitsLoss()
        self.post_init()


    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutput]:

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        outputs = self.videomae(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        if self.fc_norm is not None:
            sequence_output = self.fc_norm(sequence_output.mean(1))
        else:
            sequence_output = sequence_output[:, 0]

        logits = self.classifier(sequence_output)
        # logits = self.sigmoid(logits)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [4]:
model_ckpt = "MCG-NJU/videomae-base"
batch_size = 3
# target_path = "charades_HF"

In [5]:
import glob, json
with open('something_something/20bn-something-something-labels/labels/labels.json', 'r') as f:
    label2id = json.load(f)
id2label = {i: label for label, i in label2id.items()}
id2label

{'0': 'Approaching something with your camera',
 '1': 'Attaching something to something',
 '2': 'Bending something so that it deforms',
 '3': 'Bending something until it breaks',
 '4': 'Burying something in something',
 '5': 'Closing something',
 '6': 'Covering something with something',
 '7': 'Digging something out of something',
 '8': 'Dropping something behind something',
 '9': 'Dropping something in front of something',
 '10': 'Dropping something into something',
 '11': 'Dropping something next to something',
 '12': 'Dropping something onto something',
 '13': 'Failing to put something into something because something does not fit',
 '14': 'Folding something',
 '15': 'Hitting something with something',
 '16': 'Holding something',
 '17': 'Holding something behind something',
 '18': 'Holding something in front of something',
 '19': 'Holding something next to something',
 '20': 'Holding something over something',
 '21': 'Laying something on the table on its side, not upright',
 '22': '

In [6]:
config = VideoMAEConfig(
    problem_type="single_label_classification",
    num_labels=len(label2id)
)

model = VideoMAEForMultilabelVideoClassification.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,  
)

Some weights of VideoMAEForMultilabelVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
with open('something_something/20bn-something-something-labels/labels/train.json', 'r') as f:
    train_data_paths = json.load(f)

p = lambda x: ' '.join(x.replace('[','').replace(']', '').split(' '))

train_path, train_labels = [], []
for d in train_data_paths:
    train_path.append(f"something_something/20bn-something-something-v2/{d['id']}.webm")
    train_labels.append(int(label2id[p(d['template'])]))


In [8]:
with open('something_something/20bn-something-something-labels/labels/validation.json', 'r') as f:
    train_data_paths = json.load(f)

eval_path, eval_labels = [], []
for d in train_data_paths:
    eval_path.append(f"something_something/20bn-something-something-v2/{d['id']}.webm")
    eval_labels.append(int(label2id[p(d['template'])]))

In [9]:
def collate_fn(examples):
    pixel_values = torch.stack(
        [example[0].permute(1, 0, 2, 3) for example in examples]
    )
    labels = torch.tensor([example[1] for example in examples]).long()
    return {"pixel_values": pixel_values, "labels": labels}

In [10]:
from video_dataset.ssv2 import SSVideoClsDataset


class DefaultConfig:
    batch_size: int = 64
    epochs: int = 30
    update_freq: int = 1
    save_ckpt_freq: int = 100
    model: str = 'vit_base_patch16_224'
    tubelet_size: int = 2
    input_size: int = 224
    fc_drop_rate: float = 0.0
    drop: float = 0.0
    attn_drop_rate: float = 0.0
    drop_path: float = 0.1
    disable_eval_during_finetuning: bool = False
    model_ema: bool = False
    model_ema_decay: float = 0.9999
    model_ema_force_cpu: bool = False
    opt: str = 'adamw'
    opt_eps: float = 1e-8
    opt_betas: List[float] = None
    clip_grad: float = None
    momentum: float = 0.9
    weight_decay: float = 0.05
    weight_decay_end: float = None
    lr: float = 1e-3
    layer_decay: float = 0.75
    warmup_lr: float = 1e-6
    min_lr: float = 1e-6
    warmup_epochs: int = 5
    warmup_steps: int = -1
    color_jitter: float = 0.4
    num_sample: int = 2
    aa: str = 'rand-m7-n4-mstd0.5-inc1'
    smoothing: float = 0.1
    train_interpolation: str = 'bicubic'
    crop_pct: float = None
    short_side_size: int = 224
    test_num_segment: int = 5
    test_num_crop: int = 3
    reprob: float = 0.25
    remode: str = 'pixel'
    recount: int = 1
    resplit: bool = False
    mixup: float = 0.8
    cutmix: float = 1.0
    cutmix_minmax: List[float] = None
    mixup_prob: float = 1.0
    mixup_switch_prob: float = 0.5
    mixup_mode: str = 'batch'
    finetune: str = ''
    model_key: str = 'model|module'
    model_prefix: str = ''
    init_scale: float = 0.001
    use_checkpoint: bool = False
    use_mean_pooling: bool = True
    use_cls: bool = True
    data_path: str = '/path/to/list_kinetics-400'
    eval_data_path: str = None
    nb_classes: int = 174
    imagenet_default_mean_and_std: bool = True
    num_segments: int = 1
    num_frames: int = 16
    sampling_rate: int = 4
    data_set: str = 'Kinetics-400'
    output_dir: str = ''
    log_dir: str = None
    device: str = 'cuda'
    seed: int = 0
    resume: str = ''
    auto_resume: bool = True
    save_ckpt: bool = True
    start_epoch: int = 0
    eval: bool = False
    dist_eval: bool = False
    num_workers: int = 10
    pin_mem: bool = True
    world_size: int = 1
    local_rank: int = -1
    dist_on_itp: bool = False
    dist_url: str = 'env://'
    enable_deepspeed: bool = False


dataset_cfg = DefaultConfig()
dataset_cfg.num_sample = 1
# dataset_cfg.height = 224
args = dataset_cfg
mode = 'train'
test_mode = False


train_dataset = SSVideoClsDataset(
    dataset_samples=train_path,
    label_array=train_labels,
    mode=mode,
    clip_len=1,
    num_segment=args.num_frames,
    test_num_segment=args.test_num_segment,
    test_num_crop=args.test_num_crop,
    num_crop=1 if not test_mode else 3,
    keep_aspect_ratio=True,
    crop_size=args.input_size,
    short_side_size=args.short_side_size,
    new_height=256,
    new_width=320,
    args=args)

mode = 'validation'
eval_dataset = SSVideoClsDataset(
    dataset_samples=eval_path,
    label_array=eval_labels,
    mode=mode,
    clip_len=1,
    num_segment=args.num_frames,
    test_num_segment=args.test_num_segment,
    test_num_crop=args.test_num_crop,
    num_crop=1 if not test_mode else 3,
    keep_aspect_ratio=True,
    crop_size=args.input_size,
    short_side_size=args.short_side_size,
    new_height=256,
    new_width=320,
    args=args)

from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset,
    prefetch_factor=2,
    batch_size=batch_size
)
eval_loader = DataLoader(
    eval_dataset,
    prefetch_factor=2,
    batch_size=batch_size
)

In [11]:
from transformers import TrainingArguments, Trainer

model_name = model_ckpt.split("/")[-1]
new_model_name = "artifact_ssv2_original" #f"artifact_{model_name}_original"
num_epochs = 1

# args = TrainingArguments(
#     new_model_name,
#     remove_unused_columns=False,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     warmup_ratio=0.1,
#     logging_steps=10,
#     load_best_model_at_end=True,
#     # metric_for_best_model="accuracy",
#     # push_to_hub=True,
#     max_steps=(len(train_dataset)// batch_size) * num_epochs,
# )


args = TrainingArguments(
    new_model_name,
    remove_unused_columns=False,
    evaluation_strategy="steps",
    eval_steps=10000,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    half_precision_backend=True,
    logging_steps=10,
    load_best_model_at_end=True,
    save_strategy='steps',
    save_steps=10000,
    save_total_limit=3,
    max_steps=(len(train_dataset)// batch_size) * num_epochs,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
import evaluate
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions."""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)


def compute_mAP(eval_pred):
    predictions = eval_pred.predictions
    references = eval_pred.label_ids

    # Apply sigmoid activation to predictions
    sigmoid_predictions = 1 / (1 + np.exp(-predictions))

    # Reshape predictions and references if necessary
    sigmoid_predictions = np.round(sigmoid_predictions)
    
    precision = 0

    for reference, prediction in zip(references, sigmoid_predictions):
        reference, prediction = reference.astype(np.int32), prediction.astype(np.int32)
        relevant_pred = prediction[reference!=0]
        precision +=  np.sum(relevant_pred)/len(relevant_pred)

    # recall
    return {'mAP': 100*round(precision/len(references), 4)}

In [13]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

In [14]:
train_results = trainer.train()

{'loss': 3.5794, 'learning_rate': 2.628026759812918e-05, 'epoch': 0.53}
{'loss': 3.7877, 'learning_rate': 2.627040041047501e-05, 'epoch': 0.53}
{'loss': 3.8861, 'learning_rate': 2.6260533222820833e-05, 'epoch': 0.53}
{'loss': 3.5761, 'learning_rate': 2.625066603516666e-05, 'epoch': 0.53}
{'loss': 3.726, 'learning_rate': 2.6240798847512482e-05, 'epoch': 0.53}
{'loss': 3.6349, 'learning_rate': 2.6230931659858308e-05, 'epoch': 0.53}
{'loss': 2.9186, 'learning_rate': 2.622106447220413e-05, 'epoch': 0.53}
{'loss': 3.769, 'learning_rate': 2.621119728454996e-05, 'epoch': 0.53}
{'loss': 3.4895, 'learning_rate': 2.6201330096895783e-05, 'epoch': 0.53}
{'loss': 3.139, 'learning_rate': 2.619146290924161e-05, 'epoch': 0.53}
{'loss': 3.5486, 'learning_rate': 2.6181595721587432e-05, 'epoch': 0.53}
{'loss': 2.5559, 'learning_rate': 2.6171728533933258e-05, 'epoch': 0.53}
{'loss': 3.193, 'learning_rate': 2.6161861346279088e-05, 'epoch': 0.53}
{'loss': 2.8608, 'learning_rate': 2.615199415862491e-05, 'epo

  0%|          | 0/8259 [00:00<?, ?it/s]

{'eval_loss': 3.0359280109405518, 'eval_accuracy': 0.2752956370827784, 'eval_runtime': 3078.4154, 'eval_samples_per_second': 8.049, 'eval_steps_per_second': 2.683, 'epoch': 0.53}
{'loss': 2.9614, 'learning_rate': 2.5944783217887238e-05, 'epoch': 0.53}
{'loss': 3.081, 'learning_rate': 2.5934916030233064e-05, 'epoch': 0.53}
{'loss': 3.1394, 'learning_rate': 2.592504884257889e-05, 'epoch': 0.53}
{'loss': 3.5911, 'learning_rate': 2.5915181654924713e-05, 'epoch': 0.53}
{'loss': 3.144, 'learning_rate': 2.5905314467270542e-05, 'epoch': 0.53}
{'loss': 3.6855, 'learning_rate': 2.5895447279616365e-05, 'epoch': 0.53}
{'loss': 3.0576, 'learning_rate': 2.588558009196219e-05, 'epoch': 0.53}
{'loss': 3.485, 'learning_rate': 2.5875712904308014e-05, 'epoch': 0.53}
{'loss': 3.4535, 'learning_rate': 2.586584571665384e-05, 'epoch': 0.53}
{'loss': 2.9369, 'learning_rate': 2.5855978528999663e-05, 'epoch': 0.53}
{'loss': 3.0189, 'learning_rate': 2.5846111341345492e-05, 'epoch': 0.53}
{'loss': 3.2426, 'learni

  0%|          | 0/8259 [00:00<?, ?it/s]

{'eval_loss': 2.7025187015533447, 'eval_accuracy': 0.33494773378536546, 'eval_runtime': 2866.9333, 'eval_samples_per_second': 8.642, 'eval_steps_per_second': 2.881, 'epoch': 0.71}
{'loss': 3.1355, 'learning_rate': 1.607759556371243e-05, 'epoch': 0.71}
{'loss': 3.2961, 'learning_rate': 1.6067728376058256e-05, 'epoch': 0.71}
{'loss': 3.0363, 'learning_rate': 1.6057861188404082e-05, 'epoch': 0.71}
{'loss': 3.3623, 'learning_rate': 1.604799400074991e-05, 'epoch': 0.71}
{'loss': 2.3516, 'learning_rate': 1.6038126813095735e-05, 'epoch': 0.71}
{'loss': 2.8801, 'learning_rate': 1.6028259625441557e-05, 'epoch': 0.71}
{'loss': 3.3085, 'learning_rate': 1.6018392437787383e-05, 'epoch': 0.71}
{'loss': 3.0892, 'learning_rate': 1.6008525250133206e-05, 'epoch': 0.71}
{'loss': 2.7359, 'learning_rate': 1.5998658062479032e-05, 'epoch': 0.71}
{'loss': 3.0705, 'learning_rate': 1.598879087482486e-05, 'epoch': 0.71}
{'loss': 3.3058, 'learning_rate': 1.597892368717068e-05, 'epoch': 0.71}
{'loss': 2.3708, 'lea

  0%|          | 0/8259 [00:00<?, ?it/s]

{'eval_loss': 2.52903151512146, 'eval_accuracy': 0.3686079832102353, 'eval_runtime': 3076.6113, 'eval_samples_per_second': 8.053, 'eval_steps_per_second': 2.684, 'epoch': 0.89}
{'loss': 2.4886, 'learning_rate': 6.210407909537623e-06, 'epoch': 0.89}
{'loss': 2.184, 'learning_rate': 6.200540721883449e-06, 'epoch': 0.89}
{'loss': 2.1091, 'learning_rate': 6.190673534229275e-06, 'epoch': 0.89}
{'loss': 3.0413, 'learning_rate': 6.180806346575099e-06, 'epoch': 0.89}
{'loss': 2.7629, 'learning_rate': 6.170939158920925e-06, 'epoch': 0.89}
{'loss': 2.9879, 'learning_rate': 6.1610719712667496e-06, 'epoch': 0.89}
{'loss': 2.6421, 'learning_rate': 6.151204783612575e-06, 'epoch': 0.89}
{'loss': 3.9198, 'learning_rate': 6.1413375959584e-06, 'epoch': 0.89}
{'loss': 2.1575, 'learning_rate': 6.131470408304225e-06, 'epoch': 0.89}
{'loss': 2.3583, 'learning_rate': 6.1216032206500506e-06, 'epoch': 0.89}
{'loss': 3.2054, 'learning_rate': 6.111736032995876e-06, 'epoch': 0.89}
{'loss': 2.3816, 'learning_rate'

In [16]:
trainer.evaluate(eval_dataset)

  0%|          | 0/8259 [00:00<?, ?it/s]

In [None]:
# trainer.save_model()
# test_results = trainer.evaluate(test_dataset)
# trainer.log_metrics("test", test_results)
# trainer.save_metrics("test", test_results)
# trainer.save_state()
# trained_model = VideoMAEForMultilabelVideoClassification.from_pretrained("/mnt/JaHiD/Zahid/RnD/VideoMAEClassification/artifact_charades_original/checkpoint-4792")

In [None]:
def run_inference(model, video):
    """Utility to run inference given a model and test video.

    The video is assumed to be preprocessed already.
    """
    # (num_frames, num_channels, height, width)
    perumuted_sample_test_video = video.permute(1, 0, 2, 3)

    inputs = {
        "pixel_values": perumuted_sample_test_video.unsqueeze(0),
        # "labels": None#torch.zeros(len(id2label)).unsqueeze(0),
    }
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model = model.to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    return logits

In [None]:
for d in eval_dataset:
    sample_video = d
    break
logits = run_inference(model, sample_video['video'])