In [1]:
import datasets
from transformers import (
    VideoMAEImageProcessor,
    VideoMAEForVideoClassification,
    VideoMAEConfig,
    VideoMAEModel
)

from transformers import (
    VideoMAEImageProcessor,
    VideoMAEForVideoClassification,
    VideoMAEPreTrainedModel,
)
from transformers.modeling_outputs import ImageClassifierOutput
from pathlib import Path
import os
import torch
from torch import nn
import random
from typing import *
os.environ['WANDB_DISABLED']='true'

In [2]:
import imageio
import numpy as np
from IPython.display import Image

# Per-channel statistics used to undo the input normalization.
mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]


def unnormalize_img(img):
    """Undo per-channel normalization and convert to 8-bit pixel values.

    Args:
        img: Float array whose last axis has three channels and which was
            normalized with the module-level ``mean`` and ``std``.

    Returns:
        ``uint8`` array of the same shape with every value in ``[0, 255]``.
    """
    img = (img * std) + mean
    # Clip *before* the uint8 cast: casting an out-of-range float wraps around
    # (e.g. 298 -> 42), after which clipping to [0, 255] can no longer help.
    return np.clip(img * 255, 0, 255).astype("uint8")


def create_gif(video_tensor, filename="sample.gif"):
    """Write the frames of a video tensor to an animated GIF file.

    ``video_tensor`` is iterated frame by frame; each frame is expected in
    (num_channels, height, width) layout, i.e. the whole tensor has shape
    (num_frames, num_channels, height, width).

    Returns the ``filename`` that was written.
    """
    # Move channels last and convert every frame back to displayable uint8.
    gif_frames = [
        unnormalize_img(frame.permute(1, 2, 0).numpy()) for frame in video_tensor
    ]
    imageio.mimsave(filename, gif_frames, "GIF", duration=0.25)
    return filename


def display_gif(video_tensor, gif_name="sample.gif"):
    """Render a video tensor as a GIF file and wrap it for notebook display.

    The first two axes are swapped before the frames are written, so the input
    is expected as (num_channels, num_frames, height, width).
    """
    frames_first = video_tensor.permute(1, 0, 2, 3)
    return Image(filename=create_gif(frames_first, gif_name))


In [3]:
class VideoMAEForMultilabelVideoClassification(VideoMAEPreTrainedModel):
    """VideoMAE encoder followed by a linear classification head.

    The training loss is chosen from ``config.problem_type``:

    * ``"multi_label_classification"``: ``BCEWithLogitsLoss`` (``self.criterion``)
      against multi-hot targets shaped like the logits.
    * anything else, including ``None``: ``CrossEntropyLoss`` against class
      indices. This is the loss the class applied unconditionally before, so
      existing single-label setups behave exactly as they did.
    """

    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.videomae = VideoMAEModel(config)

        # LayerNorm applied to the mean-pooled tokens; absent when the config
        # disables mean pooling (the first token is used instead).
        self.fc_norm = (
            nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None
        )
        self.classifier = (
            nn.Linear(config.hidden_size, config.num_labels)
            if config.num_labels > 0
            else nn.Identity()
        )
        # Only used by the multi-label branch of ``forward``.
        self.criterion = nn.BCEWithLogitsLoss()
        self.post_init()

    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutput]:
        """Encode a batch of videos and classify the pooled representation.

        Args:
            pixel_values: Video batch forwarded unchanged to the VideoMAE encoder.
            head_mask: Optional attention-head mask forwarded to the encoder.
            labels: Optional targets. Class indices for the default
                (cross-entropy) loss; multi-hot vectors shaped like the logits
                when ``config.problem_type == "multi_label_classification"``.
            output_attentions: Forwarded to the encoder.
            output_hidden_states: Forwarded to the encoder.
            return_dict: Return an ``ImageClassifierOutput`` instead of a tuple;
                defaults to ``config.use_return_dict``.

        Returns:
            ``ImageClassifierOutput`` with ``loss`` (``None`` when no labels are
            given), ``logits``, ``hidden_states`` and ``attentions``; or the
            equivalent tuple when ``return_dict`` is false.
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        outputs = self.videomae(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        if self.fc_norm is not None:
            pooled_output = self.fc_norm(sequence_output.mean(1))
        else:
            pooled_output = sequence_output[:, 0]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type == "multi_label_classification":
                # BCE needs floating-point targets with the same shape as the logits.
                loss = self.criterion(logits, labels.to(logits.dtype))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(
                    logits.reshape(-1, self.num_labels), labels.reshape(-1)
                )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [4]:
# Checkpoint identifier passed to `from_pretrained` when the model is loaded.
model_ckpt = "MCG-NJU/videomae-base"
# Batch size used for the DataLoaders and for per-device train/eval batches.
batch_size = 3
# NOTE(review): not referenced anywhere else in the visible notebook.
target_path = "charades_HF"

In [5]:
import glob

# Class names are the final path component of every entry directly under the
# training directory, in sorted order so the indices are stable.
classes = sorted(
    entry.rsplit('/', 1)[-1]
    for entry in glob.glob("/mnt/JaHiD/Zahid/RnD/VideoMAEClassification/VideoMAE/UCF101/train/*")
)

# Name -> index and index -> name lookups.
label2id = dict(zip(classes, range(len(classes))))
id2label = dict(zip(label2id.values(), label2id.keys()))

In [6]:
# Load the pretrained encoder; the classification head is sized from the label
# maps and newly initialised. `problem_type` is forwarded as a config override
# so that it actually reaches the model: a standalone VideoMAEConfig that is
# never handed to `from_pretrained` has no effect on the loaded model.
model = VideoMAEForMultilabelVideoClassification.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
    problem_type="single_label_classification",
    ignore_mismatched_sizes=True,
)

# Keep `config` bound to the configuration the model really uses.
config = model.config

Some weights of VideoMAEForMultilabelVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Every .avi file below the training directory, searched recursively.
train_path = glob.glob(
    "/mnt/JaHiD/Zahid/RnD/VideoMAEClassification/VideoMAE/UCF101/train/**/*.avi",
    recursive=True,
)
# The name of the directory containing each clip is looked up as its label.
train_labels = [label2id[clip.rsplit('/', 2)[-2]] for clip in train_path]

In [8]:
# Every .avi file below the validation directory, searched recursively.
eval_path = glob.glob(
    "/mnt/JaHiD/Zahid/RnD/VideoMAEClassification/VideoMAE/UCF101/val/**/*.avi",
    recursive=True,
)
# The name of the directory containing each clip is looked up as its label.
eval_labels = [label2id[clip.rsplit('/', 2)[-2]] for clip in eval_path]

In [9]:
def collate_fn(examples):
    """Batch dataset samples into the keyword arguments passed to the model.

    Args:
        examples: Sequence of mappings, each with a ``"video"`` tensor and a
            ``"label"`` (tensor, NumPy value or plain Python number).

    Returns:
        Dict with ``"pixel_values"`` (the videos stacked along a new batch
        axis, after swapping the first two axes of each one) and ``"labels"``
        (the labels stacked along a new batch axis).
    """
    pixel_values = torch.stack(
        [example["video"].permute(1, 0, 2, 3) for example in examples]
    )
    # as_tensor leaves tensors untouched and lets int / NumPy labels through,
    # which a bare torch.stack would reject.
    labels = torch.stack(
        [torch.as_tensor(example["label"]) for example in examples]
    )
    return {"pixel_values": pixel_values, "labels": labels}

In [10]:
from torch.utils.data import DataLoader

from dataset_config import DefaultConfig
from video_dataset.charades import VideoClsDataset

dataset_cfg = DefaultConfig()
dataset_cfg.num_sample = 1
dataset_cfg.height = 224

# Arguments shared by the train and eval datasets; keeping them in one place
# stops the two constructor calls from drifting apart.
# NOTE(review): mode="validation" is also used for the training set — confirm
# this is intentional (e.g. to skip train-time augmentation).
_shared_dataset_kwargs = dict(
    data_path='',
    n_class=len(label2id),
    label2id=label2id,
    id2label=id2label,
    mode="validation",
    clip_len=16,
    frame_sample_rate=1,
    crop_size=224,
    short_side_size=224,
    keep_aspect_ratio=True,
    num_segment=1,
    num_crop=1,
    test_num_segment=10,
    test_num_crop=3,
    args=dataset_cfg,
    single_label_classification=True,
)

# Only the training set overrides new_height / new_width.
train_dataset = VideoClsDataset(
    dataset_samples=train_path,
    label_array=train_labels,
    new_height=224,
    new_width=224,
    **_shared_dataset_kwargs,
)

# NOTE(review): the last evaluation clip is dropped here; the reason is not
# visible in this notebook.
eval_dataset = VideoClsDataset(
    dataset_samples=eval_path[:-1],
    label_array=eval_labels[:-1],
    **_shared_dataset_kwargs,
)

# prefetch_factor is not passed: with the default num_workers=0 newer PyTorch
# releases raise ValueError when it is set, and older releases already
# defaulted to 2, so behaviour there is unchanged.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size
)

eval_loader = DataLoader(
    eval_dataset,
    batch_size=batch_size
)

In [11]:
from transformers import TrainingArguments, Trainer

model_name = model_ckpt.split("/")[-1]
new_model_name = "artifact_ucf101_original" #f"artifact_{model_name}_original"
num_epochs = 1

# Evaluation and checkpointing share one interval so that every saved
# checkpoint has an evaluation result, which load_best_model_at_end relies on.
checkpoint_every_steps = 1000

# `half_precision_backend=True` was removed: that option expects a backend
# name (a string such as "auto") and does not switch mixed precision on.
# To train in half precision, pass fp16=True (or bf16=True) instead.
args = TrainingArguments(
    new_model_name,
    remove_unused_columns=False,
    evaluation_strategy="steps",
    eval_steps=checkpoint_every_steps,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    save_strategy='steps',
    save_steps=checkpoint_every_steps,
    save_total_limit=3,
    # One pass over the training set per epoch, expressed in optimizer steps.
    max_steps=(len(train_dataset)// batch_size) * num_epochs,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
import evaluate
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score predictions with the loaded accuracy metric.

    The index of the largest value along axis 1 of ``eval_pred.predictions``
    is taken as the predicted class and compared with ``eval_pred.label_ids``.
    """
    scores, gold_labels = eval_pred.predictions, eval_pred.label_ids
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)


def compute_mAP(eval_pred):
    """Average per-sample hit rate on the positive labels, as a percentage.

    For every sample, the logits are thresholded at 0 (equivalent to a sigmoid
    probability above 0.5) and the fraction of that sample's positive labels
    that were predicted positive is computed. The result is the mean of those
    fractions times 100, rounded to two decimals.

    Note: despite the ``'mAP'`` key (kept for compatibility with existing
    callers), this quantity is a per-sample recall, not mean average precision.

    Args:
        eval_pred: Object with ``predictions`` (logits) and ``label_ids``
            (multi-hot targets), both indexable by sample along the first axis.

    Returns:
        ``{'mAP': value}``. Samples without any positive label are left out of
        the average instead of producing NaN; ``0.0`` is returned when there is
        nothing to score.
    """
    scores = np.asarray(eval_pred.predictions)
    targets = np.asarray(eval_pred.label_ids)

    num_samples = len(targets)
    if num_samples == 0:
        return {'mAP': 0.0}

    # Thresholding the logit avoids the overflow np.exp hits on large values.
    predicted = (scores > 0).reshape(num_samples, -1)
    relevant = (targets.astype(np.int32) != 0).reshape(num_samples, -1)

    relevant_per_sample = relevant.sum(axis=1)
    hits_per_sample = (predicted & relevant).sum(axis=1)

    # A sample with no positive label has an undefined (0 / 0) hit rate.
    scored = relevant_per_sample > 0
    if not scored.any():
        return {'mAP': 0.0}

    recall = hits_per_sample[scored] / relevant_per_sample[scored]
    return {'mAP': round(100.0 * float(recall.mean()), 2)}

In [13]:
# Wire the model, training arguments, datasets, batching function and metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [14]:
# Run fine-tuning with the Trainer configured above and keep its return value.
train_results = trainer.train()

  0%|          | 0/3351 [00:00<?, ?it/s]

{'loss': 4.7123, 'learning_rate': 1.4880952380952381e-06, 'epoch': 0.0}
{'loss': 4.761, 'learning_rate': 2.9761904761904763e-06, 'epoch': 0.01}
{'loss': 4.6791, 'learning_rate': 4.464285714285715e-06, 'epoch': 0.01}
{'loss': 4.637, 'learning_rate': 5.9523809523809525e-06, 'epoch': 0.01}
{'loss': 4.7042, 'learning_rate': 7.4404761904761905e-06, 'epoch': 0.01}
{'loss': 4.7322, 'learning_rate': 8.92857142857143e-06, 'epoch': 0.02}
{'loss': 4.6069, 'learning_rate': 1.0416666666666668e-05, 'epoch': 0.02}
{'loss': 4.7029, 'learning_rate': 1.1904761904761905e-05, 'epoch': 0.02}
{'loss': 4.7683, 'learning_rate': 1.3392857142857144e-05, 'epoch': 0.03}
{'loss': 4.7285, 'learning_rate': 1.4880952380952381e-05, 'epoch': 0.03}
{'loss': 4.6699, 'learning_rate': 1.636904761904762e-05, 'epoch': 0.03}
{'loss': 4.595, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.04}
{'loss': 4.6851, 'learning_rate': 1.9345238095238097e-05, 'epoch': 0.04}
{'loss': 4.7348, 'learning_rate': 2.0833333333333336e-05, 'e

  0%|          | 0/558 [00:00<?, ?it/s]

{'eval_loss': 3.2610769271850586, 'eval_accuracy': 0.2458133971291866, 'eval_runtime': 204.5241, 'eval_samples_per_second': 8.175, 'eval_steps_per_second': 2.728, 'epoch': 0.3}
{'loss': 3.2251, 'learning_rate': 3.882255389718076e-05, 'epoch': 0.3}
{'loss': 2.7431, 'learning_rate': 3.8656716417910446e-05, 'epoch': 0.3}
{'loss': 3.007, 'learning_rate': 3.849087893864014e-05, 'epoch': 0.31}
{'loss': 3.1206, 'learning_rate': 3.832504145936982e-05, 'epoch': 0.31}
{'loss': 3.2473, 'learning_rate': 3.8159203980099506e-05, 'epoch': 0.31}
{'loss': 3.548, 'learning_rate': 3.799336650082919e-05, 'epoch': 0.32}
{'loss': 3.2106, 'learning_rate': 3.7827529021558875e-05, 'epoch': 0.32}
{'loss': 3.0388, 'learning_rate': 3.766169154228856e-05, 'epoch': 0.32}
{'loss': 3.0767, 'learning_rate': 3.7495854063018244e-05, 'epoch': 0.33}
{'loss': 2.5688, 'learning_rate': 3.733001658374793e-05, 'epoch': 0.33}
{'loss': 3.0165, 'learning_rate': 3.716417910447761e-05, 'epoch': 0.33}
{'loss': 2.9376, 'learning_rate

  0%|          | 0/558 [00:00<?, ?it/s]

{'eval_loss': 1.7234128713607788, 'eval_accuracy': 0.6429425837320574, 'eval_runtime': 198.6125, 'eval_samples_per_second': 8.418, 'eval_steps_per_second': 2.809, 'epoch': 0.6}
{'loss': 1.8662, 'learning_rate': 2.2238805970149256e-05, 'epoch': 0.6}
{'loss': 1.6011, 'learning_rate': 2.207296849087894e-05, 'epoch': 0.6}
{'loss': 1.4556, 'learning_rate': 2.1907131011608624e-05, 'epoch': 0.61}
{'loss': 1.3368, 'learning_rate': 2.174129353233831e-05, 'epoch': 0.61}
{'loss': 1.7343, 'learning_rate': 2.1575456053067993e-05, 'epoch': 0.61}
{'loss': 1.6489, 'learning_rate': 2.140961857379768e-05, 'epoch': 0.61}
{'loss': 1.6073, 'learning_rate': 2.1243781094527365e-05, 'epoch': 0.62}
{'loss': 1.5042, 'learning_rate': 2.107794361525705e-05, 'epoch': 0.62}
{'loss': 1.8911, 'learning_rate': 2.0912106135986734e-05, 'epoch': 0.62}
{'loss': 1.346, 'learning_rate': 2.074626865671642e-05, 'epoch': 0.63}
{'loss': 1.8264, 'learning_rate': 2.0580431177446106e-05, 'epoch': 0.63}
{'loss': 1.7361, 'learning_r

  0%|          | 0/558 [00:00<?, ?it/s]

{'eval_loss': 1.0339295864105225, 'eval_accuracy': 0.8020334928229665, 'eval_runtime': 197.6047, 'eval_samples_per_second': 8.461, 'eval_steps_per_second': 2.824, 'epoch': 0.89}
{'loss': 0.9396, 'learning_rate': 5.655058043117745e-06, 'epoch': 0.9}
{'loss': 1.2238, 'learning_rate': 5.489220563847429e-06, 'epoch': 0.9}
{'loss': 0.5737, 'learning_rate': 5.323383084577115e-06, 'epoch': 0.9}
{'loss': 1.0043, 'learning_rate': 5.1575456053068e-06, 'epoch': 0.91}
{'loss': 1.1141, 'learning_rate': 4.991708126036484e-06, 'epoch': 0.91}
{'loss': 0.84, 'learning_rate': 4.8258706467661695e-06, 'epoch': 0.91}
{'loss': 0.9763, 'learning_rate': 4.660033167495855e-06, 'epoch': 0.92}
{'loss': 0.7896, 'learning_rate': 4.494195688225539e-06, 'epoch': 0.92}
{'loss': 1.0991, 'learning_rate': 4.328358208955224e-06, 'epoch': 0.92}
{'loss': 0.977, 'learning_rate': 4.16252072968491e-06, 'epoch': 0.92}
{'loss': 0.7034, 'learning_rate': 3.996683250414593e-06, 'epoch': 0.93}
{'loss': 1.194, 'learning_rate': 3.830

In [16]:
# Evaluate the trainer's current model on eval_dataset; the returned metrics
# dict is displayed as the cell output.
trainer.evaluate(eval_dataset)

  0%|          | 0/558 [00:00<?, ?it/s]

{'eval_loss': 1.0339295864105225,
 'eval_accuracy': 0.8020334928229665,
 'eval_runtime': 192.1914,
 'eval_samples_per_second': 8.7,
 'eval_steps_per_second': 2.903,
 'epoch': 1.0}

In [None]:
# trainer.save_model()
# test_results = trainer.evaluate(test_dataset)
# trainer.log_metrics("test", test_results)
# trainer.save_metrics("test", test_results)
# trainer.save_state()
# trained_model = VideoMAEForMultilabelVideoClassification.from_pretrained("/mnt/JaHiD/Zahid/RnD/VideoMAEClassification/artifact_charades_original/checkpoint-4792")

In [None]:
def run_inference(model, video):
    """Run one preprocessed video through ``model`` and return its logits.

    Args:
        model: Module that accepts a ``pixel_values`` keyword argument and
            returns an object with a ``logits`` attribute. It is moved to CUDA
            when a GPU is available, otherwise to the CPU.
        video: Preprocessed clip tensor. Its first two axes are swapped before
            batching — presumably (num_channels, num_frames, height, width)
            becoming (num_frames, num_channels, height, width); confirm
            against the dataset.

    Returns:
        The logits for a batch containing this single video.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Swap the first two axes and add a leading batch axis of size one.
    pixel_values = video.permute(1, 0, 2, 3).unsqueeze(0).to(device)

    # Inference must run in eval mode (dropout and similar layers disabled);
    # the previous mode is restored afterwards so callers are not affected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(pixel_values=pixel_values).logits
    finally:
        model.train(was_training)

    return logits

In [None]:
# Fetch only the first evaluation sample and classify it.
sample_video = next(iter(eval_dataset))
logits = run_inference(model, sample_video['video'])