In [1]:
'''from transformers import VideoMAEConfig, VideoMAEForPreTraining, VideoMAEModel

# Initializing a VideoMAE videomae-base style configuration
configuration = VideoMAEConfig()
# Randomly initializing a model from the configuration
# model = VideoMAEModel(configuration)
# # Accessing the model configuration
# configuration = model.config
model = VideoMAEForPreTraining(configuration)
model'''

'from transformers import VideoMAEConfig, VideoMAEForPreTraining, VideoMAEModel\n\n# Initializing a VideoMAE videomae-base style configuration\nconfiguration = VideoMAEConfig()\n# Randomly initializing a model from the configuration\n# model = VideoMAEModel(configuration)\n# # Accessing the model configuration\n# configuration = model.config\nmodel = VideoMAEForPreTraining(configuration)\nmodel'

In [2]:
import datetime
import numpy as np
import time
import torch
import torch.backends.cudnn as cudnn
import json
import os
from pathlib import Path
from timm.models import create_model
from optim_factory import create_optimizer
from datasets import build_pretraining_dataset
from engine_for_pretraining import train_one_epoch
from utils import NativeScalerWithGradNormCount as NativeScaler
import utils
import modeling_pretrain


def get_model(args):
    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        drop_path_rate=args.drop_path,
        drop_block_rate=None,
        decoder_depth=args.decoder_depth,
        use_checkpoint=args.use_checkpoint
    )
    return model

'''
def main(args):
    utils.init_distributed_mode(args)

    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)

    cudnn.benchmark = True

    model = get_model(args)
    patch_size = model.encoder.patch_embed.patch_size
    print("Patch size = %s" % str(patch_size))
    args.window_size = (args.num_frames // 2, args.input_size // patch_size[0], args.input_size // patch_size[1])
    args.patch_size = patch_size

    # get dataset
    dataset_train = build_pretraining_dataset(args)


    num_tasks = utils.get_world_size()
    global_rank = utils.get_rank()
    sampler_rank = global_rank

    total_batch_size = args.batch_size * num_tasks
    num_training_steps_per_epoch = len(dataset_train) // total_batch_size

    sampler_train = torch.utils.data.DistributedSampler(
        dataset_train, num_replicas=num_tasks, rank=sampler_rank, shuffle=True
    )
    print("Sampler_train = %s" % str(sampler_train))


    if global_rank == 0 and args.log_dir is not None:
        os.makedirs(args.log_dir, exist_ok=True)
        log_writer = utils.TensorboardLogger(log_dir=args.log_dir)
    else:
        log_writer = None

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
        worker_init_fn=utils.seed_worker
    )

    model.to(device)
    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("Model = %s" % str(model_without_ddp))
    print('number of params: {} M'.format(n_parameters / 1e6))

    args.lr = args.lr * total_batch_size / 256
    args.min_lr = args.min_lr * total_batch_size / 256
    args.warmup_lr = args.warmup_lr * total_batch_size / 256
    print("LR = %.8f" % args.lr)
    print("Batch size = %d" % total_batch_size)
    print("Number of training steps = %d" % num_training_steps_per_epoch)
    print("Number of training examples per epoch = %d" % (total_batch_size * num_training_steps_per_epoch))

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=False)
        model_without_ddp = model.module

    optimizer = create_optimizer(
        args, model_without_ddp)
    loss_scaler = NativeScaler()

    print("Use step level LR & WD scheduler!")
    lr_schedule_values = utils.cosine_scheduler(
        args.lr, args.min_lr, args.epochs, num_training_steps_per_epoch,
        warmup_epochs=args.warmup_epochs, warmup_steps=args.warmup_steps,
    )
    if args.weight_decay_end is None:
        args.weight_decay_end = args.weight_decay
    wd_schedule_values = utils.cosine_scheduler(
        args.weight_decay, args.weight_decay_end, args.epochs, num_training_steps_per_epoch)
    print("Max WD = %.7f, Min WD = %.7f" % (max(wd_schedule_values), min(wd_schedule_values)))

    utils.auto_load_model(
        args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler)
    torch.cuda.empty_cache()
    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            data_loader_train.sampler.set_epoch(epoch)
        if log_writer is not None:
            log_writer.set_step(epoch * num_training_steps_per_epoch)
        train_stats = train_one_epoch(
            model, data_loader_train,
            optimizer, device, epoch, loss_scaler,
            args.clip_grad, log_writer=log_writer,
            start_steps=epoch * num_training_steps_per_epoch,
            lr_schedule_values=lr_schedule_values,
            wd_schedule_values=wd_schedule_values,
            patch_size=patch_size[0],
            normlize_target=args.normlize_target,
        )
        if args.output_dir:
            if (epoch + 1) % args.save_ckpt_freq == 0 or epoch + 1 == args.epochs:
                utils.save_model(
                    args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
                    loss_scaler=loss_scaler, epoch=epoch)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch, 'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            if log_writer is not None:
                log_writer.flush()
            with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
'''
from dataclasses import dataclass

@dataclass
class Config:
    batch_size: int = 64
    epochs: int = 800
    save_ckpt_freq: int = 50
    model: str = 'pretrain_videomae_base_patch16_224'
    decoder_depth: int = 4
    mask_type: str = 'tube'
    mask_ratio: float = 0.75
    input_size: int = 224
    drop_path: float = 0.0
    normlize_target: bool = True
    opt: str = 'adamw'
    opt_eps: float = 1e-08
    opt_betas: tuple = None
    clip_grad: float = None
    momentum: float = 0.9
    weight_decay: float = 0.05
    weight_decay_end: float = None
    lr: float = 0.00015
    warmup_lr: float = 1e-06
    min_lr: float = 1e-05
    warmup_epochs: int = 40
    warmup_steps: int = -1
    use_checkpoint: bool = False
    color_jitter: float = 0.0
    train_interpolation: str = 'bicubic'
    data_path: str = '/kinetics-400'
    imagenet_default_mean_and_std: bool = True
    num_frames: int = 16
    sampling_rate: int = 4
    output_dir: str = 'experiment_artifact'
    log_dir: str = None
    device: str = 'cuda'
    seed: int = 0
    resume: str = ''
    auto_resume: bool = True
    start_epoch: int = 0
    num_workers: int = 10
    pin_mem: bool = True
    world_size: int = 1
    local_rank: int = -1
    dist_on_itp: bool = False
    dist_url: str = 'env://'


In [3]:
# opts = get_args()
args = Config()
if args.output_dir:
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

In [4]:
utils.init_distributed_mode(args)
device = torch.device(args.device)
# fix the seed for reproducibility
seed = args.seed + utils.get_rank()
torch.manual_seed(seed)
np.random.seed(seed)
cudnn.benchmark = True

Not using distributed mode


In [5]:
model = get_model(args)
patch_size = model.encoder.patch_embed.patch_size
print("Patch size = %s" % str(patch_size))
args.window_size = (args.num_frames // 2, args.input_size // patch_size[0], args.input_size // patch_size[1])
args.patch_size = patch_size

Creating model: pretrain_videomae_base_patch16_224
Patch size = (16, 16)


In [6]:
model

PretrainVisionTransformer(
  (encoder): PretrainVisionTransformerEncoder(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
        (norm1): LayerN

In [None]:
# torch.onnx.export(model, X_test, 'videoMAE.onnx', input_names=["features"], output_names=["logits"])

In [9]:
from masking_generator import TubeMaskingGenerator
tube_mask_generarator = TubeMaskingGenerator(args.window_size, args.mask_ratio)
from datasets import  DataAugmentationForVideoMAE
data_agumentor = DataAugmentationForVideoMAE(args)

In [10]:
from kinetics import VideoMAE
def build_pretraining_dataset(args):
    transform = DataAugmentationForVideoMAE(args)
    dataset = VideoMAE(
        root=None,
        setting=args.data_path,
        video_ext='mp4',
        is_color=True,
        modality='rgb',
        new_length=args.num_frames,
        new_step=args.sampling_rate,
        transform=transform,
        temporal_jitter=False,
        video_loader=True,
        use_decord=True,
        lazy_init=True)
    print("Data Aug = %s" % str(transform))
    return dataset

dataset_train = build_pretraining_dataset(args)

Data Aug = (DataAugmentationForVideoMAE,
  transform = Compose(
    <transforms.GroupMultiScaleCrop object at 0x7f529f7a6410>
    <transforms.Stack object at 0x7f529f7a6500>
    <transforms.ToTorchFormatTensor object at 0x7f529f7a63b0>
    <transforms.GroupNormalize object at 0x7f529f7b6740>
),
  Masked position generator = Maks: total patches 1568, mask patches 1176,
)


In [11]:
import glob
from itertools import repeat
test_video_files = glob.glob("/home/jahid/Downloads/VideoMAE/kinetics-400/*.mp4")

for i in range(5):  #copy it n times
    test_video_files += test_video_files

dataset_train.clips = list(zip(test_video_files, repeat(None, len(test_video_files))))
len(test_video_files)

224

In [12]:
for d in dataset_train:
    print(d[0].shape)
    break

torch.Size([3, 16, 224, 224])


In [13]:
num_tasks = utils.get_world_size()
global_rank = utils.get_rank()
sampler_rank = global_rank

total_batch_size = args.batch_size * num_tasks
num_training_steps_per_epoch = len(dataset_train) // total_batch_size

sampler_train = torch.utils.data.DistributedSampler(
    dataset_train, num_replicas=num_tasks, rank=sampler_rank, shuffle=True
)
print("Sampler_train = %s" % str(sampler_train))

Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x7f529f7c0280>


In [14]:
if global_rank == 0 and args.log_dir is not None:
    os.makedirs(args.log_dir, exist_ok=True)
    log_writer = utils.TensorboardLogger(log_dir=args.log_dir)
else:
    log_writer = None

In [15]:
data_loader_train = torch.utils.data.DataLoader(
    dataset_train, sampler=sampler_train,
    batch_size=2,#args.batch_size,
    num_workers=args.num_workers,
    pin_memory=args.pin_mem,
    drop_last=True,
    worker_init_fn=utils.seed_worker
)

In [16]:
model.to(device)
model_without_ddp = model
n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Model = %s" % str(model_without_ddp))
print('number of params: {} M'.format(n_parameters / 1e6))

args.lr = args.lr * total_batch_size / 256
args.min_lr = args.min_lr * total_batch_size / 256
args.warmup_lr = args.warmup_lr * total_batch_size / 256
print("LR = %.8f" % args.lr)
print("Batch size = %d" % total_batch_size)
print("Number of training steps = %d" % num_training_steps_per_epoch)
print("Number of training examples per epoch = %d" % (total_batch_size * num_training_steps_per_epoch))

Model = PretrainVisionTransformer(
  (encoder): PretrainVisionTransformerEncoder(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
    )
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
        (norm1)

In [17]:
if args.distributed:
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=False)
    model_without_ddp = model.module

optimizer = create_optimizer(
    args, model_without_ddp)
loss_scaler = NativeScaler()

print("Use step level LR & WD scheduler!")
lr_schedule_values = utils.cosine_scheduler(
    args.lr, args.min_lr, args.epochs, num_training_steps_per_epoch,
    warmup_epochs=args.warmup_epochs, warmup_steps=args.warmup_steps,
)
if args.weight_decay_end is None:
    args.weight_decay_end = args.weight_decay
wd_schedule_values = utils.cosine_scheduler(
    args.weight_decay, args.weight_decay_end, args.epochs, num_training_steps_per_epoch)
print("Max WD = %.7f, Min WD = %.7f" % (max(wd_schedule_values), min(wd_schedule_values)))

utils.auto_load_model(
    args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler)
torch.cuda.empty_cache()

Param groups = {
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "mask_token",
      "encoder.patch_embed.proj.bias",
      "encoder.blocks.0.norm1.weight",
      "encoder.blocks.0.norm1.bias",
      "encoder.blocks.0.attn.q_bias",
      "encoder.blocks.0.attn.v_bias",
      "encoder.blocks.0.attn.proj.bias",
      "encoder.blocks.0.norm2.weight",
      "encoder.blocks.0.norm2.bias",
      "encoder.blocks.0.mlp.fc1.bias",
      "encoder.blocks.0.mlp.fc2.bias",
      "encoder.blocks.1.norm1.weight",
      "encoder.blocks.1.norm1.bias",
      "encoder.blocks.1.attn.q_bias",
      "encoder.blocks.1.attn.v_bias",
      "encoder.blocks.1.attn.proj.bias",
      "encoder.blocks.1.norm2.weight",
      "encoder.blocks.1.norm2.bias",
      "encoder.blocks.1.mlp.fc1.bias",
      "encoder.blocks.1.mlp.fc2.bias",
      "encoder.blocks.2.norm1.weight",
      "encoder.blocks.2.norm1.bias",
      "encoder.blocks.2.attn.q_bias",
      "encoder.blocks.2.attn.v_bias",
      "encoder.blocks

In [18]:
print(f"Start training for {args.epochs} epochs")
start_time = time.time()
for epoch in range(args.start_epoch, args.epochs):
    if args.distributed:
        data_loader_train.sampler.set_epoch(epoch)
    if log_writer is not None:
        log_writer.set_step(epoch * num_training_steps_per_epoch)
    train_stats = train_one_epoch(
        model, data_loader_train,
        optimizer, device, epoch, loss_scaler,
        args.clip_grad, log_writer=log_writer,
        start_steps=epoch * num_training_steps_per_epoch,
        lr_schedule_values=lr_schedule_values,
        wd_schedule_values=wd_schedule_values,
        patch_size=patch_size[0],
        normlize_target=args.normlize_target,
    )
    if args.output_dir:
        if (epoch + 1) % args.save_ckpt_freq == 0 or epoch + 1 == args.epochs:
            utils.save_model(
                args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
                loss_scaler=loss_scaler, epoch=epoch)

    log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                    'epoch': epoch, 'n_parameters': n_parameters}

    if args.output_dir and utils.is_main_process():
        if log_writer is not None:
            log_writer.flush()
        with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
            f.write(json.dumps(log_stats) + "\n")

total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))

Start training for 800 epochs
Epoch: [0]  [  0/112]  eta: 0:06:08  lr: 0.000000  min_lr: 0.000000  loss: 1.4261 (1.4261)  loss_scale: 65536.0000 (65536.0000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9484 (0.9484)  time: 3.2900  data: 0.8717  max mem: 2248
Epoch: [0]  [ 10/112]  eta: 0:00:43  lr: 0.000003  min_lr: 0.000003  loss: 1.4139 (1.4000)  loss_scale: 65536.0000 (65536.0000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.9269 (0.9199)  time: 0.4300  data: 0.0794  max mem: 3355
Epoch: [0]  [ 20/112]  eta: 0:00:26  lr: 0.000006  min_lr: 0.000006  loss: 1.3688 (1.3726)  loss_scale: 65536.0000 (65536.0000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.8451 (0.8287)  time: 0.1428  data: 0.0002  max mem: 3355
Epoch: [0]  [ 30/112]  eta: 0:00:20  lr: 0.000009  min_lr: 0.000009  loss: 1.3056 (1.3451)  loss_scale: 65536.0000 (65536.0000)  weight_decay: 0.0500 (0.0500)  grad_norm: 0.6295 (0.7423)  time: 0.1422  data: 0.0002  max mem: 3355
Epoch: [0]  [ 40/112]  eta: 0:00:15  lr: 0.000013 

KeyboardInterrupt: 

In [11]:
import numpy as np

frames = 16 # number of frame per video
num_patches_per_frame = 16*16
num_masks_per_frame = int(16*16*.9) # 90% masking as mentined in the paper 

mask_per_frame = np.hstack([
    np.zeros(num_patches_per_frame - num_masks_per_frame),  # non-masked patch
    np.ones(num_masks_per_frame), # masked patch
])

np.random.shuffle(mask_per_frame)
mask_matrix = np.tile(mask_per_frame, (frames,1))
mask = mask_matrix.flatten()

print(mask_per_frame.shape, mask_matrix.shape, mask.shape)

(256,) (16, 256) (4096,)


In [5]:
help(np.tile)

Help on function tile in module numpy:

tile(A, reps)
    Construct an array by repeating A the number of times given by reps.
    
    If `reps` has length ``d``, the result will have dimension of
    ``max(d, A.ndim)``.
    
    If ``A.ndim < d``, `A` is promoted to be d-dimensional by prepending new
    axes. So a shape (3,) array is promoted to (1, 3) for 2-D replication,
    or shape (1, 1, 3) for 3-D replication. If this is not the desired
    behavior, promote `A` to d-dimensions manually before calling this
    function.
    
    If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it.
    Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as
    (1, 1, 2, 2).
    
    Note : Although tile may be used for broadcasting, it is strongly
    recommended to use numpy's broadcasting operations and functions.
    
    Parameters
    ----------
    A : array_like
        The input array.
    reps : array_like
        The number of repetitions of `A`

In [10]:
mask_matrix.shape

(16, 256)