In [1]:
import base64
import os
import ssl
import re
import random
from urllib import request
import cv2
import numpy as np
import ffmpeg
import shutil
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [3]:
UCF_ROOT_URL = 'https://www.crcv.ucf.edu/THUMOS14/UCF101/UCF101/'
context = ssl._create_unverified_context()

def ucf_label2name_dict():
  idx = request.urlopen(UCF_ROOT_URL, context=context).read().decode('utf-8')
  videos = sorted(list(set(re.findall('(v_[\w_]+\.avi)', idx))))
  output = {}
  for video in videos:
    label = re.findall('v_(.*)_g', video)[0]
    output.setdefault(label, []).append(video)
  return output

def fetch_ucf_video(video, destination):
  urlpath = request.urljoin(UCF_ROOT_URL, video)
  print(f'Fetching {urlpath} -> {destination}')
  data = request.urlopen(urlpath, context=context).read()
  open(destination, "wb").write(data)
  return destination

def prepare_ucf_dataset(destination='ucf101', num_per_class=None, random_sample=False):
  cnt = 0
  for label, videos in tqdm(ucf_label2name_dict().items()):
    if num_per_class:
      if random_sample:
        videos = random.sample(videos, num_per_class)
      else:
        videos = videos[:num_per_class]
    for video in videos:
      data_name = re.findall('(.*).avi', video)[0]
      data_dir = os.path.join(destination, label, data_name)
      os.makedirs(data_dir, exist_ok=True)
      video_dest = os.path.join(data_dir, video)
      audio_dest = os.path.join(data_dir, video.replace('.avi', '.wav'))
      if not os.path.exists(video_dest):
        fetch_ucf_video(video, video_dest)
        try:
          ffmpeg.run(ffmpeg.output(ffmpeg.input(video_dest), audio_dest))
        except:
          shutil.rmtree(os.path.join(destination, label))
          break
        cnt += 1
  return cnt

In [4]:
# print('video count:', prepare_ucf_dataset(num_per_class=200))

In [5]:
import torch
import torchaudio
from pytorchvideo.transforms import Permute
import matplotlib.pyplot as plt

def crop_center_square(frame):
  h, w = frame.shape[0:2]
  min_dim = min(h, w)
  sx = (w//2) - (min_dim//2)
  sy = (h//2) - (min_dim//2)
  return frame[sy : sy+min_dim, sx : sx+min_dim]

def load_video(path, n_frames=None):
  cap = cv2.VideoCapture(path)
  frames=[]
  frame_rate = cap.get(cv2.CAP_PROP_FPS)
  if not n_frames:
    n_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
  try:
    while True:
      ret, frame = cap.read()
      if not ret: break
      frame = crop_center_square(frame)
      frame = cv2.resize(frame, (224, 224))
      frame = frame[:, :, [2, 1, 0]]
      frames.append(frame)
      if n_frames:
        if len(frames) == n_frames : break
  finally:
    cap.release()
  video_tensor = torch.tensor(frames) / 255.0
  video_tensor = torch.permute(video_tensor, (0, 3, 1, 2)) # THWC -> TCHW
  video_shape = video_tensor.shape
  if video_shape[0] < n_frames:
    pad = torch.zeros((n_frames - video_shape[0], video_shape[1], video_shape[2], video_shape[3]))
    video_tensor = torch.concat((video_tensor, pad))
  return video_tensor, frame_rate

def load_audio(path, samples_per_frame=None, video_frame_rate=None, n_video_frames=None):
  waveform, raw_sample_rate = torchaudio.load(path)
  if samples_per_frame and video_frame_rate:
    sample_rate = samples_per_frame * video_frame_rate
    waveform = torchaudio.transforms.Resample(raw_sample_rate, sample_rate)(waveform)
  else: sample_rate = raw_sample_rate
  if waveform.shape[0] != 1:
    waveform = torch.mean(waveform, dim=0) #stereo -> mono
  waveform = waveform.squeeze().unsqueeze(1)
  if samples_per_frame and n_video_frames:
    n_samples = samples_per_frame * n_video_frames
    if waveform.shape[0] < n_samples:
      pad = torch.zeros(n_samples - waveform.shape[0], 1)
      waveform = torch.concat((waveform, pad))
    else:
      waveform = waveform[:n_samples]
  return waveform, sample_rate


In [6]:
video, fps = load_video('ucf101/ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01/v_ApplyEyeMakeup_g01_c01.avi', 16)
print(video.shape)
print(fps)

torch.Size([16, 3, 224, 224])
25.0


  video_tensor = torch.tensor(frames) / 255.0


In [7]:
audio, sample_rate = load_audio('ucf101/ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01/v_ApplyEyeMakeup_g01_c01.wav', samples_per_frame=1920, video_frame_rate=fps, n_video_frames=16)
print(audio.shape)
print(sample_rate)

torch.Size([30720, 1])
48000.0


In [8]:
import torchvision
from torchvision.datasets import DatasetFolder
from torch.utils.data import Dataset
import torch.nn.functional as F

def loader(path, n_frames, audio_samples_per_frame):
  datum = os.listdir(path)
  for data in datum:
    if data.endswith('.avi'):
      video, fps = load_video(os.path.join(path, data), n_frames=n_frames)
  for data in datum:
    if data.endswith('.wav'):
      audio, _ = load_audio(os.path.join(path, data),
                            samples_per_frame=audio_samples_per_frame,
                            video_frame_rate=fps,
                            n_video_frames=n_frames)
  return video, audio


class UCFDataset(DatasetFolder):
  def __init__(self, root, loader, n_video_frames=None, audio_samples_per_frame=None):
    self.root = root
    self.loader = loader
    self.n_frames = n_video_frames
    self.audio_samples_per_frames = audio_samples_per_frame
    self.classes, self.class_to_idx = super().find_classes(root)
    self.samples = self.make_dataset(self.root, self.class_to_idx)

  def make_dataset(self, directory, class_to_idx):
    instances = []
    for target_class in sorted(class_to_idx.keys()):
      class_idx = class_to_idx[target_class]
      target_dir = os.path.join(directory, target_class)
      if not os.path.isdir(target_dir):
        continue
      for data_name in sorted(os.listdir(target_dir)):
        path = os.path.join(target_dir, data_name)
        # item = path, F.one_hot(torch.tensor(class_idx), num_classes=self.num_labels)
        item = path, torch.tensor(class_idx)
        instances.append(item)
    return instances

  def __getitem__(self, idx):
    path, target = self.samples[idx]
    (video, audio) = self.loader(path, self.n_frames, self.audio_samples_per_frames)
    return video, audio, target
  
  @property
  def num_labels(self):
    return len(os.listdir(self.root))
  
  @property
  def id2label(self):
    return {v:k for k,v in self.class_to_idx.items()}
  
  @property
  def label2id(self):
    return self.class_to_idx


class SetTransform(Dataset):
  def __init__(self, dataset, transform):
    self.dataset = dataset
    self.transform = transform

  def __getitem__(self, idx):
    video, audio, label = self.dataset[idx]
    video = self.transform(video)
    return video, audio, label

  def __len__(self):
    return len(self.dataset)


In [9]:
from torchvision.transforms import (Compose,
                                    Resize)
from transformers import PerceiverForMultimodalAutoencoding

pretrained_model = PerceiverForMultimodalAutoencoding.from_pretrained("deepmind/multimodal-perceiver")
all_dataset = UCFDataset('ucf101',
                         loader=loader,
                         n_video_frames=pretrained_model.config.num_frames,
                         audio_samples_per_frame=pretrained_model.config.audio_samples_per_frame)
len(all_dataset)

6586

In [10]:
val_size = 1000
test_size = 1000
train_size = len(all_dataset) - val_size - test_size

seed = 0
test_dataset, trainval_dataset = torch.utils.data.random_split(all_dataset, [test_size, train_size + val_size], generator=torch.Generator().manual_seed(seed))
train_dataset, val_dataset = torch.utils.data.random_split(trainval_dataset, [train_size, val_size], generator=torch.Generator().manual_seed(seed))

print('train:', len(train_dataset))
print('validation:', len(val_dataset))
print('test:', len(test_dataset))

train: 4586
validation: 1000
test: 1000


In [11]:
video, audio, label = all_dataset[0]
print(video.shape)
print(audio.shape)
print(label.shape)

torch.Size([16, 3, 224, 224])
torch.Size([30720, 1])
torch.Size([])


In [12]:
def collate_fn(examples):
    videos, audios, labels = zip(*examples)
    videos = torch.stack(videos)
    audios = torch.stack(audios)
    onehot_labels = F.one_hot(torch.stack(labels), num_classes=all_dataset.num_labels)
    labels = torch.stack(labels)
    subsampled_output_points = {
        "image": torch.arange(0, 0),
        "audio": torch.arange(0, 0),
        "label": None,
    }
    return dict(image=videos,
                audio=audios,
                # label=onehot_labels,
                labels=labels,
                subsampled_output_points=subsampled_output_points)

In [13]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=4)

batch = next(iter(train_dataloader))
def print_dict(d, c):
    for k,v in d.items():
        if isinstance(v, dict):
            print('\t'*c, k)
            print_dict(v, c+1)
        elif isinstance(v, torch.Tensor):
            print('\t'*c, k)
            print('\t'*(c+1), v.shape)
        else:
            print('\t'*c, k)
            print('\t'*(c+1), v)
print_dict(batch, 0)

 image
	 torch.Size([4, 16, 3, 224, 224])
 audio
	 torch.Size([4, 30720, 1])
 labels
	 torch.Size([4])
 subsampled_output_points
	 image
		 torch.Size([0])
	 audio
		 torch.Size([0])
	 label
		 None


In [14]:
import torch.nn as nn
from transformers.models.perceiver.modeling_perceiver import PerceiverClassifierOutput

class TrainablePerceiverForMultimodalAutoencoding(PerceiverForMultimodalAutoencoding):
    def forward(
        self,
        image=None,
        audio=None,
        attention_mask=None,
        subsampled_output_points=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        labels=None,
        return_dict=None,
    ):
        return_dict = self.config.use_return_dict
        inputs = {
            'image': image,
            'audio': audio,
            'label': F.one_hot(labels, num_classes=self.config.num_labels),
        }
        outputs = self.perceiver(
            inputs=inputs,
            attention_mask=attention_mask,
            subsampled_output_points=subsampled_output_points,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = outputs.logits if return_dict else outputs[0]

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits['label'].view(-1, self.config.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return PerceiverClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

In [15]:
model = TrainablePerceiverForMultimodalAutoencoding.from_pretrained("deepmind/multimodal-perceiver",
                                                                    num_labels=all_dataset.num_labels,
                                                                    id2label=all_dataset.id2label,
                                                                    label2id=all_dataset.label2id,
                                                                    ignore_mismatched_sizes=True)

Some weights of TrainablePerceiverForMultimodalAutoencoding were not initialized from the model checkpoint at deepmind/multimodal-perceiver and are newly initialized because the shapes did not match:
- perceiver.input_preprocessor.padding.audio: found shape torch.Size([1, 303]) in the checkpoint and torch.Size([1, 4]) in the model instantiated
- perceiver.input_preprocessor.padding.image: found shape torch.Size([1, 461]) in the checkpoint and torch.Size([1, 162]) in the model instantiated
- perceiver.input_preprocessor.padding.label: found shape torch.Size([1, 4]) in the checkpoint and torch.Size([1, 356]) in the model instantiated
- perceiver.input_preprocessor.mask.audio: found shape torch.Size([1, 704]) in the checkpoint and torch.Size([1, 405]) in the model instantiated
- perceiver.input_preprocessor.mask.image: found shape torch.Size([1, 704]) in the checkpoint and torch.Size([1, 405]) in the model instantiated
- perceiver.input_preprocessor.mask.label: found shape torch.Size([1, 

In [16]:
from transformers import TrainingArguments, Trainer

metric_name = "accuracy"

args = TrainingArguments(
    f"perceiver-ucf",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_dir='logs',
    remove_unused_columns=False,
)

In [17]:
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [18]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

In [19]:
# inputs = next(iter(train_dataloader))
# nchunks = 128
# image_chunk_size = np.prod((16, 224, 224)) // nchunks
# audio_chunk_size = audio.shape[1] // model.config.samples_per_patch // nchunks
# print(model.config.samples_per_patch)
# # process the first chunk
# chunk_idx = 0
# subsampling = {
#     "image": torch.arange(0, 0),
#     "audio": torch.arange(0, 0),
#     "label": None,
# }
# print(inputs['inputs']['audio'].shape)

# outputs = model(**inputs, subsampled_output_points=subsampling)
# print(outputs.logits['audio'].shape)
# print(outputs.logits['image'].shape)
# print(outputs.logits['label'].shape)

In [20]:
trainer.train()

***** Running training *****
  Num examples = 4586
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3441
  Number of trainable parameters = 18552896
  0%|          | 0/3441 [00:00<?, ?it/s]Could not estimate the number of tokens of the input, floating-point operations will not be computed
 15%|█▍        | 500/3441 [19:15<1:52:06,  2.29s/it]

{'loss': 3.9216, 'learning_rate': 4.2734670154025e-05, 'epoch': 0.44}


 29%|██▉       | 1000/3441 [38:34<1:32:08,  2.27s/it]

{'loss': 3.9057, 'learning_rate': 3.5469340308049985e-05, 'epoch': 0.87}


 33%|███▎      | 1147/3441 [44:10<1:13:35,  1.92s/it]***** Running Evaluation *****
  Num examples = 1000
  Batch size = 4


AxisError: axis 1 is out of bounds for array of dimension 1

In [None]:
model.state_dict