In [1]:
import base64
import os
import ssl
import re
import random
from urllib import request
import cv2
import imageio
import numpy as np
import scipy.io.wavfile
import ffmpeg
import shutil
from tqdm import tqdm

from IPython.display import HTML

In [2]:
UCF_ROOT_URL = 'https://www.crcv.ucf.edu/THUMOS14/UCF101/UCF101/'
context = ssl._create_unverified_context()

def ucf_label2name_dict():
  idx = request.urlopen(UCF_ROOT_URL, context=context).read().decode('utf-8')
  videos = sorted(list(set(re.findall('(v_[\w_]+\.avi)', idx))))
  output = {}
  for video in videos:
    label = re.findall('v_(.*)_g', video)[0]
    output.setdefault(label, []).append(video)
  return output

def fetch_ucf_video(video, destination):
  urlpath = request.urljoin(UCF_ROOT_URL, video)
  print(f'Fetching {urlpath} -> {destination}')
  data = request.urlopen(urlpath, context=context).read()
  open(destination, "wb").write(data)
  return destination

def prepare_ucf_dataset(destination='ucf101', num_per_class=None, random_sample=False):
  cnt = 0
  for label, videos in tqdm(ucf_label2name_dict().items()):
    if num_per_class:
      if random_sample:
        videos = random.sample(videos, num_per_class)
      else:
        videos = videos[:num_per_class]
    for video in videos:
      data_name = re.findall('(.*).avi', video)[0]
      data_dir = os.path.join(destination, label, data_name)
      os.makedirs(data_dir, exist_ok=True)
      video_dest = os.path.join(data_dir, video)
      audio_dest = os.path.join(data_dir, video.replace('.avi', '.wav'))
      if not os.path.exists(video_dest):
        fetch_ucf_video(video, video_dest)
        try:
          ffmpeg.run(ffmpeg.output(ffmpeg.input(video_dest), audio_dest))
        except:
          shutil.rmtree(os.path.join(destination, label))
          break
        cnt += 1
  return cnt

1.5h

In [3]:
print('video count:', prepare_ucf_dataset(num_per_class=200))

KeyboardInterrupt: 

In [None]:
cnt=0
for label in sorted(os.listdir('ucf101')):
    print(label, len(os.listdir(os.path.join('ucf101', label))))
    cnt+=1
print(cnt)

ApplyEyeMakeup 145
ApplyLipstick 114
Archery 145
BabyCrawling 132
BalanceBeam 108
BandMarching 155
BlowDryHair 131
BlowingCandles 109
BodyWeightSquats 112
Bowling 155
BoxingPunchingBag 163
BoxingSpeedBag 134
BrushingTeeth 131
CliffDiving 138
CricketBowling 139
CricketShot 167
CuttingInKitchen 110
FieldHockeyPenalty 126
FloorGymnastics 125
FrisbeeCatch 126
FrontCrawl 137
Haircut 130
HammerThrow 150
Hammering 140
HandstandWalking 111
HeadMassage 147
IceDancing 158
Knitting 123
LongJump 131
MoppingFloor 110
ParallelBars 114
PlayingCello 164
PlayingDaf 151
PlayingDhol 164
PlayingFlute 155
PlayingSitar 157
Rafting 111
ShavingBeard 161
Shotput 144
SkyDiving 110
SoccerPenalty 137
StillRings 112
SumoWrestling 116
Surfing 126
TableTennisShot 140
Typing 136
UnevenBars 104
WallPushups 130
WritingOnBoard 152
49


In [3]:
import torch
import torchaudio
from pytorchvideo.transforms import Permute
import matplotlib.pyplot as plt

def crop_center_square(frame):
  h, w = frame.shape[0:2]
  min_dim = min(h, w)
  sx = (w//2) - (min_dim//2)
  sy = (h//2) - (min_dim//2)
  return frame[sy : sy+min_dim, sx : sx+min_dim]

def load_video(path, n_frames=None):
  cap = cv2.VideoCapture(path)
  frames=[]
  frame_rate = cap.get(cv2.CAP_PROP_FPS)
  if not n_frames:
    n_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
  try:
    while True:
      ret, frame = cap.read()
      if not ret: break
      frame = crop_center_square(frame)
      frame = cv2.resize(frame, (224, 224))
      frame = frame[:, :, [2, 1, 0]]
      frames.append(frame)
      if n_frames:
        if len(frames) == n_frames : break
  finally:
    cap.release()
  video_tensor = torch.tensor(frames) / 255.0
  video_tensor = torch.permute(video_tensor, (0, 3, 1, 2))
  video_shape = video_tensor.shape
  if video_shape[0] < n_frames:
    pad = torch.zeros((n_frames - video_shape[0], video_shape[1], video_shape[2], video_shape[3]))
    video_tensor = torch.concat((video_tensor, pad))
  return video_tensor, n_frames / frame_rate

def load_audio(path, len_time=None, sample_rate=None):
  waveform, raw_sample_rate = torchaudio.load(path)
  if sample_rate:
    waveform = torchaudio.transforms.Resample(raw_sample_rate, sample_rate)(waveform)
  else: sample_rate = raw_sample_rate
  if waveform.shape[0] != 1:
    waveform = torch.mean(waveform, dim=0) #stereo -> mono
  waveform = waveform.squeeze()
  if len_time:
    n_frames = int(len_time * sample_rate)
    waveform = waveform[:n_frames]
  if waveform.shape[0] < n_frames:
    pad = torch.zeros(n_frames - waveform.shape[0])
    waveform = torch.concat((waveform, pad))
  return waveform.unsqueeze(0).unsqueeze(-1), sample_rate


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
l=[]
fps = []
dir = list(os.walk('ucf101'))
for root, dirs, files in tqdm(dir):
    for f in files:
        video = os.path.join(root, f)
        if video.endswith('.avi'):
            cap = cv2.VideoCapture(video)
            l.append(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps.append(cap.get(cv2.CAP_PROP_FPS))
            cap.release()
print(sorted(l))
print(sorted(fps))


In [None]:
load_video('ucf101/ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01/v_ApplyEyeMakeup_g01_c01.avi', 100)

(tensor([[[[0.2471, 0.2471, 0.2431,  ..., 0.9294, 0.9294, 0.9294],
           [0.2471, 0.2471, 0.2431,  ..., 0.9294, 0.9294, 0.9294],
           [0.2471, 0.2471, 0.2431,  ..., 0.9294, 0.9294, 0.9294],
           ...,
           [0.0549, 0.0784, 0.0784,  ..., 0.4627, 0.4745, 0.4784],
           [0.0157, 0.0314, 0.1569,  ..., 0.4627, 0.4745, 0.4784],
           [0.0745, 0.0745, 0.2392,  ..., 0.4627, 0.4745, 0.4784]],
 
          [[0.2471, 0.2471, 0.2431,  ..., 0.9294, 0.9294, 0.9294],
           [0.2471, 0.2471, 0.2431,  ..., 0.9294, 0.9294, 0.9294],
           [0.2471, 0.2471, 0.2431,  ..., 0.9294, 0.9294, 0.9294],
           ...,
           [0.0549, 0.0784, 0.0784,  ..., 0.4627, 0.4745, 0.4784],
           [0.0157, 0.0314, 0.1569,  ..., 0.4627, 0.4745, 0.4784],
           [0.0745, 0.0745, 0.2392,  ..., 0.4627, 0.4745, 0.4784]],
 
          [[0.2471, 0.2471, 0.2431,  ..., 0.9294, 0.9294, 0.9294],
           [0.2471, 0.2471, 0.2431,  ..., 0.9294, 0.9294, 0.9294],
           [0.2471, 0.24

In [None]:
load_audio('ucf101/ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01/v_ApplyEyeMakeup_g01_c01.wav', max_time=3)

(tensor([ 0.0000,  0.0000,  0.0000,  ..., -0.0625, -0.0687, -0.0761]), 44100)

In [4]:
import torchvision
from torchvision.datasets import DatasetFolder
from torch.utils.data import Dataset
import torch.nn.functional as F

def loader(path, n_frames, sample_rate):
  datum = os.listdir(path)
  for data in datum:
    if data.endswith('.avi'):
      video, time = load_video(os.path.join(path, data), n_frames=n_frames)
  for data in datum:
    if data.endswith('.wav'):
      audio, _ = load_audio(os.path.join(path, data), len_time=time, sample_rate=sample_rate)
  return video, audio


class UCFDataset(DatasetFolder):
  def __init__(self, root, loader, n_video_frames=None, audio_sample_rate=None):
    self.root = root
    self.loader = loader
    self.n_frames = n_video_frames
    self.sample_rate = audio_sample_rate
    self.classes, self.class_to_idx = super().find_classes(root)
    self.samples = self.make_dataset(self.root, self.class_to_idx)

  def make_dataset(self, directory, class_to_idx):
    instances = []
    for target_class in sorted(class_to_idx.keys()):
      class_idx = class_to_idx[target_class]
      target_dir = os.path.join(directory, target_class)
      if not os.path.isdir(target_dir):
        continue
      for data_name in sorted(os.listdir(target_dir)):
        path = os.path.join(target_dir, data_name)
        item = path, F.one_hot(torch.tensor(class_idx), num_classes=self.num_labels)
        instances.append(item)
    return instances

  def __getitem__(self, idx):
    path, target = self.samples[idx]
    (video, audio) = self.loader(path, self.n_frames, self.sample_rate)
    return video, audio, target
  
  @property
  def num_labels(self):
    return len(os.listdir(self.root))
  
  @property
  def id2label(self):
    return {v:k for k,v in self.class_to_idx.items()}
  
  @property
  def label2id(self):
    return self.class_to_idx


class SetTransform(Dataset):
  def __init__(self, dataset, transform):
    self.dataset = dataset
    self.transform = transform

  def __getitem__(self, idx):
    video, audio, label = self.dataset[idx]
    video = self.transform(video)
    return video, audio, label

  def __len__(self):
    return len(self.dataset)


In [5]:
from torchvision.transforms import (Compose,
                                    Resize)

all_dataset = UCFDataset('ucf101', loader=loader, n_video_frames=20, audio_sample_rate=16000)
len(all_dataset)

6586

In [6]:
val_size = 1000
test_size = 1000
train_size = len(all_dataset) - val_size - test_size

seed = 0
test_dataset, trainval_dataset = torch.utils.data.random_split(all_dataset, [test_size, train_size + val_size], generator=torch.Generator().manual_seed(seed))
train_dataset, val_dataset = torch.utils.data.random_split(trainval_dataset, [train_size, val_size], generator=torch.Generator().manual_seed(seed))

print('train:', len(train_dataset))
print('validation:', len(val_dataset))
print('test:', len(test_dataset))

train: 4586
validation: 1000
test: 1000


In [7]:
video, audio, label = all_dataset[0]
print(video.shape)
print(audio.shape)
print(label.shape)

torch.Size([20, 3, 224, 224])
torch.Size([1, 12800, 1])
torch.Size([49])


  video_tensor = torch.tensor(frames) / 255.0


In [8]:
def collate_fn(examples):
    videos, audios, labels = zip(*examples)
    videos = torch.stack(videos)
    audios = torch.stack(audios)
    labels = torch.stack(labels)
    return {'inputs': {'image': videos, 'audio': audios, 'label': labels}}

In [9]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=2)

batch = next(iter(train_dataloader))
for k,v in batch.items():
    for _,v_ in v.items():
        print(v_.shape)

torch.Size([2, 20, 3, 224, 224])
torch.Size([2, 1, 12800, 1])
torch.Size([2, 49])


In [10]:
from transformers import PerceiverForMultimodalAutoencoding

model = PerceiverForMultimodalAutoencoding.from_pretrained("deepmind/multimodal-perceiver",
                                                           num_labels = all_dataset.num_labels,
                                                           id2label=all_dataset.id2label,
                                                           label2id=all_dataset.label2id,
                                                           ignore_mismatched_sizes=True)


Some weights of PerceiverForMultimodalAutoencoding were not initialized from the model checkpoint at deepmind/multimodal-perceiver and are newly initialized because the shapes did not match:
- perceiver.input_preprocessor.padding.audio: found shape torch.Size([1, 303]) in the checkpoint and torch.Size([1, 4]) in the model instantiated
- perceiver.input_preprocessor.padding.image: found shape torch.Size([1, 461]) in the checkpoint and torch.Size([1, 162]) in the model instantiated
- perceiver.input_preprocessor.padding.label: found shape torch.Size([1, 4]) in the checkpoint and torch.Size([1, 356]) in the model instantiated
- perceiver.input_preprocessor.mask.audio: found shape torch.Size([1, 704]) in the checkpoint and torch.Size([1, 405]) in the model instantiated
- perceiver.input_preprocessor.mask.image: found shape torch.Size([1, 704]) in the checkpoint and torch.Size([1, 405]) in the model instantiated
- perceiver.input_preprocessor.mask.label: found shape torch.Size([1, 704]) in 

In [11]:
from transformers import TrainingArguments, Trainer

metric_name = "accuracy"

args = TrainingArguments(
    f"perceiver-ucf",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_dir='logs',
    remove_unused_columns=False,
)

In [12]:
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [13]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()

***** Running training *****
  Num examples = 4586
  Num Epochs = 3
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 1377
  Number of trainable parameters = 18552896
  0%|          | 0/1377 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.86 GiB (GPU 0; 9.77 GiB total capacity; 6.03 GiB already allocated; 1.86 GiB free; 6.49 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF