In [37]:
import base64
import os
import ssl
import re
import random
from urllib import request
import cv2
import imageio
import numpy as np
import scipy.io.wavfile
import ffmpeg
import shutil

from IPython.display import HTML

In [48]:
# Base URL under which the UCF101 index page and the individual .avi files are fetched.
UCF_ROOT_URL = 'https://www.crcv.ucf.edu/THUMOS14/UCF101/UCF101/'
# NOTE(review): this SSL context performs no certificate verification, so every
# request made with it is exposed to man-in-the-middle tampering.
context = ssl._create_unverified_context()

def ucf_label2name_dict():
  """Group the video file names listed at ``UCF_ROOT_URL`` by action label.

  The index page is downloaded and scanned for names of the form
  ``v_<label>_g....avi``; duplicates are dropped and names are sorted.

  Returns:
    dict mapping each label (str) to the sorted list of its video file
    names (str).
  """
  # Close the HTTP response deterministically instead of leaving it to the GC.
  with request.urlopen(UCF_ROOT_URL, context=context) as response:
    idx = response.read().decode('utf-8')
  # Raw strings: `\w` and `\.` are regex escapes, not Python string escapes
  # (non-raw usage triggers an invalid-escape warning on current Python).
  videos = sorted(set(re.findall(r'(v_[\w_]+\.avi)', idx)))
  output = {}
  for video in videos:
    # The label is everything between the leading 'v_' and the last '_g'.
    label = re.findall(r'v_(.*)_g', video)[0]
    output.setdefault(label, []).append(video)
  return output

def fetch_ucf_video(video, destination):
  """Download one video from ``UCF_ROOT_URL`` and write it to ``destination``.

  Args:
    video: file name of the video, relative to ``UCF_ROOT_URL``.
    destination: local path the downloaded bytes are written to.

  Returns:
    ``destination``, unchanged.
  """
  # Import from urllib.parse directly; urllib.request only re-exports urljoin
  # as an implementation detail.
  from urllib.parse import urljoin

  urlpath = urljoin(UCF_ROOT_URL, video)
  print(f'Fetching {urlpath} -> {destination}')
  # Read the full payload before touching the disk so a failed download does
  # not leave a partial file behind at `destination`.
  with request.urlopen(urlpath, context=context) as response:
    data = response.read()
  # Context manager guarantees the file is flushed and closed.
  with open(destination, "wb") as out_file:
    out_file.write(data)
  return destination

def prepare_ucf_dataset(destination='ucf101', num_per_class=None, random_sample=False):
  """Download UCF101 videos and extract their audio tracks into ``destination``.

  Each video is stored as ``<destination>/<label>/<video stem>/<video>.avi``
  with a sibling ``.wav`` produced by ffmpeg. Videos whose ``.avi`` already
  exists are skipped. If ffmpeg fails for any video of a class, the whole
  class directory is removed and the remaining videos of that class are
  skipped.

  Args:
    destination: root directory of the prepared dataset.
    num_per_class: if truthy, maximum number of videos taken per class.
    random_sample: when ``num_per_class`` is set, pick the videos at random
      instead of taking the first ones in sorted order.

  Returns:
    Number of videos fetched by this call that are still on disk when it
    returns (videos of classes that were removed are not counted).
  """
  cnt = 0
  for label, videos in ucf_label2name_dict().items():
    if num_per_class:
      if random_sample:
        # random.sample raises ValueError when asked for more items than the
        # population holds, so clamp to the class size.
        videos = random.sample(videos, min(num_per_class, len(videos)))
      else:
        videos = videos[:num_per_class]
    fetched = 0  # videos of this class fetched during this call
    for video in videos:
      data_name = os.path.splitext(video)[0]
      data_dir = os.path.join(destination, label, data_name)
      os.makedirs(data_dir, exist_ok=True)
      video_dest = os.path.join(data_dir, video)
      audio_dest = os.path.join(data_dir, data_name + '.wav')
      if not os.path.exists(video_dest):
        fetch_ucf_video(video, video_dest)
        try:
          ffmpeg.run(ffmpeg.output(ffmpeg.input(video_dest), audio_dest))
        except ffmpeg.Error:
          # Only ffmpeg failures drop the class; a bare `except` would also
          # swallow KeyboardInterrupt or a missing ffmpeg binary and silently
          # delete data.
          shutil.rmtree(os.path.join(destination, label))
          fetched = 0  # everything fetched for this class was just deleted
          break
        fetched += 1
    cnt += fetched
  return cnt

Note: about 1.5 h (presumably the running time of the download cell below; confirm).

In [None]:
# Fetch at most 50 videos per class (the first 50 in sorted order, since
# random_sample defaults to False) and print the count returned.
print('video count:', prepare_ucf_dataset(num_per_class=50))

In [69]:
# Print every class folder under 'ucf101' with the number of entries it holds.
for label in sorted(os.listdir('ucf101')):
    entries = os.listdir(os.path.join('ucf101', label))
    print(label, len(entries))

ApplyEyeMakeup 50
ApplyLipstick 50
Archery 50
BabyCrawling 50
BalanceBeam 50
BandMarching 50
BasketballDunk 50
BlowDryHair 50
BlowingCandles 50
BodyWeightSquats 50
Bowling 50
BoxingPunchingBag 50
BoxingSpeedBag 50
BrushingTeeth 50
CliffDiving 50
CricketBowling 50
CricketShot 50
CuttingInKitchen 50
FieldHockeyPenalty 50
FloorGymnastics 50
FrisbeeCatch 50
FrontCrawl 50
Haircut 50
HammerThrow 50
Hammering 50
HandstandPushups 50
HandstandWalking 50
HeadMassage 50
IceDancing 50
Knitting 50
LongJump 50
MoppingFloor 50
ParallelBars 50
PlayingCello 50
PlayingDaf 50
PlayingDhol 50
PlayingFlute 50
PlayingSitar 50
Rafting 50
ShavingBeard 50
Shotput 50
SkyDiving 50
SoccerPenalty 50
StillRings 50
SumoWrestling 50
Surfing 50
TableTennisShot 50
Typing 50
UnevenBars 50
WallPushups 50
WritingOnBoard 50


In [51]:
import torch
import torchaudio
import matplotlib.pyplot as plt

def crop_center_square(frame):
  """Return the largest centred square region of ``frame``.

  Args:
    frame: array whose first two axes are height and width.

  Returns:
    A slice of ``frame`` whose height and width both equal the smaller of
    the two input dimensions, centred on the input.
  """
  height, width = frame.shape[:2]
  side = min(height, width)
  half = side // 2
  top = height // 2 - half
  left = width // 2 - half
  return frame[top:top + side, left:left + side]

def load_video(path, max_time=None, resize=(224, 224)):
  """Decode a video file into a float tensor of frames scaled to [0, 1].

  Args:
    path: path of the video file handed to ``cv2.VideoCapture``.
    max_time: if truthy, stop reading once the capture position reaches
      this many seconds.
    resize: currently unused; kept so existing callers keep working.

  Returns:
    Tensor holding the decoded frames stacked along a new first axis, with
    the last axis reversed from OpenCV's channel order and values divided
    by 255. An empty tensor is returned when no frame could be read.
  """
  cap = cv2.VideoCapture(path)
  frames = []
  try:
    while True:
      ret, frame = cap.read()
      if not ret:
        break
      # Reverse the channel axis (OpenCV's BGR order -> RGB).
      frames.append(frame[:, :, [2, 1, 0]])
      if max_time and cap.get(cv2.CAP_PROP_POS_MSEC) * 1e-3 >= max_time:
        break
  finally:
    cap.release()
  if not frames:
    # Nothing decoded: keep the original result (an empty float tensor).
    return torch.tensor(frames) / 255.0
  # Stack once in NumPy and share the buffer with torch; torch.tensor() on a
  # list of ndarrays converts element by element and is very slow.
  return torch.from_numpy(np.stack(frames)) / 255.0

def load_audio(path, max_time=None):
  """Load an audio file with torchaudio, optionally truncated in time.

  Args:
    path: path of the audio file passed to ``torchaudio.load``.
    max_time: if truthy, keep only the first ``int(max_time * sample_rate)``
      entries along the second axis of the waveform.

  Returns:
    Tuple ``(waveform, sample_rate)`` as returned by ``torchaudio.load``,
    with the waveform possibly truncated.
  """
  waveform, sample_rate = torchaudio.load(path)
  if not max_time:
    return waveform, sample_rate
  n_keep = int(max_time * sample_rate)
  return waveform[:, :n_keep], sample_rate


In [73]:
import torchvision
from torchvision.datasets import DatasetFolder
from torch.utils.data import Dataset

def loader(path, max_time):
  """Load the video/audio pair stored in one sample directory.

  Args:
    path: directory expected to contain an ``.avi`` and a ``.wav`` file.
    max_time: forwarded to ``load_video`` and ``load_audio`` as ``max_time``.

  Returns:
    Tuple ``(video, audio)``: the value returned by ``load_video`` and the
    waveform returned by ``load_audio``. If the directory holds several
    matching files, the last one listed by ``os.listdir`` wins.

  Raises:
    FileNotFoundError: if the directory lacks an ``.avi`` or a ``.wav`` file.
  """
  video = audio = None
  for data in os.listdir(path):
    file_path = os.path.join(path, data)
    if data.endswith('.avi'):
      video = load_video(file_path, max_time=max_time)
    elif data.endswith('.wav'):
      audio, _ = load_audio(file_path, max_time=max_time)
  # Fail with a clear message instead of an UnboundLocalError on return.
  missing = [ext for ext, item in (('.avi', video), ('.wav', audio)) if item is None]
  if missing:
    raise FileNotFoundError(f'{path} has no {" or ".join(missing)} file')
  return video, audio


class UCFDataset(DatasetFolder):
  """Dataset over a ``root/<class>/<sample dir>/`` tree of video/audio pairs.

  Every sub-directory of a class folder is one sample; ``loader`` is called
  with the sample directory and ``max_time`` to produce the sample.

  ``DatasetFolder.__init__`` is deliberately not called: its file-extension
  based scan does not apply to directory-valued samples.
  """

  def __init__(self, root, loader, max_time=None):
    """Index the dataset.

    Args:
      root: directory containing one folder per class.
      loader: callable ``loader(path, max_time)`` returning a sample.
      max_time: value forwarded to ``loader`` for every sample.
    """
    self.root = root
    self.loader = loader
    self.max_time = max_time
    self.classes, self.class_to_idx = super().find_classes(root)
    self.samples = self.make_dataset(self.root, self.class_to_idx)
    # Class index of every sample, in the same order as `samples`.
    self.targets = [class_idx for _, class_idx in self.samples]

  def make_dataset(self, directory, class_to_idx):
    """Return the sorted list of ``(sample_dir, class_idx)`` pairs.

    Args:
      directory: dataset root.
      class_to_idx: mapping from class folder name to class index.
    """
    instances = []
    for target_class in sorted(class_to_idx.keys()):
      class_idx = class_to_idx[target_class]
      target_dir = os.path.join(directory, target_class)
      if not os.path.isdir(target_dir):
        continue
      for data_name in sorted(os.listdir(target_dir)):
        path = os.path.join(target_dir, data_name)
        # Samples are directories; a stray file here (e.g. .DS_Store) would
        # make the loader fail when it tries to list it.
        if not os.path.isdir(path):
          continue
        instances.append((path, class_idx))
    return instances

  def __getitem__(self, idx):
    """Return ``(sample, target)`` for the sample at position ``idx``."""
    path, target = self.samples[idx]
    sample = self.loader(path, self.max_time)
    return sample, target


class SetTransform(Dataset):
  """Wrap a dataset of ``((video, audio), label)`` items, transforming the video.

  Only the video element is passed through ``transform``; the audio and the
  label are returned untouched.
  """

  def __init__(self, dataset, transform):
    """Store the wrapped ``dataset`` and the ``transform`` applied to videos."""
    self.dataset, self.transform = dataset, transform

  def __len__(self):
    """Number of items in the wrapped dataset."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return item ``idx`` of the wrapped dataset with its video transformed."""
    sample, label = self.dataset[idx]
    video, audio = sample
    return (self.transform(video), audio), label


In [74]:
# Index every sample directory under 'ucf101'; max_time=5 is forwarded to the
# loader, which uses it to truncate both the video and the audio of a sample.
all_dataset = UCFDataset('ucf101', loader=loader, max_time=5)

In [76]:
# Sanity check: load the first sample and show the tensor shapes and class index.
(video, audio), label = all_dataset[0]
print(video.shape)  # frames stacked along the first axis
print(audio.shape)  # waveform as returned by torchaudio.load, truncated to max_time
print(label)

torch.Size([125, 240, 320, 3])
torch.Size([2, 220500])
0
