In [1]:
# Install the required libraries.
# NOTE(review): apart from the TensorFlow lower bound these installs are unpinned;
# consider pinning versions so the notebook is reproducible.
!pip install -U "tensorflow>=2.10.0"  # Ensure you have TensorFlow v2.10 or later
# remotezip is used below to read the archive listing from a URL; tqdm and
# OpenCV (cv2) are imported in the imports cell.
!pip install remotezip tqdm opencv-python
# Provides the tensorflow_docs package (tensorflow_docs.vis.embed is imported below).
!pip install -q git+https://github.com/tensorflow/docs


Collecting remotezip
  Downloading remotezip-0.12.3-py3-none-any.whl.metadata (7.2 kB)
Downloading remotezip-0.12.3-py3-none-any.whl (8.1 kB)
Installing collected packages: remotezip
Successfully installed remotezip-0.12.3
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for tensorflow-docs (setup.py) ... [?25l[?25hdone


In [2]:
import tqdm
import random
import pathlib
import collections
import os
import cv2
import numpy as np
import remotezip as rz
import tensorflow as tf
import imageio
from IPython import display
from tensorflow_docs.vis import embed


In [3]:
# Location of the remote ZIP archive of UCF101 videos. The notebook reads the
# archive's file listing from this URL through remotezip.
URL = 'https://storage.googleapis.com/thumos14_files/UCF101_videos.zip'


In [4]:
def list_files_from_zip_url(zip_url):
    """ List the names of all entries stored in a remote ZIP archive.

        Args:
          zip_url: A URL with a ZIP file containing data.

        Returns:
          List with the `filename` of every entry reported by the archive's `infolist()`.
    """
    with rz.RemoteZip(zip_url) as archive:
        # The list is fully built before the archive is closed on leaving the `with` block.
        return [entry.filename for entry in archive.infolist()]

# Fetch the archive listing and keep only the .avi entries (the videos).
files = [name for name in list_files_from_zip_url(URL) if name.endswith('.avi')]
# Display the first 10 video filenames.
print(files[:10])


['UCF101/v_ApplyEyeMakeup_g01_c01.avi', 'UCF101/v_ApplyEyeMakeup_g01_c02.avi', 'UCF101/v_ApplyEyeMakeup_g01_c03.avi', 'UCF101/v_ApplyEyeMakeup_g01_c04.avi', 'UCF101/v_ApplyEyeMakeup_g01_c05.avi', 'UCF101/v_ApplyEyeMakeup_g01_c06.avi', 'UCF101/v_ApplyEyeMakeup_g02_c01.avi', 'UCF101/v_ApplyEyeMakeup_g02_c02.avi', 'UCF101/v_ApplyEyeMakeup_g02_c03.avi', 'UCF101/v_ApplyEyeMakeup_g02_c04.avi']


In [5]:
def get_class(fname):
    """ Extract the class name from a video filename.

        The filename is split on '_' and the third token from the end is returned,
        e.g. 'UCF101/v_ApplyEyeMakeup_g01_c01.avi' -> 'ApplyEyeMakeup'.

        Args:
          fname: Name of a file in the dataset.

        Returns:
          The class name token.

        Raises:
          IndexError: If the name has fewer than three '_'-separated tokens.
    """
    tokens = fname.split('_')
    class_name = tokens[-3]
    return class_name

def get_files_per_class(files):
    """ Group filenames by the class that `get_class` extracts from each name.

        Args:
          files: Iterable of filenames.

        Returns:
          A `collections.defaultdict(list)` mapping class name to the list of its
          filenames, in the order the names were encountered.
    """
    grouped = collections.defaultdict(list)
    for path in files:
        grouped[get_class(path)].append(path)
    return grouped

# Group the video filenames by class; the dict keys are the class names.
files_for_class = get_files_per_class(files)
classes = [*files_for_class]
print('Number of classes:', len(classes))


Number of classes: 101


In [6]:
# Number of classes (taken from the front of `classes`) kept in the subset.
NUM_CLASSES = 10
# Maximum number of files kept for each selected class.
FILES_PER_CLASS = 50

def select_subset_of_classes(files_for_class, classes, files_per_class):
    """ Keep at most `files_per_class` files for each of the requested classes.

        Args:
          files_for_class: Mapping of class name to its list of files.
          classes: Class names to include in the subset.
          files_per_class: Number of leading files to keep per class.

        Returns:
          Dictionary mapping each requested class name to the first
          `files_per_class` entries of its file list.
    """
    return {name: files_for_class[name][:files_per_class] for name in classes}

# Take the first NUM_CLASSES classes and up to FILES_PER_CLASS files from each.
files_subset = select_subset_of_classes(
    files_for_class=files_for_class,
    classes=classes[:NUM_CLASSES],
    files_per_class=FILES_PER_CLASS,
)
print('Subset of files for selected classes:', files_subset)


Subset of files for selected classes: {'ApplyEyeMakeup': ['UCF101/v_ApplyEyeMakeup_g01_c01.avi', 'UCF101/v_ApplyEyeMakeup_g01_c02.avi', 'UCF101/v_ApplyEyeMakeup_g01_c03.avi', 'UCF101/v_ApplyEyeMakeup_g01_c04.avi', 'UCF101/v_ApplyEyeMakeup_g01_c05.avi', 'UCF101/v_ApplyEyeMakeup_g01_c06.avi', 'UCF101/v_ApplyEyeMakeup_g02_c01.avi', 'UCF101/v_ApplyEyeMakeup_g02_c02.avi', 'UCF101/v_ApplyEyeMakeup_g02_c03.avi', 'UCF101/v_ApplyEyeMakeup_g02_c04.avi', 'UCF101/v_ApplyEyeMakeup_g03_c01.avi', 'UCF101/v_ApplyEyeMakeup_g03_c02.avi', 'UCF101/v_ApplyEyeMakeup_g03_c03.avi', 'UCF101/v_ApplyEyeMakeup_g03_c04.avi', 'UCF101/v_ApplyEyeMakeup_g03_c05.avi', 'UCF101/v_ApplyEyeMakeup_g03_c06.avi', 'UCF101/v_ApplyEyeMakeup_g04_c01.avi', 'UCF101/v_ApplyEyeMakeup_g04_c02.avi', 'UCF101/v_ApplyEyeMakeup_g04_c03.avi', 'UCF101/v_ApplyEyeMakeup_g04_c04.avi', 'UCF101/v_ApplyEyeMakeup_g04_c05.avi', 'UCF101/v_ApplyEyeMakeup_g04_c06.avi', 'UCF101/v_ApplyEyeMakeup_g04_c07.avi', 'UCF101/v_ApplyEyeMakeup_g05_c01.avi', 'UCF10

In [8]:
def download_ucf_101_subset(zip_url, num_classes, splits, download_dir):
    """ Download a subset of the UCF101 dataset and split it into parts such as
        training, validation, and test.

        Args:
          zip_url: A URL with a ZIP file containing data.
          num_classes: Number of classes to use (the first `num_classes` classes found).
          splits: Dictionary mapping a split name (e.g. "train", "val", "test") to the
                  number of files to take from each class for that split.
          download_dir: Directory to download data to. Must support the `/` operator
                        (e.g. a `pathlib.Path`).

        Return:
          Dictionary mapping each split name to the directory holding that split's files.
    """
    # Keep only the video entries. A new list is built on purpose: calling
    # `files.remove(f)` inside a loop over `files` skips the element that follows
    # each removal. Filtering on the '.avi' suffix also drops directory entries;
    # a path-depth test would not work here because the videos themselves sit at
    # depth two ('UCF101/<name>.avi').
    files = [f for f in list_files_from_zip_url(zip_url) if f.endswith('.avi')]

    files_for_class = get_files_per_class(files)
    classes = list(files_for_class.keys())[:num_classes]

    # Shuffle the files within each selected class so every split gets a random sample.
    for cls in classes:
        random.shuffle(files_for_class[cls])

    # Use only the classes we want
    files_for_class = {x: files_for_class[x] for x in classes}

    dirs = {}
    for split_name, split_count in splits.items():
        print(split_name, ":")
        split_dir = download_dir / split_name
        # Each split takes its files off the front of every class list; the remainder
        # is carried over to the next split so the splits do not overlap.
        split_files, files_for_class = split_class_lists(files_for_class, split_count)
        # NOTE(review): download_from_zip is not defined in the visible cells; it is
        # assumed to be defined elsewhere in the notebook.
        download_from_zip(zip_url, split_dir, split_files)
        dirs[split_name] = split_dir

    return dirs


In [10]:
def split_class_lists(files_for_class, count):
    """ Take the first `count` files of every class and return them together with
        whatever is left over for each class.

        Args:
          files_for_class: Mapping of class name to its list of files.
          count: Number of files to take from each class.

        Returns:
          A tuple `(split_files, remainder)`: a flat list with the taken files of all
          classes, and a dictionary mapping each class to its files that were not taken.
    """
    split_files = []
    remainder = {}
    for cls, cls_files in files_for_class.items():
        head, tail = cls_files[:count], cls_files[count:]
        split_files += head
        remainder[cls] = tail
    return split_files, remainder


In [11]:
# Download 30 train, 10 val and 10 test files per class into sub-folders of
# ./UCF101_subset/ (one folder per split).
download_dir = pathlib.Path('./UCF101_subset/')
subset_paths = download_ucf_101_subset(
    URL,
    num_classes=NUM_CLASSES,
    splits={"train": 30, "val": 10, "test": 10},
    download_dir=download_dir,
)


train :


100%|██████████| 300/300 [00:28<00:00, 10.63it/s]


val :


100%|██████████| 100/100 [00:07<00:00, 12.66it/s]


test :


100%|██████████| 100/100 [00:06<00:00, 15.55it/s]
