# Facebook TimeSformer

In [None]:
!pip install av



In [None]:
!pip install accelerate



## Setup

In [None]:
import av
import torch
from torch.profiler import profile, record_function, ProfilerActivity
import numpy as np
import time
import os

from transformers import AutoImageProcessor, TimesformerForVideoClassification
from huggingface_hub import hf_hub_download

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(0)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Number of frames sampled from each clip (passed to extract_frames).
frame_number = 8

def extract_frames(container, frame_number):
    """Return ``frame_number`` evenly spaced RGB frames from the first video stream.

    Frames are selected by decoding the stream from the start and counting
    decoded frames. A previous version called ``container.seek(int(index))``
    with a frame index; PyAV interprets that offset as a timestamp rather than
    a frame number, so every seek landed near the beginning of the file and
    the same opening frame was returned for every requested index.

    Args:
        container: An opened PyAV input container (e.g. from ``av.open``).
        frame_number: How many frames to sample.

    Returns:
        ``np.ndarray`` stacking the sampled frames in temporal order; each
        entry is the ``rgb24`` array produced by ``frame.to_ndarray``.

    Raises:
        ValueError: If the stream reports fewer than ``frame_number`` frames,
            or if decoding ends before all requested frames were reached.
    """
    stream = container.streams.video[0]
    frame_count = stream.frames

    # Ensure we have at least {frame_number} frames in the video
    if frame_count < frame_number:
        raise ValueError(f"Video must have at least {frame_number} frames.")

    # Evenly spaced, non-decreasing frame indices from first to last frame.
    selected_indices = np.linspace(0, frame_count - 1, frame_number, dtype=int).tolist()

    frames = []
    if selected_indices:
        # Rewind, then walk the stream once and keep only the wanted frames.
        container.seek(0)
        cursor = 0
        for position, frame in enumerate(container.decode(video=0)):
            if position != selected_indices[cursor]:
                continue
            image = frame.to_ndarray(format='rgb24')
            # The same index may be requested more than once; serve all of them.
            while cursor < len(selected_indices) and selected_indices[cursor] == position:
                frames.append(image)
                cursor += 1
            if cursor == len(selected_indices):
                break  # everything collected, no need to decode the rest

    # The reported frame count can exceed what is actually decodable.
    if len(frames) < frame_number:
        raise ValueError(
            f"Decoded only {len(frames)} of {frame_number} requested frames."
        )

    return np.array(frames)

# Frame preprocessing comes from the VideoMAE Kinetics checkpoint, while the
# classifier weights come from the TimeSformer Kinetics-400 checkpoint.
image_processor = AutoImageProcessor.from_pretrained(
    "MCG-NJU/videomae-base-finetuned-kinetics"
)
model = TimesformerForVideoClassification.from_pretrained(
    "facebook/timesformer-base-finetuned-k400",
    device_map=device,
)

# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()


TimesformerForVideoClassification(
  (timesformer): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense

## 1 Video

In [None]:
# Decode the sample clip. The context manager closes the file as soon as the
# frames have been copied into memory, so no file handle is left open.
with av.open("./videos/--_S9IDQPLg_000135_000145.mp4") as container:
    video = extract_frames(container, frame_number)

# Preprocess the frames and move the resulting pixel tensor to the model's device.
inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"].to(device)

without CUDA Graph

In [None]:
# Reference timestamp that also covers the warm-up phase below.
start_time = time.perf_counter()

# Warm-up: ten forward passes that are excluded from the "Time to complete"
# figure, followed by a wait for the GPU to finish them.
with torch.no_grad():
    for _warmup_step in range(10):
        outputs = model(inputs).logits
torch.cuda.synchronize()

# Measured section: one profiled forward pass.
start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=1)
with torch.no_grad(), profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/t4/one_video/no_cuda_graph'),
    record_shapes=True,
) as prof:
    with record_function("model_inference"):
        outputs = model(inputs).logits

    # Wait for the GPU so the end timestamp covers the launched kernels.
    torch.cuda.synchronize()
    prof.step()
    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
predicted_label = outputs.argmax(dim=-1).item()
print(model.config.id2label[predicted_label])

inference_seconds = end_time - start_time1
total_seconds = end_time - start_time
print(f'Time to complete:  {inference_seconds}')
print(f'Time to complete with warmup:  {total_seconds}')

  warn("Profiler won't be using warmup, this can skew profiler results")


javelin throw
Time to complete:  0.48326465599996027
Time to complete with warmup:  1.7906076859999303


with CUDA Graph

In [None]:
# Static input buffer: the captured graph always reads from this tensor, so new
# data has to be copied into it before every replay. The frame dimension uses
# `frame_number` instead of a hard-coded 8 so it stays in sync with extraction.
static_input = torch.randn(1, frame_number, 3, 224, 224, device=device)
static_input.copy_(inputs)

start_time = time.perf_counter()
# Warm up on a side stream before capturing.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.no_grad(), torch.cuda.stream(s):
    for i in range(10):
        static_output = model(static_input).logits
torch.cuda.current_stream().wait_stream(s)
torch.cuda.synchronize()

# Capture. This runs under no_grad like the warm-up and the replay section:
# the graph is only used for inference, so recording it with autograd enabled
# would make the captured forward pass track gradients for no benefit.
g = torch.cuda.CUDAGraph()
with torch.no_grad(), torch.cuda.graph(g):
    static_output = model(static_input).logits

torch.cuda.synchronize()


# Measured section: copy the input into the static buffer and replay once.
start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/t4/one_video/cuda_graph'),
    record_shapes=True,
) as prof:

    with record_function("data copy"):
        # copy data
        static_input.copy_(inputs)

    with record_function("model_inference"):
        # replay
        g.replay()

        # Wait for the replay so the end timestamp covers the GPU work.
        torch.cuda.synchronize()
    prof.step()

    end_time = time.perf_counter()


# model predicts one of the 400 Kinetics-400 classes
# (static_output is the tensor the captured graph writes its result into)
predicted_label = static_output.argmax(-1).item()
print(model.config.id2label[predicted_label])
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')

javelin throw
Time to complete:  0.21723499500001253
Time to complete with capture:  2.489692671999933


## Many Videos

In [None]:
# Collect every .mp4 clip in the folder. os.listdir returns entries in
# arbitrary order, so the paths are sorted to get the same processing order
# on every run and on every machine.
folder_dir = "./videos"
videos = sorted(
    os.path.join(folder_dir, file_name)
    for file_name in os.listdir(folder_dir)
    if file_name.endswith(".mp4")
)

without CUDA Graph

In [None]:
# Maps video path -> predicted Kinetics-400 label.
pred = {}
count = 0

start_time = time.perf_counter()
# Warm up on the first clip; the container is closed once its frames are read.
with av.open(videos[0]) as container:
    video = extract_frames(container, frame_number)
inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"].to(device)

with torch.no_grad():
    for i in range(10):
        outputs = model(inputs).logits

torch.cuda.synchronize()


# Measured section: every clip is opened, decoded, preprocessed and classified.
start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=5, warmup=3, active=5, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/t4/many_videos/no_cuda_graph'),
    record_shapes=True,
) as prof:
    for video_path in videos:
        with record_function("open video"):
            container = av.open(video_path)

        # `with container` closes the file after extraction, so the loop does
        # not accumulate one open container per processed video.
        with container, record_function("extract frames"):
            video_frames = extract_frames(container, frame_number)

        with record_function("process frames"):
            inputs = image_processor(list(video_frames), return_tensors="pt")
            inputs = inputs["pixel_values"].to(device)

        with record_function("model_inference"):
            outputs = model(inputs).logits

            # Wait for the GPU so the profiled range covers the kernels.
            torch.cuda.synchronize()

        with record_function("save pred"):
            pred[video_path] = model.config.id2label[outputs.argmax(-1).item()]

        count += 1
        prof.step()

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
print(f'Time to complete:  {end_time - start_time1}')
# No graph capture happens in this cell; the longer interval includes the warm-up.
print(f'Time to complete with warmup:  {end_time - start_time}')
print(f'Videos processed: {count}')

for key, value in pred.items():
    print(f'{key}: {value}')

Time to complete:  38.38508995400002
Time to complete with capture:  40.205175184999916
Videos processed: 50
./videos/-0yuyrbruYM_000079_000089.mp4: flying kite
./videos/-0lqH3xAz6M_000014_000024.mp4: snowkiting
./videos/-1lbeA9Jogw_000000_000010.mp4: smoking hookah
./videos/-0cOo0cRVZU_000008_000018.mp4: high kick
./videos/-0mnCHRQ-Zc_000092_000102.mp4: making bed
./videos/-0oMsq-9b6c_000095_000105.mp4: sanding floor
./videos/-00fzD4K6aw_000007_000017.mp4: throwing axe
./videos/-1HT31BzADs_000118_000128.mp4: pole vault
./videos/--_S9IDQPLg_000135_000145.mp4: javelin throw
./videos/-1jQapks1hI_000053_000063.mp4: peeling potatoes
./videos/-0LoCy0-F9A_000018_000028.mp4: hula hooping
./videos/-1IqhhJEfTI_000015_000025.mp4: snowkiting
./videos/-0M6S1qBn8s_000243_000253.mp4: changing oil
./videos/-1_m2Igd2Yc_000209_000219.mp4: high kick
./videos/-1mbYmIZ9iw_000083_000093.mp4: breakdancing
./videos/-0HKFF7F_BY_000003_000013.mp4: yawning
./videos/-0WL_HWewTE_000025_000035.mp4: getting a tatto

with CUDA Graph

In [None]:
# Maps video path -> predicted Kinetics-400 label.
pred = {}
count = 0

# Static input buffer read by the captured graph; refilled before each replay.
static_input = torch.randn(1, frame_number, 3, 224, 224, device=device)

start_time = time.perf_counter()
# Warm up on the first clip; the container is closed once its frames are read.
with av.open(videos[0]) as container:
    video = extract_frames(container, frame_number)
inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"]
static_input.copy_(inputs)

# Warm-up passes run on a side stream before capturing.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s), torch.no_grad():
    for i in range(3):
        static_output = model(static_input).logits
torch.cuda.current_stream().wait_stream(s)
torch.cuda.synchronize()

# Capture. This runs under no_grad like the warm-up and the replay loop:
# the graph is only used for inference, so recording it with autograd enabled
# would make the captured forward pass track gradients for no benefit.
g = torch.cuda.CUDAGraph()
with torch.no_grad(), torch.cuda.graph(g):
    static_output = model(static_input).logits

torch.cuda.synchronize()


# Measured section: load each clip, copy it into the static buffer, replay.
start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=5, warmup=3, active=5, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/t4/many_videos/cuda_graph'),
    record_shapes=True,
) as prof:
    for video_path in videos:
        with record_function("open video"):
            container = av.open(video_path)

        # `with container` closes the file after extraction, so the loop does
        # not accumulate one open container per processed video.
        with container, record_function("extract frames"):
            video = extract_frames(container, frame_number)

        with record_function("process frames"):
            inputs = image_processor(list(video), return_tensors="pt")
            inputs = inputs["pixel_values"]

        with record_function("data copy"):
            static_input.copy_(inputs)

        with record_function("model_inference"):
            g.replay()

            # Wait for the replay so the profiled range covers the GPU work.
            torch.cuda.synchronize()

        with record_function("save pred"):
            pred[video_path] = model.config.id2label[static_output.argmax(-1).item()]

        count += 1
        prof.step()

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')
print(f'Videos processed: {count}')
for key, value in pred.items():
    print(f'{key}: {value}')


  return torch.tensor(value)


Time to complete:  32.61106924699993
Time to complete with capture:  34.59512435099998
Videos processed: 50
./videos/-0yuyrbruYM_000079_000089.mp4: flying kite
./videos/-0lqH3xAz6M_000014_000024.mp4: snowkiting
./videos/-1lbeA9Jogw_000000_000010.mp4: smoking hookah
./videos/-0cOo0cRVZU_000008_000018.mp4: high kick
./videos/-0mnCHRQ-Zc_000092_000102.mp4: making bed
./videos/-0oMsq-9b6c_000095_000105.mp4: sanding floor
./videos/-00fzD4K6aw_000007_000017.mp4: throwing axe
./videos/-1HT31BzADs_000118_000128.mp4: pole vault
./videos/--_S9IDQPLg_000135_000145.mp4: javelin throw
./videos/-1jQapks1hI_000053_000063.mp4: peeling potatoes
./videos/-0LoCy0-F9A_000018_000028.mp4: hula hooping
./videos/-1IqhhJEfTI_000015_000025.mp4: snowkiting
./videos/-0M6S1qBn8s_000243_000253.mp4: changing oil
./videos/-1_m2Igd2Yc_000209_000219.mp4: high kick
./videos/-1mbYmIZ9iw_000083_000093.mp4: breakdancing
./videos/-0HKFF7F_BY_000003_000013.mp4: yawning
./videos/-0WL_HWewTE_000025_000035.mp4: getting a tattoo

In [None]:
# Maps video path -> predicted Kinetics-400 label (same keys as the other
# "Many Videos" cells, so the three result listings can be compared directly).
pred = {}
# Starts at 1 because the last video is classified after the loop.
count = 1

# Static input buffer read by the captured graph; refilled before each replay.
static_input = torch.randn(1, frame_number, 3, 224, 224, device=device)

start_time = time.perf_counter()
# Warm up on the first clip; the container is closed once its frames are read.
with av.open(videos[0]) as container:
    video = extract_frames(container, frame_number)
inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"]
static_input.copy_(inputs)

# Warm-up passes run on a side stream before capturing.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s), torch.no_grad():
    for i in range(10):
        static_output = model(static_input).logits
torch.cuda.current_stream().wait_stream(s)
torch.cuda.synchronize()

# Capture. This runs under no_grad like the warm-up and the replay loop:
# the graph is only used for inference, so recording it with autograd enabled
# would make the captured forward pass track gradients for no benefit.
g = torch.cuda.CUDAGraph()
with torch.no_grad(), torch.cuda.graph(g):
    static_output = model(static_input).logits

torch.cuda.synchronize()


# Measured section: while the GPU replays the graph for video i-1, the CPU
# already opens, decodes and preprocesses video i.
start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=5, warmup=3, active=5, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/t4/many_videos/cuda_graph_next_video'),
    record_shapes=True,
) as prof:

    # load first video
    with record_function("open video"):
        container = av.open(videos[0])

    # `with container` closes the file once its frames have been extracted.
    with container, record_function("extract frames"):
        video = extract_frames(container, frame_number)

    with record_function("process frames"):
        inputs = image_processor(list(video), return_tensors="pt")

    with record_function("data copy"):
        static_input.copy_(inputs["pixel_values"])

    for i in range(1, len(videos)):
        with record_function("model_inference"):
            # replay (launches inference for videos[i - 1], no wait here)
            g.replay()

        # load next video
        with record_function("load next video"):
            with record_function("open video"):
                container = av.open(videos[i])

            with container, record_function("extract frames"):
                video = extract_frames(container, frame_number)

            with record_function("process frames"):
                inputs = image_processor(list(video), return_tensors="pt")

        # The replay must be finished before its input buffer is overwritten
        # and before its output is read.
        torch.cuda.synchronize()

        with record_function("data copy"):
            static_input.copy_(inputs["pixel_values"])

        with record_function("save pred"):
            # static_output still holds the result for videos[i - 1]; only the
            # input buffer was refreshed above.
            predicted_label = static_output.argmax(-1).item()
            pred[videos[i - 1]] = model.config.id2label[predicted_label]

        count += 1
        prof.step()


    # predict last video
    with record_function("model_inference"):
        # replay
        g.replay()

    with record_function("save pred"):
        predicted_label = static_output.argmax(-1).item()
        pred[videos[-1]] = model.config.id2label[predicted_label]

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')
print(f'Videos processed: {count}')
for key, value in pred.items():
    print(f'{key}: {value}')

  return torch.tensor(value)


Time to complete:  27.943848341000034
Time to complete with capture:  30.92820705700001
Videos processed: 50
1: flying kite
2: snowkiting
3: smoking hookah
4: high kick
5: making bed
6: sanding floor
7: throwing axe
8: pole vault
9: javelin throw
10: peeling potatoes
11: hula hooping
12: snowkiting
13: changing oil
14: high kick
15: breakdancing
16: yawning
17: getting a tattoo
18: zumba
19: opening bottle
20: feeding birds
21: shoveling snow
22: eating spaghetti
23: slacklining
24: dancing ballet
25: sailing
26: drumming fingers
27: snowboarding
28: windsurfing
29: clay pottery making
30: golf putting
31: golf putting
32: golf putting
33: diving cliff
34: high kick
35: pushing wheelchair
36: playing ukulele
37: petting animal (not cat)
38: throwing axe
39: testifying
40: climbing a rope
41: shuffling cards
42: playing didgeridoo
43: cutting watermelon
44: bobsledding
45: feeding birds
46: playing harmonica
47: unboxing
48: grooming dog
49: playing drums
50: archery
