# Facebook timeSformer

## Setup

In [1]:
import av
import torch
from torch.profiler import profile, record_function, ProfilerActivity
import numpy as np
import time
import os

from transformers import AutoImageProcessor, TimesformerForVideoClassification
from huggingface_hub import hf_hub_download

np.random.seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
frame_number = 8

def extract_frames(container, frame_number):
    stream = container.streams.video[0]
    frame_count = stream.frames
    
    # Ensure we have at least {frame_number} frames in the video
    if frame_count < frame_number:
        raise ValueError(f"Video must have at least {frame_number} frames.")

    frames = []
    selected_indices = np.linspace(0, frame_count - 1, frame_number, dtype=int)
    
    for index in selected_indices:
        # Seek to the desired frame
        container.seek(int(index))
        for frame in container.decode(video=0):
            frame = frame.to_ndarray(format='rgb24')
            frames.append(frame)
            break  # Nur den ersten Frame nehmen
    
    return np.array(frames)

image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400", device_map=device)
model.eval()

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


TimesformerForVideoClassification(
  (timesformer): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense

## 1 Video

In [2]:
container = av.open("./videos/--_S9IDQPLg_000135_000145.mp4")

video = extract_frames(container, frame_number)

inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"].to(device)

ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
  return torch.tensor(value)


without CUDA Graph

In [3]:
start_time = time.perf_counter()
# warm up
with torch.no_grad():
    for i in range(10):
        outputs = model(inputs).logits
torch.cuda.synchronize()


start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/v100/one_video/no_cuda_graph'),
    record_shapes=True,
) as prof:

    with record_function("model_inference"):
        outputs = model(inputs).logits

    torch.cuda.synchronize()
    
    prof.step()

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
predicted_label = outputs.argmax(-1).item()
print(model.config.id2label[predicted_label])
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with warmup:  {end_time - start_time}')

  warn("Profiler won't be using warmup, this can skew profiler results")
STAGE:2024-03-13 13:43:26 2238609:2238609 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-03-13 13:43:26 2238609:2238609 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-03-13 13:43:26 2238609:2238609 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


javelin throw
Time to complete:  0.3743858649395406
Time to complete with warmup:  0.9020390720106661


with CUDA Graph

In [4]:
static_input = torch.randn(1, 8, 3, 224, 224, device=device)
static_input.copy_(inputs)

start_time = time.perf_counter()
# warm up
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.no_grad(), torch.cuda.stream(s):
    for i in range(10):
        static_output = model(static_input).logits
torch.cuda.current_stream().wait_stream(s)
torch.cuda.synchronize()

# capture
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_output = model(static_input).logits

torch.cuda.synchronize()


start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/v100/one_video/cuda_graph'),
    record_shapes=True,
) as prof:

    with record_function("data copy"):
        # copy data
        static_input.copy_(inputs)

    with record_function("model_inference"):
        # replay
        g.replay()

        torch.cuda.synchronize()
    prof.step()

    end_time = time.perf_counter()


# model predicts one of the 400 Kinetics-400 classes
predicted_label = static_output.argmax(-1).item()
print(model.config.id2label[predicted_label])
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')

javelin throw
Time to complete:  0.06959193898364902
Time to complete with capture:  0.5777614649850875


STAGE:2024-03-13 13:43:27 2238609:2238609 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-03-13 13:43:27 2238609:2238609 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-03-13 13:43:27 2238609:2238609 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


## Many Videos

In [5]:
folder_dir = "./videos"
videos = [os.path.join(folder_dir, file_name) for file_name in os.listdir(folder_dir) if file_name.endswith(".mp4")]

without CUDA Graph

In [6]:
pred = {}
count = 0

start_time = time.perf_counter()
# warm up
container = av.open(videos[0])
video = extract_frames(container, frame_number)
inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"].to(device)

with torch.no_grad():
    for i in range(10):
        outputs = model(inputs).logits

torch.cuda.synchronize()


start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=5, warmup=3, active=5, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/v100/many_videos/no_cuda_graph'),
    record_shapes=True,
) as prof:
    for i in range(len(videos)):
        with record_function("open video"):
            container = av.open(videos[i])

        with record_function("extract frames"):
            video_frames = extract_frames(container, frame_number)

        with record_function("process frames"):
            inputs = image_processor(list(video_frames), return_tensors="pt")
            inputs = inputs["pixel_values"].to(device)

        with record_function("model_inference"):
            outputs = model(inputs).logits

            torch.cuda.synchronize()

        with record_function("save pred"):
            pred[videos[i]] = model.config.id2label[outputs.argmax(-1).item()]

        count += 1
        prof.step()

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')
print(f'Videos processed: {count}')

for key, value in pred.items():
    print(f'{key}: {value}')

ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
A

Time to complete:  26.91071068914607
Time to complete with capture:  27.887550928164274
Videos processed: 50
./videos/-1mK6Npz9JA_000038_000048.mp4: playing drums
./videos/-0mnCHRQ-Zc_000092_000102.mp4: making bed
./videos/-0lqH3xAz6M_000014_000024.mp4: snowkiting
./videos/-1HT31BzADs_000118_000128.mp4: pole vault
./videos/-0MLLn0Zg1M_000015_000025.mp4: sailing
./videos/-0SoxHZp0SM_000091_000101.mp4: snowboarding
./videos/-1Kv095GbV8_000000_000010.mp4: dancing ballet
./videos/-0JU38ZQOlY_000000_000010.mp4: canoeing or kayaking
./videos/-1IlTIWPNs4_000027_000037.mp4: shoveling snow
./videos/-0WBheGENmk_000009_000019.mp4: stretching arm
./videos/-00nar1nEPc_000033_000043.mp4: playing harmonica
./videos/-0R6wpipD-c_000035_000045.mp4: eating carrots
./videos/-1brKJdL-iM_000093_000103.mp4: golf putting
./videos/-0oMsq-9b6c_000095_000105.mp4: sanding floor
./videos/-0M6S1qBn8s_000243_000253.mp4: changing oil
./videos/-0aDlftNdyw_000280_000290.mp4: cutting watermelon
./videos/-0S06ntmN_I_0000

with CUDA Graph

In [7]:
pred = {}
count = 0

static_input = torch.randn(1, frame_number, 3, 224, 224, device=device)

start_time = time.perf_counter()
# warm up
container = av.open(videos[0])
video = extract_frames(container, frame_number)
inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"]
static_input.copy_(inputs)

s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s), torch.no_grad():
    for i in range(3):
        static_output = model(static_input).logits
torch.cuda.current_stream().wait_stream(s)
torch.cuda.synchronize()

# capture
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_output = model(static_input).logits

torch.cuda.synchronize()


start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=5, warmup=3, active=5, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/v100/many_videos/cuda_graph'),
    record_shapes=True,
) as prof:
    for i in range(len(videos)):
        with record_function("open video"):
            container = av.open(videos[i])

        with record_function("extract frames"):
            video = extract_frames(container, frame_number)

        with record_function("process frames"):
            inputs = image_processor(list(video), return_tensors="pt")
            inputs = inputs["pixel_values"]

        with record_function("data copy"):
            static_input.copy_(inputs)

        with record_function("model_inference"):
            g.replay()

            torch.cuda.synchronize()
        
        with record_function("save pred"):
            pred[videos[i]] = model.config.id2label[static_output.argmax(-1).item()]

        count += 1
        prof.step()

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')
print(f'Videos processed: {count}')
for key, value in pred.items():
    print(f'{key}: {value}')


ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
A

Time to complete:  25.609147548908368
Time to complete with capture:  26.487763529876247
Videos processed: 50
./videos/-1mK6Npz9JA_000038_000048.mp4: playing drums
./videos/-0mnCHRQ-Zc_000092_000102.mp4: making bed
./videos/-0lqH3xAz6M_000014_000024.mp4: snowkiting
./videos/-1HT31BzADs_000118_000128.mp4: pole vault
./videos/-0MLLn0Zg1M_000015_000025.mp4: sailing
./videos/-0SoxHZp0SM_000091_000101.mp4: snowboarding
./videos/-1Kv095GbV8_000000_000010.mp4: dancing ballet
./videos/-0JU38ZQOlY_000000_000010.mp4: canoeing or kayaking
./videos/-1IlTIWPNs4_000027_000037.mp4: shoveling snow
./videos/-0WBheGENmk_000009_000019.mp4: stretching arm
./videos/-00nar1nEPc_000033_000043.mp4: playing harmonica
./videos/-0R6wpipD-c_000035_000045.mp4: eating carrots
./videos/-1brKJdL-iM_000093_000103.mp4: golf putting
./videos/-0oMsq-9b6c_000095_000105.mp4: sanding floor
./videos/-0M6S1qBn8s_000243_000253.mp4: changing oil
./videos/-0aDlftNdyw_000280_000290.mp4: cutting watermelon
./videos/-0S06ntmN_I_000

In [8]:
pred = {}
count = 1

static_input = torch.randn(1, frame_number, 3, 224, 224, device=device)

start_time = time.perf_counter()
# warm up
container = av.open(videos[0])
video = extract_frames(container, frame_number)
inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"]
static_input.copy_(inputs)

s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s), torch.no_grad():
    for i in range(10):
        static_output = model(static_input).logits
torch.cuda.current_stream().wait_stream(s)
torch.cuda.synchronize()

# capture
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_output = model(static_input).logits

torch.cuda.synchronize()


start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=5, warmup=3, active=5, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/v100/many_videos/cuda_graph_next_video'),
    record_shapes=True,
) as prof:
    
    # load first video
    with record_function("open video"):
        container = av.open(videos[0])
    
    with record_function("extract frames"):
        video = extract_frames(container, frame_number)
        
    with record_function("process frames"):
        inputs = image_processor(list(video), return_tensors="pt")

    with record_function("data copy"):
        static_input.copy_(inputs["pixel_values"])

    for i in range(1, len(videos)):
        with record_function("model_inference"):
            # replay
            g.replay()

        # load next video
        with record_function("load next video"):
            with record_function("open video"):
                container = av.open(videos[i])

            with record_function("extract frames"):
                video = extract_frames(container, frame_number)

            with record_function("process frames"):
                inputs = image_processor(list(video), return_tensors="pt")

        torch.cuda.synchronize()

        with record_function("data copy"):
            static_input.copy_(inputs["pixel_values"])
        
        with record_function("save pred"):
            predicted_label = static_output.argmax(-1).item()
            pred[count] = model.config.id2label[predicted_label]

        count += 1
        prof.step()
    

    # predict last video
    with record_function("model_inference"):
        # replay
        g.replay()

    with record_function("save pred"):
        predicted_label = static_output.argmax(-1).item()
        pred[count] = model.config.id2label[predicted_label]

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')
print(f'Videos processed: {count}')
for key, value in pred.items():
    print(f'{key}: {value}')

ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
ALTIVEC: Color Space RGB24
A

Time to complete:  23.540598748950288
Time to complete with capture:  24.777485199039802
Videos processed: 50
1: playing drums
2: making bed
3: snowkiting
4: pole vault
5: sailing
6: snowboarding
7: dancing ballet
8: canoeing or kayaking
9: shoveling snow
10: stretching arm
11: playing harmonica
12: eating carrots
13: golf putting
14: sanding floor
15: changing oil
16: cutting watermelon
17: shuffling cards
18: high kick
19: getting a tattoo
20: peeling potatoes
21: climbing a rope
22: high kick
23: feeding birds
24: canoeing or kayaking
25: throwing axe
26: javelin throw
27: high kick
28: playing guitar
29: clay pottery making
30: yawning
31: slacklining
32: dribbling basketball
33: unboxing
34: petting animal (not cat)
35: grooming dog
36: diving cliff
37: windsurfing
38: smoking hookah
39: testifying
40: snowkiting
41: opening bottle
42: throwing axe
43: hula hooping
44: golf putting
45: feeding birds
46: archery
47: drumming fingers
48: zumba
49: pushing wheelchair
50: flying kite
