# Facebook timeSformer

## Setup

In [1]:
import av
import torch
from torch.profiler import profile, record_function, ProfilerActivity
import numpy as np
import time
import os

from transformers import AutoImageProcessor, TimesformerForVideoClassification
from huggingface_hub import hf_hub_download

np.random.seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
frame_number = 8

def extract_frames(container, frame_number):
    stream = container.streams.video[0]
    frame_count = stream.frames
    
    # Ensure we have at least {frame_number} frames in the video
    if frame_count < frame_number:
        raise ValueError(f"Video must have at least {frame_number} frames.")

    frames = []
    selected_indices = np.linspace(0, frame_count - 1, frame_number, dtype=int)
    
    for index in selected_indices:
        # Seek to the desired frame
        container.seek(int(index))
        for frame in container.decode(video=0):
            frame = frame.to_ndarray(format='rgb24')
            frames.append(frame)
            break  # Nur den ersten Frame nehmen
    
    return np.array(frames)

image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400", device_map=device)
model.eval()



TimesformerForVideoClassification(
  (timesformer): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense

## 1 Video

In [2]:
container = av.open("./videos/--_S9IDQPLg_000135_000145.mp4")

video = extract_frames(container, frame_number)

inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"].to(device)

  return torch.tensor(value)


without CUDA Graph

In [3]:
start_time = time.perf_counter()
# warm up
with torch.no_grad():
    for i in range(10):
        outputs = model(inputs).logits
torch.cuda.synchronize()


start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/rtx3080/one_video/no_cuda_graph'),
    record_shapes=True,
) as prof:

    with record_function("model_inference"):
        outputs = model(inputs).logits

    torch.cuda.synchronize()
    
    prof.step()

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
predicted_label = outputs.argmax(-1).item()
print(model.config.id2label[predicted_label])
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with warmup:  {end_time - start_time}')

javelin throw
Time to complete:  0.16062460699981784
Time to complete with warmup:  0.6243631479999294


  warn("Profiler won't be using warmup, this can skew profiler results")
STAGE:2024-03-13 11:42:17 10755:10755 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-03-13 11:42:17 10755:10755 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-03-13 11:42:17 10755:10755 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


with CUDA Graph

In [4]:
static_input = torch.randn(1, 8, 3, 224, 224, device=device)
static_input.copy_(inputs)

start_time = time.perf_counter()
# warm up
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.no_grad(), torch.cuda.stream(s):
    for i in range(10):
        static_output = model(static_input).logits
torch.cuda.current_stream().wait_stream(s)
torch.cuda.synchronize()

# capture
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_output = model(static_input).logits

torch.cuda.synchronize()


start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/rtx3080/one_video/cuda_graph'),
    record_shapes=True,
) as prof:

    with record_function("data copy"):
        # copy data
        static_input.copy_(inputs)

    with record_function("model_inference"):
        # replay
        g.replay()

        torch.cuda.synchronize()
    prof.step()

    end_time = time.perf_counter()


# model predicts one of the 400 Kinetics-400 classes
predicted_label = static_output.argmax(-1).item()
print(model.config.id2label[predicted_label])
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')

javelin throw
Time to complete:  0.048468587999877855
Time to complete with capture:  0.47260917499988864


STAGE:2024-03-13 11:42:17 10755:10755 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-03-13 11:42:17 10755:10755 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-03-13 11:42:17 10755:10755 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


## Many Videos

In [5]:
folder_dir = "./videos"
videos = [os.path.join(folder_dir, file_name) for file_name in os.listdir(folder_dir) if file_name.endswith(".mp4")]

without CUDA Graph

In [6]:
pred = {}
count = 0

start_time = time.perf_counter()
# warm up
container = av.open(videos[0])
video = extract_frames(container, frame_number)
inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"].to(device)

with torch.no_grad():
    for i in range(10):
        outputs = model(inputs).logits

torch.cuda.synchronize()


start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=5, warmup=3, active=5, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/rtx3080/many_videos/no_cuda_graph'),
    record_shapes=True,
) as prof:
    for i in range(len(videos)):
        with record_function("open video"):
            container = av.open(videos[i])

        with record_function("extract frames"):
            video_frames = extract_frames(container, frame_number)

        with record_function("process frames"):
            inputs = image_processor(list(video_frames), return_tensors="pt")
            inputs = inputs["pixel_values"].to(device)

        with record_function("model_inference"):
            outputs = model(inputs).logits

            torch.cuda.synchronize()

        with record_function("save pred"):
            pred[videos[i]] = model.config.id2label[outputs.argmax(-1).item()]

        count += 1
        prof.step()

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')
print(f'Videos processed: {count}')

for key, value in pred.items():
    print(f'{key}: {value}')

STAGE:2024-03-13 11:42:20 10755:10755 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-03-13 11:42:21 10755:10755 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-03-13 11:42:21 10755:10755 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


Time to complete:  10.704028231999928
Time to complete with capture:  11.19308719799983
Videos processed: 50
./videos/-1lKx46x_18_000147_000157.mp4: drumming fingers
./videos/-0mnCHRQ-Zc_000092_000102.mp4: making bed
./videos/-00nar1nEPc_000033_000043.mp4: playing harmonica
./videos/-0M6S1qBn8s_000243_000253.mp4: changing oil
./videos/-1GRj5UvVBA_000025_000035.mp4: testifying
./videos/-0-ukHRelxA_000015_000025.mp4: petting animal (not cat)
./videos/-1HT31BzADs_000118_000128.mp4: pole vault
./videos/-0HRnFhCDdc_000026_000036.mp4: grooming dog
./videos/-1mK6Npz9JA_000038_000048.mp4: playing drums
./videos/-1IlTIWPNs4_000027_000037.mp4: shoveling snow
./videos/-0H3T2B9PH4_000005_000015.mp4: pushing wheelchair
./videos/-0cOo0cRVZU_000008_000018.mp4: high kick
./videos/-0yuyrbruYM_000079_000089.mp4: flying kite
./videos/-01cbva4erQ_000007_000017.mp4: clay pottery making
./videos/-0R6wpipD-c_000035_000045.mp4: eating spaghetti
./videos/-1dWGnhjB2A_000049_000059.mp4: playing ukulele
./videos/

with CUDA Graph

In [7]:
pred = {}
count = 0

static_input = torch.randn(1, frame_number, 3, 224, 224, device=device)

start_time = time.perf_counter()
# warm up
container = av.open(videos[0])
video = extract_frames(container, frame_number)
inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"]
static_input.copy_(inputs)

s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s), torch.no_grad():
    for i in range(3):
        static_output = model(static_input).logits
torch.cuda.current_stream().wait_stream(s)
torch.cuda.synchronize()

# capture
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_output = model(static_input).logits

torch.cuda.synchronize()


start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=5, warmup=3, active=5, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/rtx3080/many_videos/cuda_graph'),
    record_shapes=True,
) as prof:
    for i in range(len(videos)):
        with record_function("open video"):
            container = av.open(videos[i])

        with record_function("extract frames"):
            video = extract_frames(container, frame_number)

        with record_function("process frames"):
            inputs = image_processor(list(video), return_tensors="pt")
            inputs = inputs["pixel_values"]

        with record_function("data copy"):
            static_input.copy_(inputs)

        with record_function("model_inference"):
            g.replay()

            torch.cuda.synchronize()
        
        with record_function("save pred"):
            pred[videos[i]] = model.config.id2label[static_output.argmax(-1).item()]

        count += 1
        prof.step()

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')
print(f'Videos processed: {count}')
for key, value in pred.items():
    print(f'{key}: {value}')


STAGE:2024-03-13 11:42:31 10755:10755 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-03-13 11:42:32 10755:10755 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-03-13 11:42:32 10755:10755 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


Time to complete:  10.041801956999961
Time to complete with capture:  10.431655581999848
Videos processed: 50
./videos/-1lKx46x_18_000147_000157.mp4: drumming fingers
./videos/-0mnCHRQ-Zc_000092_000102.mp4: making bed
./videos/-00nar1nEPc_000033_000043.mp4: playing harmonica
./videos/-0M6S1qBn8s_000243_000253.mp4: changing oil
./videos/-1GRj5UvVBA_000025_000035.mp4: testifying
./videos/-0-ukHRelxA_000015_000025.mp4: petting animal (not cat)
./videos/-1HT31BzADs_000118_000128.mp4: pole vault
./videos/-0HRnFhCDdc_000026_000036.mp4: grooming dog
./videos/-1mK6Npz9JA_000038_000048.mp4: playing drums
./videos/-1IlTIWPNs4_000027_000037.mp4: shoveling snow
./videos/-0H3T2B9PH4_000005_000015.mp4: pushing wheelchair
./videos/-0cOo0cRVZU_000008_000018.mp4: high kick
./videos/-0yuyrbruYM_000079_000089.mp4: flying kite
./videos/-01cbva4erQ_000007_000017.mp4: clay pottery making
./videos/-0R6wpipD-c_000035_000045.mp4: eating spaghetti
./videos/-1dWGnhjB2A_000049_000059.mp4: playing ukulele
./videos

In [8]:
pred = {}
count = 1

static_input = torch.randn(1, frame_number, 3, 224, 224, device=device)

start_time = time.perf_counter()
# warm up
container = av.open(videos[0])
video = extract_frames(container, frame_number)
inputs = image_processor(list(video), return_tensors="pt")
inputs = inputs["pixel_values"]
static_input.copy_(inputs)

s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s), torch.no_grad():
    for i in range(10):
        static_output = model(static_input).logits
torch.cuda.current_stream().wait_stream(s)
torch.cuda.synchronize()

# capture
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_output = model(static_input).logits

torch.cuda.synchronize()


start_time1 = time.perf_counter()
benchmark_schedule = torch.profiler.schedule(wait=5, warmup=3, active=5, repeat=1)
with torch.no_grad(), torch.profiler.profile(
    schedule=benchmark_schedule,
    on_trace_ready=torch.profiler.tensorboard_trace_handler(f'./log/rtx3080/many_videos/cuda_graph_next_video'),
    record_shapes=True,
) as prof:
    
    # load first video
    with record_function("open video"):
        container = av.open(videos[0])
    
    with record_function("extract frames"):
        video = extract_frames(container, frame_number)
        
    with record_function("process frames"):
        inputs = image_processor(list(video), return_tensors="pt")

    with record_function("data copy"):
        static_input.copy_(inputs["pixel_values"])

    for i in range(1, len(videos)):
        with record_function("model_inference"):
            # replay
            g.replay()

        # load next video
        with record_function("load next video"):
            with record_function("open video"):
                container = av.open(videos[i])

            with record_function("extract frames"):
                video = extract_frames(container, frame_number)

            with record_function("process frames"):
                inputs = image_processor(list(video), return_tensors="pt")

        torch.cuda.synchronize()

        with record_function("data copy"):
            static_input.copy_(inputs["pixel_values"])
        
        with record_function("save pred"):
            predicted_label = static_output.argmax(-1).item()
            pred[count] = model.config.id2label[predicted_label]

        count += 1
        prof.step()
    

    # predict last video
    with record_function("model_inference"):
        # replay
        g.replay()

    with record_function("save pred"):
        predicted_label = static_output.argmax(-1).item()
        pred[count] = model.config.id2label[predicted_label]

    end_time = time.perf_counter()

# model predicts one of the 400 Kinetics-400 classes
print(f'Time to complete:  {end_time - start_time1}')
print(f'Time to complete with capture:  {end_time - start_time}')
print(f'Videos processed: {count}')
for key, value in pred.items():
    print(f'{key}: {value}')

STAGE:2024-03-13 11:42:41 10755:10755 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-03-13 11:42:42 10755:10755 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-03-13 11:42:42 10755:10755 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


Time to complete:  8.27009914700011
Time to complete with capture:  8.937778688000208
Videos processed: 50
1: drumming fingers
2: making bed
3: playing harmonica
4: changing oil
5: testifying
6: petting animal (not cat)
7: pole vault
8: grooming dog
9: playing drums
10: shoveling snow
11: pushing wheelchair
12: high kick
13: flying kite
14: clay pottery making
15: eating spaghetti
16: playing ukulele
17: bobsledding
18: golf putting
19: playing didgeridoo
20: snowkiting
21: dancing ballet
22: sailing
23: cutting watermelon
24: high kick
25: smoking hookah
26: hula hooping
27: yawning
28: slacklining
29: throwing axe
30: sanding floor
31: peeling potatoes
32: snowkiting
33: throwing axe
34: feeding birds
35: shuffling cards
36: zumba
37: golf putting
38: golf putting
39: opening bottle
40: getting a tattoo
41: snowboarding
42: javelin throw
43: breakdancing
44: feeding birds
45: diving cliff
46: high kick
47: unboxing
48: climbing a rope
49: windsurfing
50: archery
