# X3D

*Author: FAIR PyTorchVideo*

**X3D networks pretrained on the Kinetics 400 dataset**


### Example Usage

#### install dependencies

In [3]:
!pip install 'git+https://github.com/facebookresearch/fvcore'

!apt-get update
!apt-get install -y ffmpeg libsm6 libxext6

!pip install av

Collecting git+https://github.com/facebookresearch/fvcore
  Cloning https://github.com/facebookresearch/fvcore to /tmp/pip-req-build-d_58e5fa
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/fvcore /tmp/pip-req-build-d_58e5fa
  Resolved https://github.com/facebookresearch/fvcore to commit 9d9285d9cc39723578b3423aa24552926590a0c3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yacs>=0.1.6 (from fvcore==0.1.6)
  Downloading yacs-0.1.8-py3-none-any.whl.metadata (639 bytes)
Collecting iopath>=0.1.7 (from fvcore==0.1.6)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker (from iopath>=0.1.7->fvcore==0.1.6)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading yacs-0.1.8-py3-none-any.whl (14 kB)
Downloading portalocker-3


#### Imports

Load the model:

In [4]:
import torch

# Name of the X3D variant to fetch; it is also used later to look up the
# matching preprocessing settings.
model_name = 'x3d_s'
hub_repo = 'facebookresearch/pytorchvideo'

# Load the pretrained network through torch.hub.
model = torch.hub.load(repo_or_dir=hub_repo, model=model_name, pretrained=True)

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/X3D_S.pyth" to /root/.cache/torch/hub/checkpoints/X3D_S.pyth


100%|██████████| 29.4M/29.4M [00:01<00:00, 22.5MB/s]


Import remaining functions:

In [5]:
import json
import urllib
import urllib.request

from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)

  return [int(c) if c.isdigit() else c for c in re.split("(\d+)", text)]


#### Setup

Set the model to eval mode and move to desired device.

In [10]:
# Report whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

False

In [6]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU, so the
# notebook runs unchanged on both kinds of runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Put the model in eval mode for inference and move it to the chosen device.
model = model.eval()
model = model.to(device)

Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids.

In [7]:
# Location of the Kinetics class-name file and the local name to save it under.
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"

# `urllib.URLopener` is a Python 2 API that does not exist in Python 3, so the
# former try/bare-except always fell through to `urlretrieve` while also hiding
# unrelated errors. Call `urlretrieve` directly so real failures surface.
# The trailing semicolon suppresses the returned (filename, headers) tuple.
urllib.request.urlretrieve(json_url, json_filename);

In [8]:
# Parse the downloaded class-name file.
with open(json_filename, "r") as class_file:
    kinetics_classnames = json.load(class_file)

# Invert the parsed mapping so the file's values become the keys (class ids)
# and its keys become the label text, with any double-quote characters removed.
kinetics_id_to_classname = {
    class_id: str(label).replace('"', "")
    for label, class_id in kinetics_classnames.items()
}

#### Define input transform

In [9]:
# Statistics handed to NormalizeVideo, and the frame rate used further down to
# turn a frame count into a clip length in seconds.
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
frames_per_second = 30

# Preprocessing settings keyed by X3D variant name.
model_transform_params = {
    "x3d_xs": dict(side_size=182, crop_size=182, num_frames=4, sampling_rate=12),
    "x3d_s": dict(side_size=182, crop_size=182, num_frames=13, sampling_rate=6),
    "x3d_m": dict(side_size=256, crop_size=256, num_frames=16, sampling_rate=5),
}

# Settings for the variant selected via `model_name`.
transform_params = model_transform_params[model_name]
num_frames = transform_params["num_frames"]
side_size = transform_params["side_size"]
crop_size = transform_params["crop_size"]

# Preprocessing that brings a raw clip into the fixed format the model is fed.
# The parameter values depend on the selected X3D variant.
video_steps = [
    UniformTemporalSubsample(num_frames),   # pytorchvideo.transforms
    Lambda(lambda frames: frames / 255.0),  # presumably rescales 8-bit pixel values to [0, 1]
    NormalizeVideo(mean, std),              # torchvision.transforms._transforms_video
    ShortSideScale(size=side_size),         # pytorchvideo.transforms
    CenterCropVideo(crop_size=(crop_size, crop_size)),  # torchvision.transforms._transforms_video
]

# Apply the composed steps to the "video" entry of the clip dictionary.
transform = ApplyTransformToKey(key="video", transform=Compose(video_steps))

# Clip length in seconds; like the settings above, it depends on the variant.
clip_duration = (num_frames * transform_params["sampling_rate"]) / frames_per_second

#### Run Inference

Download an example video.

In [17]:
# Remote location of the example clip and the local file name to save it under.
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'

# `urllib.URLopener` is a Python 2 API that does not exist in Python 3, so the
# former try/bare-except always fell through to `urlretrieve` while also hiding
# unrelated errors. Call `urlretrieve` directly so real failures surface.
# The trailing semicolon suppresses the returned (filename, headers) tuple.
urllib.request.urlretrieve(url_link, video_path);

Load the video and transform it to the input format required by the model.

In [18]:
# Time window of the clip to read, in seconds. start_sec should point at the
# part of the video where the action happens.
start_sec = 0
end_sec = start_sec + clip_duration

# Open the video file through the EncodedVideo helper.
video = EncodedVideo.from_path(video_path)

# Cut out the requested window and run it through the preprocessing transform.
video_data = transform(video.get_clip(start_sec=start_sec, end_sec=end_sec))

# Take the transformed "video" entry and place it on the target device.
inputs = video_data["video"].to(device)

In [19]:
# Display the repr of the loaded video object.
video

<pytorchvideo.data.encoded_video_pyav.EncodedVideoPyAV at 0x7a16e0454c20>

#### Get Predictions

In [27]:
# Shape of the single clip: (C, T, H, W)
print(inputs.shape)
# Shape once a leading batch axis is added: (1, C, T, H, W)
print(inputs.unsqueeze(0).shape)

torch.Size([3, 13, 182, 182])
torch.Size([1, 3, 13, 182, 182])


In [20]:
# Softmax over the class axis turns the model's logits into probabilities.
post_act = torch.nn.Softmax(dim=1)

# Inference needs no gradients: run the forward pass under no_grad so no
# autograd graph is built for the result.
with torch.no_grad():
    # Add a leading batch axis and pass the clip through the model.
    preds = model(inputs[None, ...])
    preds = post_act(preds)

# Indices of the five most probable classes for the single clip in the batch.
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted class ids to their label names.
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))  # most probable label first

Top 5 predicted labels: archery, throwing axe, golf driving, golf chipping, opening bottle


In [23]:
preds.size() # 400 = number of classes in the Kinetics dataset

torch.Size([1, 400])

### Model Description
X3D model architectures are based on [1] pretrained on the Kinetics dataset.

| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |
| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |
| X3D      | XS    | 4x12                       | 69.12 | 88.63 | 0.91      | 3.79     |
| X3D      | S     | 13x6                       | 73.33 | 91.27 | 2.96      | 3.79     |
| X3D      | M     | 16x5                       | 75.94 | 92.72 | 6.72      | 3.79     |


### References
[1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
    Efficient Video Recognition." https://arxiv.org/abs/2004.04730

In [30]:
# List installed packages whose name contains "torch", with their versions.
!pip list |grep torch

torch                                    2.9.0+cpu
torchao                                  0.10.0
torchaudio                               2.9.0+cpu
torchdata                                0.11.0
torchsummary                             1.5.1
torchtune                                0.6.1
torchvision                              0.24.0+cpu


In [31]:
# Show the installed fvcore version.
!pip list |grep fvcore

fvcore                                   0.1.6


In [32]:
# Print the version of the system gcc compiler.
!gcc --version

gcc (Ubuntu 11.4.0-1ubuntu1~22.04.2) 11.4.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [33]:
# Print the version of the `python` executable found on PATH.
!python --version

Python 3.12.12


In [35]:
# Check whether pytorchvideo is registered with pip.
# NOTE(review): empty output would suggest the package is not pip-installed and
# is being picked up from the torch.hub cache instead -- confirm.
!pip list |grep pytorchvideo

## Direct Inference on UCF-Crime

In [40]:
print(f"clip duration: {clip_duration} (sec)")

# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
ucf_video_path="/content/Abuse001_x264_7-12.mp4"
ucf_video = EncodedVideo.from_path(ucf_video_path) # pytorchvideo.data.encoded_video.EncodedVideo

# Load the desired clip
ucf_video_data = ucf_video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
ucf_video_transformed = transform(ucf_video_data)
ucf_inputs = ucf_video_transformed["video"] # keep only the video tensor
ucf_inputs = ucf_inputs.to(device)

# Softmax over the class axis turns the model's logits into probabilities.
post_act = torch.nn.Softmax(dim=1)

# Inference needs no gradients: run the forward pass under no_grad so no
# autograd graph is built for the result.
with torch.no_grad():
    ucf_preds = model(ucf_inputs[None, ...]) # add a batch axis -> (1, C, T, H, W)
    ucf_pred_classes = post_act(ucf_preds)

# Indices of the five most probable classes for the single clip in the batch.
ucf_pred_top5 = ucf_pred_classes.topk(k=5).indices[0]

# Map the predicted classes to the label names
ucf_pred_top5_names = [kinetics_id_to_classname[int(i)] for i in ucf_pred_top5]
print("Top 5 predicted labels: %s" % ", ".join(ucf_pred_top5_names))

clip duration: 2.6 (sec)
Top 5 predicted labels: moving furniture, cleaning floor, hoverboarding, using remote controller (not gaming), garbage collecting


In [42]:
# NOTE: start_sec and end_sec are reused from the cell that defines them above.
ucf_video_path="/content/Abuse001_x264_7-12.mp4"
ucf_video = EncodedVideo.from_path(ucf_video_path) # pytorchvideo.data.encoded_video.EncodedVideo

# Load the desired clip
ucf_video_data = ucf_video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
ucf_video_transformed = transform(ucf_video_data)
ucf_inputs = ucf_video_transformed["video"] # keep only the video tensor
ucf_inputs = ucf_inputs.to(device)

# Inference needs no gradients: run the forward pass under no_grad so no
# autograd graph is built. Softmax is applied directly here so this cell does
# not depend on a `post_act` object created in another cell.
with torch.no_grad():
    ucf_preds = model(ucf_inputs[None, ...]) # add a batch axis -> (1, C, T, H, W)
    ucf_pred_classes = torch.softmax(ucf_preds, dim=1)

# Fetch both the probabilities and the class indices of the top five classes.
top5_probs, top5_indices = ucf_pred_classes.topk(k=5)

# Convert the tensors to NumPy arrays for printing.
probs = top5_probs[0].cpu().numpy()
indices = top5_indices[0].cpu().numpy()

# Print each class name next to its probability.
print("Top 5 Predictions:")
for rank, (class_id, prob) in enumerate(zip(indices, probs), start=1):
    class_name = kinetics_id_to_classname[int(class_id)]
    score = prob * 100  # convert to percent (%)
    print(f"{rank}: {class_name:<20} | Score: {score:.2f} (%)")

Top 5 Predictions:
1: moving furniture     | Score: 15.94 (%)
2: cleaning floor       | Score: 15.64 (%)
3: hoverboarding        | Score: 10.06 (%)
4: using remote controller (not gaming) | Score: 7.65 (%)
5: garbage collecting   | Score: 6.81 (%)
