# Finetune the X3D

* Original Author: FAIR PyTorchVideo*

> I start with X3D network pretrained on the Kinetics 400 dataset and finetune it on the UCF-CRIME dataset for video anomaly detection.


#### install dependencies

In [23]:
!pip install 'git+https://github.com/facebookresearch/fvcore'

!apt-get update
!apt-get install -y ffmpeg libsm6 libxext6

!pip install av

Collecting git+https://github.com/facebookresearch/fvcore
  Cloning https://github.com/facebookresearch/fvcore to /tmp/pip-req-build-h6mc7_f3
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/fvcore /tmp/pip-req-build-h6mc7_f3
  Resolved https://github.com/facebookresearch/fvcore to commit 9d9285d9cc39723578b3423aa24552926590a0c3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 https://cli.github.com/packages stable InRelease [3,917 B]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.laun


#### Imports

Load the model:

In [1]:
import torch
# Choose the `x3d_s` model to load
model_name = 'x3d_m'
model = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)

Downloading: "https://github.com/facebookresearch/pytorchvideo/zipball/main" to /home/etri/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/X3D_M.pyth" to /home/etri/.cache/torch/hub/checkpoints/X3D_M.pyth
100%|██████████| 29.4M/29.4M [00:27<00:00, 1.10MB/s]


In [2]:
model

Net(
  (blocks): ModuleList(
    (0): ResNetBasicStem(
      (conv): Conv2plus1d(
        (conv_t): Conv3d(3, 24, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False)
        (conv_xy): Conv3d(24, 24, kernel_size=(5, 1, 1), stride=(1, 1, 1), padding=(2, 0, 0), groups=24, bias=False)
      )
      (norm): BatchNorm3d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): ReLU()
    )
    (1): ResStage(
      (res_blocks): ModuleList(
        (0): ResBlock(
          (branch1_conv): Conv3d(24, 24, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
          (branch2): BottleneckBlock(
            (conv_a): Conv3d(24, 54, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
            (norm_a): BatchNorm3d(54, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act_a): ReLU()
            (conv_b): Conv3d(54, 54, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1), groups=54, bias=False)
            (nor

#### Original Classes for UCF-CRIME (Abnormal 13 + Normal 1)

In [3]:
print(model.blocks[5]==model.blocks[-1])
model.blocks[-1]

True


ResNetBasicHead(
  (pool): ProjectedPool(
    (pre_conv): Conv3d(192, 432, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
    (pre_norm): BatchNorm3d(432, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (pre_act): ReLU()
    (pool): AvgPool3d(kernel_size=(16, 7, 7), stride=1, padding=0)
    (post_conv): Conv3d(432, 2048, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
    (post_act): ReLU()
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (proj): Linear(in_features=2048, out_features=400, bias=True)
  (output_pool): AdaptiveAvgPool3d(output_size=1)
)

In [5]:
num_classes = 14 # UCF-CRIME 클래스 개수

in_features = model.blocks[5].proj.in_features # 기존 레이어의 입력 피처 크기 추출

# model.blocks[5].proj = torch.nn.Linear(model.blocks[5].proj.in_features, num_classes)
model.blocks[5].proj = torch.nn.Linear(in_features, num_classes) # Final Linear 레이어만 교체 (*가장 쉬운 가중치 보존 방법)

In [6]:
in_features

2048

> $y = W x + b$

- $x \in \mathbb{R}^{2048}$ : 풀링을 가친 특징 벡터 (여기선 2048 차원)

- $W \in \mathbb{R}^{N \times 2048}$ : 학습시켜야할 새 가중치 행렬

Import remaining functions:

In [7]:
import json
import urllib
from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)



#### Setup

Set the model to eval mode and move to desired device.

In [8]:
torch.cuda.is_available()

True

In [None]:
# Set to GPU or CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Selected Device is {device}\n")
model = model.eval()
model = model.to(device)

Selected Device is cuda


NVIDIA GeForce RTX 5090 with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_70 sm_75 sm_80 sm_86 sm_90.
If you want to use the NVIDIA GeForce RTX 5090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



#### Define input transform

In [10]:
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
frames_per_second = 30
model_transform_params  = {
    "x3d_xs": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 4,
        "sampling_rate": 12,
    },
    "x3d_s": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 13,
        "sampling_rate": 6,
    },
    "x3d_m": {
        "side_size": 256,
        "train_crop_size": 224,
        "test_crop_size": 256,
        "num_frames": 16,
        "sampling_rate": 5,
    },
    "x3d_l": {
        "side_size": 356,
        "train_crop_size": 312,
        "test_crop_size": 356,
        "num_frames": 16,
        "sampling_rate": 5,
    }
}

# Get transform parameters based on model
transform_params = model_transform_params[model_name]

#### Kinetic Dataset Label(.json) and a sample video

Download the id to label mapping for the Kinetics 400 dataset on which the torch hub models were trained. This will be used to get the category label names from the predicted class ids.

In [32]:
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
try: urllib.URLopener().retrieve(json_url, json_filename)
except: urllib.request.urlretrieve(json_url, json_filename)

In [33]:
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

In [34]:
kinetics_id_to_classname

{290: 'sharpening knives',
 115: 'eating ice cream',
 81: 'cutting nails',
 53: 'changing wheel',
 19: 'bench pressing',
 88: 'deadlifting',
 111: 'eating carrots',
 192: 'marching',
 358: 'throwing discus',
 231: 'playing flute',
 72: 'cooking on campfire',
 33: 'breading or breadcrumbing',
 218: 'playing badminton',
 276: 'ripping paper',
 244: 'playing saxophone',
 197: 'milking cow',
 169: 'juggling balls',
 130: 'flying kite',
 43: 'capoeira',
 187: 'making jewelry',
 100: 'drinking',
 228: 'playing cymbals',
 61: 'cleaning gutters',
 161: 'hurling (sport)',
 239: 'playing organ',
 361: 'tossing coin',
 395: 'wrestling',
 103: 'driving car',
 150: 'headbutting',
 147: 'gymnastics tumbling',
 186: 'making bed',
 0: 'abseiling',
 155: 'holding snake',
 278: 'rock climbing',
 71: 'cooking egg',
 182: 'long jump',
 17: 'bee keeping',
 365: 'trimming or shaving beard',
 63: 'cleaning shoes',
 86: 'dancing gangnam style',
 50: 'catching or throwing softball',
 164: 'ice skating',
 168: 

#### **UCF-Crime Data Formatting**

In [29]:
# Simply inspect the dataset
import cv2
import os

# wsl 경로에 유의
vid_sample_path = r"/mnt/c/KJM/abnormal_behavior/DB/UCF_Crimes/Videos/train/Arson/Arson002_x264.mp4"
trimmed_sample_path = r"/mnt/c/KJM/abnormal_behavior/DB/UCF_Crimes/Action_Regnition_splits(classifiction)/test_001_trimmed_3fps_16sec/Abuse/Abuse001_x264_trimmed.mp4"

vid_sample_path = trimmed_sample_path
cap = cv2.VideoCapture(vid_sample_path)
if not os.path.exists(vid_sample_path):
    print("경로에 파일이 없습니다. 경로를 다시 확인하세요.")
else: 
    print(f"Resolution of trimmed videos is {cap.get(cv2.CAP_PROP_FRAME_WIDTH)} x {cap.get(cv2.CAP_PROP_FRAME_HEIGHT)}")
    print(f"FPS of trimmed videos is {cap.get(cv2.CAP_PROP_FPS)}")

Resolution of trimmed videos is 320.0 x 240.0
FPS of trimmed videos is 3.0


In [12]:
# 비디오 전처리: X3D 모델 입력 전, 데이터를 일정한 규격으로 맞추기 위한 전처리
transform =  ApplyTransformToKey( # pytorchvideo.transforms.ApplyTransformToKey
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(transform_params["num_frames"]),  # 균등한 간격으로 추출할 프레임 개수
            Lambda(lambda x: x/255.0), # 0~255 범위의 픽셀값을 0~1로 정규화
            NormalizeVideo(mean, std), # Kinetics로 사전학습한 픽셀 값 평균과 표준편차로 채널 정규화
            ShortSideScale(size=transform_params["side_size"]), # 가로/세로 중 짧은 변 기준 크롭
            CenterCropVideo(                                    # 앞서 크롭된 프레임 중앙에서 정방향 크롭
                crop_size=(transform_params["train_crop_size"], transform_params["train_crop_size"])
            )
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = (transform_params["num_frames"] * transform_params["sampling_rate"])/frames_per_second

print(f"Calculated clip duration is {clip_duration}")

Calculated clip duration is 2.6666666666666665


In [None]:
from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler
from torch.utils.data import DataLoader

# 비디오 데이터 로더 정의
# def create_dataloader(data_path, clip_duration, batch_size):
#   dataset = LabeledVideoDataset(
#       data_path,
#       clip_sampler=make_clip_sampler("random", clip_duration),
#       decode_audio=False,
#       transform=transform
#   ) *** LabeledVideoDataset 클래스는 폴더 전체를 하나의 클래스로 보고 랜덤하게 클립 추출하므로 실제 이벤트 구간이 명시된 데이터셋을 학습하기에는 부적절.
#   return DataLoader(dataset, batch_size=batch_size)

train_dataloader(data_path, clip_duration, batch_size=32)

NameError: name 'train_dataloader' is not defined

In [38]:
# 학습 함수 정의
def train_one_epoch(model, data_loader, criterion, optimizer, device):
  model.train()
  total_loss =0
  for batch in data_loader:
    inputs, labels = batch["video"], batch["label"]
    inputs = inputs.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  return total_loss / len(data_loader)

#### Finetune on UCF-CRIME

#### Start Training

##### 손실함수 및 옵티마이저

- 차등 학습률 (Differential Learning Rate)

  > UCF-CRIME 다중 클래스 분류를 위한 Cross Entropy Loss 사용 <br>이때 교체한 출력헤드에 대해서만 10배 가량 큰 학습률 설정 <br>(완전 램덤 상태에서 출발하기 때문, 학술적으로 권장되는 전이 학습 전략)

##### 사전학습 가중치 동결

- **Phase 1:** Backbone(blocks[0:5])을 동결하고 새로 만든 Head만 2~3 Epoch 정도 학습. <br>(무작위 가중치가 사전 학습된 Backbone을 망치는 것을 방지)
- **Phase 2:** 모든 레이어 동결을 해제하고 전체 네트워크를 파인튠.

In [None]:
import torch.nn as nn
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Selected device: {device}")
model.to(device)
criterion = nn.CrossEntropyLoss()

# phase 1
print("Starting Phase 1: Training Head Only...")
for param in model.blocks[:5].parameters():
  param.requires_grad = False

optimizer_p1 = optim.SGD(model.blocks[5].parameters(), lr=1e-3, momentum=0.9)
for epoch in range(2):
  train_loss = train_one_epoch(model, train_dataloader, criterion, optimizer_p1, device)
  print(f"Phase 1 - Epoch {epoch}: Loss {train_loss:.4f}")

# phase 2
print("Starting Phase 2: Full Fine-tuning...")
for param in model.parameters():
    param.requires_grad = True

# Head와 backbone의 학습률을 서로 다르게 적용하기 위해 분리
optimizer_p2 = optim.SGD([
    {'params': model.blocks[:5].proj.parameters(), 'lr':1e-5},
    {'params': model.blocks[5].proj.parameters(), 'lr':1e-3} # 빠른 수렴 유발
], momentum=0.9, weight_decay=1e-4)

for epoch in range(5):
  loss = train_one_epoch(model, train_dataloader, criterion, optimizer_p2, device)
  print(f"Phase 2 - Epoch {epoch+1}, Loss: {loss:.4f}")

#### Run Inference

Download an example video.

In [17]:
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
try: urllib.URLopener().retrieve(url_link, video_path)
except: urllib.request.urlretrieve(url_link, video_path)

Load the video and transform it to the input format required by the model.

In [18]:
# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
video = EncodedVideo.from_path(video_path) # pytorchvideo.data.encoded_video.EncodedVideo

# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
video_data = transform(video_data)

# Move the inputs to the desired device
inputs = video_data["video"]
inputs = inputs.to(device)

In [19]:
video

<pytorchvideo.data.encoded_video_pyav.EncodedVideoPyAV at 0x7a16e0454c20>

#### Get Predictions

In [27]:
print(inputs.size()) # c, t, h, W
print(inputs[None, ...].size()) # 1, C, T, H, W

torch.Size([3, 13, 182, 182])
torch.Size([1, 3, 13, 182, 182])


In [20]:
# Pass the input clip through the model
preds = model(inputs[None, ...])

# Get the predicted classes
# 로짓 값을 클래스명으로 매핑시키기 위한 후처리.
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names)) # 양궁, 도끼 던지기, 골프 높게 치는 샷, 골프 낮게 치는 샷, 보틀 열기

Top 5 predicted labels: archery, throwing axe, golf driving, golf chipping, opening bottle


In [23]:
preds.size() # 400 = Kinetics 데이터셋의 클래스 개수

torch.Size([1, 400])

### Model Description
X3D model architectures are based on [1] pretrained on the Kinetics dataset.

| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |
| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- |  ----------- | ----------- |
| X3D      | XS    | 4x12                       | 69.12 | 88.63 | 0.91      | 3.79     |
| X3D      | S     | 13x6                       | 73.33 | 91.27 | 2.96      | 3.79     |
| X3D      | M     | 16x5                       | 75.94 | 92.72 | 6.72      | 3.79     |


### References
[1] Christoph Feichtenhofer, "X3D: Expanding Architectures for
    Efficient Video Recognition." https://arxiv.org/abs/2004.04730

In [30]:
!pip list |grep torch

torch                                    2.9.0+cpu
torchao                                  0.10.0
torchaudio                               2.9.0+cpu
torchdata                                0.11.0
torchsummary                             1.5.1
torchtune                                0.6.1
torchvision                              0.24.0+cpu


In [31]:
!pip list |grep fvcore

fvcore                                   0.1.6


In [32]:
!gcc --version

gcc (Ubuntu 11.4.0-1ubuntu1~22.04.2) 11.4.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [33]:
!python --version

Python 3.12.12


In [35]:
!pip list |grep pytorchvideo

## Directly Inference on UCF-CRIME

In [40]:
print(f"clip duration: {clip_duration} (sec)")

# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
ucf_video_path="/content/Abuse001_x264_7-12.mp4"
ucf_video = EncodedVideo.from_path(ucf_video_path) # pytorchvideo.data.encoded_video.EncodedVideo

# Load the desired clip
ucf_video_data = ucf_video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
ucf_video_transformed = transform(ucf_video_data)
ucf_inputs = ucf_video_transformed["video"] # 인코딩된 비디오 텐서값만
ucf_inputs = ucf_inputs.to(device)

ucf_preds = model(ucf_inputs[None, ...]) # 배치차원 추가하여 (1, C, T, H, W)

# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
ucf_pred_classes = post_act(ucf_preds)
ucf_pred_top5 = ucf_pred_classes.topk(k=5).indices[0]

# Map the predicted classes to the label names
ucf_pred_top5_names = [kinetics_id_to_classname[int(i)] for i in ucf_pred_top5]
print("Top 5 predicted labels: %s" % ", ".join(ucf_pred_top5_names))

clip duration: 2.6 (sec)
Top 5 predicted labels: moving furniture, cleaning floor, hoverboarding, using remote controller (not gaming), garbage collecting


In [42]:
ucf_video_path="/content/Abuse001_x264_7-12.mp4"
ucf_video = EncodedVideo.from_path(ucf_video_path) # pytorchvideo.data.encoded_video.EncodedVideo

# Load the desired clip
ucf_video_data = ucf_video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
ucf_video_transformed = transform(ucf_video_data)
ucf_inputs = ucf_video_transformed["video"] # 인코딩된 비디오 텐서값만
ucf_inputs = ucf_inputs.to(device)

ucf_preds = model(ucf_inputs[None, ...]) # 배치차원 추가하여 (1, C, T, H, W)
ucf_pred_classes = post_act(ucf_preds)   # post_act = torch.nn.Softmax(dim=1)

# 확률값(v)과 인덱스(i) 모두 가져오기.
top5_probs, top5_indices = ucf_pred_classes.topk(k=5)

# 텐서 형태를 출력을 위한 넘파이로 변환
probs = top5_probs[0].detach().cpu().numpy()
indices = top5_indices[0].detach().cpu().numpy()

# 클래스 이름과 확률을 매핑해 출력.
print("Top 5 Predictions:")
for i in range(5):
    class_name = kinetics_id_to_classname[int(indices[i])]
    score = probs[i] * 100  # 퍼센트(%) 단위로 변환
    print(f"{i+1}: {class_name:<20} | Score: {score:.2f} (%)")

Top 5 Predictions:
1: moving furniture     | Score: 15.94 (%)
2: cleaning floor       | Score: 15.64 (%)
3: hoverboarding        | Score: 10.06 (%)
4: using remote controller (not gaming) | Score: 7.65 (%)
5: garbage collecting   | Score: 6.81 (%)
