# SlowFast

*Author: FAIR PyTorchVideo*

**SlowFast networks pretrained on the Kinetics 400 dataset**


### Example Usage

#### Imports

Load the model:

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import torch
!pip install fvcore
model = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)



Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


Import remaining functions:

In [3]:
!pip install av
#!pip install fvcore
!pip install pytorchvideo

import json
import urllib
from typing import Dict
#import torch
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)





#### Define input transform

In [4]:
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 32 # original code is 32
sampling_rate = 2 #every other frame is sampled
frames_per_second = 30
slowfast_alpha = 4
num_clips = 10
num_crops = 3

class PackPathway(torch.nn.Module):
    """
    Transform for converting video frames as a list of tensors.
    """
    def __init__(self):
        super().__init__()

    def forward(self, frames: torch.Tensor):
        fast_pathway = frames
        # Perform temporal sampling from the fast pathway.
        slow_pathway = torch.index_select(
            frames,
            1,
            torch.linspace(
                0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha
            ).long(),
        )
        frame_list = [slow_pathway, fast_pathway]
        return frame_list

transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size),
            PackPathway()
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = (num_frames * sampling_rate)/frames_per_second

#### Run Inference

Download an example video.

Load the video and transform it to the input format required by the model.

In [5]:
from pytorchvideo.models.slowfast import create_slowfast
model_ft = slowfast_model = create_slowfast(model_depth=50,slowfast_fusion_conv_kernel_size=(7, 1, 1))
model_ft.load_state_dict(torch.load("/content/drive/MyDrive/YAU Project/experimental/SLOWFAST_8x8_R50.pyth")['model_state'])

<All keys matched successfully>

In [6]:
import torch.nn as nn
#model_ft = torch.hub.load('facebookresearch/pytorchvideo', 'slowfast_r50', pretrained=True)
num_ftrs = model_ft.blocks[6].proj.in_features
model_ft.blocks[6].proj = nn.Linear(num_ftrs, 1245) #it might not be 1245

#### Finetuning


In [7]:
#ATTEMPT 3, MOST CURRENT ATTEMPT
import os
from torch.utils.data import Dataset, DataLoader
from pytorchvideo.data.encoded_video import EncodedVideo
from torchvision.transforms import Compose, Lambda, RandomCrop, Resize
from pytorchvideo.transforms import ApplyTransformToKey, UniformTemporalSubsample
import torch.optim as optim
import csv

class VideoDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.video_files = []
        self.labels = []

        with open(csv_file, newline='') as csvfile:
            spamreader = csv.reader(csvfile, delimiter=',')
            next(spamreader)
            for row in spamreader:
                video_file = row[0]
                video_id = int(row[1])
                self.video_files.append(video_file)
                self.labels.append(video_id)

    def __len__(self):
        return len(self.video_files)

    def __getitem__(self, idx):
        video_file = self.video_files[idx]
        video_path = os.path.join(self.root_dir, video_file)
        video = EncodedVideo.from_path(video_path)

        if self.transform:
            start_sec = 0
            end_sec = start_sec + clip_duration
            video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
            video_data = self.transform(video_data)

        label = self.labels[idx]

        return video_data['video'], label

dataset = VideoDataset(csv_file='/content/drive/MyDrive/YAU Project/datasets/training_dataset_v3.csv', root_dir='/content/drive/MyDrive/YAU Project/datasets/training', transform=transform)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0)


In [8]:
#initializing the testing dataset
testing_dataset = VideoDataset(csv_file='/content/drive/MyDrive/YAU Project/datasets/testing_dataset_v2.csv', root_dir='/content/drive/MyDrive/YAU Project/datasets/testing', transform=transform)
testing_data_loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0)

In [9]:
device = "cuda"
print(device)
model_ft = model_ft.to(device)

cuda


In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_ft.parameters(), lr=0.0001)

correct = 0
# Training loop
num_epochs = 1
train_loss_list = []
test_loss_list = []
for epoch in range(num_epochs):
    model_ft.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(data_loader):
        # print(inputs[0].shape)
        # print(inputs[1].shape)
        # print(labels.shape)
        labels = labels.to(device)
        inputs[0] = inputs[0].to(device)
        inputs[1] = inputs[1].to(device)
        optimizer.zero_grad()
        outputs = model_ft(inputs)
        # print(outputs.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if i % 10 == 9:    # print every 10 mini-batches
            print(f"[{epoch + 1}, {i + 1}] loss: {running_loss / 10:.3f}")
            train_loss_list.append(running_loss / 10)
            running_loss = 0.0
            test_input, test_labels = next(iter(data_loader))
            test_input = [x.to(device) for x in test_input]
            test_labels = test_labels.to(device)


            # Compute prediction and loss
            pred = model(test_input)
            # test_loss = loss_fn(pred, test_labels).item() #computes the loss betwween the predictions and the actual labels, .item() extracts the scalar value
            test_loss = criterion(pred, test_labels).item()
            test_loss_list.append(test_loss)
            # print(test_labels)
            # print(pred.argmax(1))
            # print(pred.topk(k=10).indices[0])
            correct_batch = (pred.argmax(1) == test_labels).type(torch.float).sum().item()
            correct += correct_batch
            print(test_loss)
            print(round(correct_batch/len(pred), 3))
            # write values into csv file

print('Finished Training')

[1, 10] loss: 7.146


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 绘制训练和验证损失
X = np.arange(len(train_loss_list))
plt.plot(X, train_loss_list, label='Training Loss')
plt.plot(X, test_loss_list, label='Test Loss')
plt.title('Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig("./Loss_by_epoch.png", dpi=300, bbox_inches="tight")

In [None]:
torch.save(model_ft.state_dict(), './model_pt.pth')
#torch.save(optimizer.state_dict(), './optimizer.pth')

In [None]:
import pytorchvideo

model_dictionary = torch.load('/content/drive/MyDrive/YAU Project/model_checkpoint/model_pt_epoch2_5.pth')
model_ft.load_state_dict(model_dictionary) #to recreate the same graph using the way that we initially created, this

<All keys matched successfully>

In [None]:
type(model_ft)

In [None]:
#initializing the testing dataset
testing_dataset = VideoDataset(csv_file='/content/drive/MyDrive/YAU Project/datasets/testing_dataset_v2.csv', root_dir='/content/drive/MyDrive/YAU Project/datasets/testing', transform=transform)
testing_data_loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0)

In [None]:
import os
import pandas as pd
import csv


def add_predicted_word_to_txt(pred_class):
  word_map_file = '/content/drive/MyDrive/YAU Project/datasets/word_map_v2.csv'
  word_dict = {}
  with open(word_map_file, 'r') as file:
      reader = csv.reader(file)
      for row in reader:
          #print(row[0], row[1])
          word = row[0]
          class_id = row[1]
          word_dict[class_id] = word
          #print(word_dict)  # Assuming the first column is the video name and the second is the class ID

  print(word_dict)
  predicted_word =  word_dict[pred_class]

  print(predicted_word)

  with open('/content/drive/MyDrive/YAU Project/datasets/predicted_words.txt', 'a') as file:
        file.write(predicted_word + '\n')

  print("Predicted word saved to 'predicted_word.txt'")

In [None]:
def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset) #total number of samples in the test dataset
    num_batches = len(dataloader)
    test_loss, correct = 0, 0
    model.to(device)

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad(): #disables gradient calculation to save computation
        for X, y in dataloader: #X is the abtch of input data, y is the batch of corresponding labels
            # Move inputs and labels to the specified device
            X = [x.to(device) for x in X]
            y = y.to(device)

            # Compute prediction and loss
            pred = model(X)
            test_loss += loss_fn(pred, y).item() #computes the loss betwween the predictions and the actual labels, .item() extracts the scalar value
            #print(y)
            # predicted_class = str(preds.topk(k=1).indices[0].item())
            predicted_class = str(pred.topk(k=1).indices[0].item())
            print(predicted_class)
            #print(pred.topk(k=10).indices[0])

            add_predicted_word_to_txt(predicted_class)


            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            #pred.argmax(1) == y finds max value for each row (sample), highest probability
            #.type(torch.float): Converts the boolean tensor to a float tensor where True is 1.0 and False is 0.0.
    test_loss /= num_batches #calculates the avergae los over all batches
    correct /= size #calculates accuracy as a fraction
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
test_trial_dataset = VideoDataset(csv_file='/content/drive/MyDrive/YAU Project/experimental/demo_data_loader.csv', root_dir='/content/drive/MyDrive/YAU Project/datasets/testing', transform=transform)
test_trial_data_loader = DataLoader(test_trial_dataset, batch_size=1, shuffle=True, num_workers=0)

In [None]:
criterion = nn.CrossEntropyLoss()
test_loop(test_trial_data_loader, model_ft, criterion)

1229
{'0': "j'aime", '1': 'Kg (kilogramme)', '2': 'danser', '3': 'table', '4': 'commerce', '5': 'messe', '6': 'moral', '7': 'Monaco', '8': 'salle', '9': 'sans interruption', '10': 'rat', '11': 'félicitation', '12': 'accompagner', '13': 'hiver', '14': 'enlèvement', '15': 'neveu', '16': 'enfant', '17': 'couverture', '18': "saisons (de l'année)", '19': 'téléphoner', '20': 'proposition', '21': 'commencer', '22': 'audiogramme', '23': 'se faufiler', '24': 'taureau', '25': 'fédération', '26': 'code de la route', '27': 'encéphalogramme', '28': 'chemin', '29': 'grave', '30': 'nuage', '31': 'avocat', '32': 'emprunt', '33': 'télécommunication', '34': 'email', '35': 'Orléans', '36': 'rejoindre', '37': 'fier', '38': 'rival', '39': 'symbole', '40': 'manifestation', '41': 'chaise', '42': 'débit banque', '43': 'mauvais', '44': 'il pleut', '45': 'éducateur', '46': 'Roumanie', '47': 'ségrégation', '48': 'lapin', '49': 'intervenir', '50': 'rôle', '51': "s'asseoir", '52': 'tartine', '53': 

In [None]:
video_path = '/content/drive/MyDrive/YAU Project/datasets/testing/4_synthetic_test.mp4'

# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
video = EncodedVideo.from_path(video_path)
# print(video['video'])

# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
#print(len(video_data['video']))
#print(video_data['video'][0].shape)
# Apply a transform to normalize the video input
video_data = transform(video_data)
#print(len(video_data['video']))
#print(video_data['video'][0].shape)

# Move the inputs to the desired device
inputs = video_data["video"]
inputs = [i.to(device)[None, ...] for i in inputs]



In [None]:
#print(inputs[0].shape)
# Pass the input clip through the model
preds = model_ft(inputs)

# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_class = str(preds.topk(k=1).indices[0].item()) #item transforms the pred_class from a tensor to a number
print(pred_class)

1067


### Model Description
SlowFast model architectures are based on [1] with pretrained weights using the 8x8 setting
on the Kinetics dataset.

| arch | depth | frame length x sample rate | top 1 | top 5 | Flops (G) | Params (M) |
| --------------- | ----------- | ----------- | ----------- | ----------- | ----------- |  ----------- | ----------- |
| SlowFast | R50   | 8x8                        | 76.94 | 92.69 | 65.71     | 34.57      |
| SlowFast | R101  | 8x8                        | 77.90 | 93.27 | 127.20    | 62.83      |


### References
[1] Christoph Feichtenhofer et al, "SlowFast Networks for Video Recognition"
https://arxiv.org/pdf/1812.03982.pdf