In [1]:
import torch
import os
import cv2
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torchvision.models.video import mvit_v2_s, MViT_V2_S_Weights
from PIL import Image
import torch.nn.functional as F

In [2]:
class VideoDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, transform=None, max_frames=16):
        self.data_dir = data_dir
        self.classes = sorted(os.listdir(data_dir))
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
        self.video_paths = self._get_video_paths()
        self.transform = transform
        self.max_frames = max_frames

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path, label = self.video_paths[idx]
        frames = self.load_frames(video_path)
        if self.max_frames is not None:
            frames = self._process_frames(frames)
        if self.transform:
            frames = [self.transform(frame) for frame in frames]
        video_tensor = torch.stack(frames, dim=0)
        return video_tensor.permute(1,0,2,3), label

    def _get_video_paths(self):
        video_paths = []
        for class_name in self.classes:
            class_dir = os.path.join(self.data_dir, class_name)
            for video_name in os.listdir(class_dir):
                video_path = os.path.join(class_dir, video_name)
                label = self.class_to_idx[class_name]
                video_paths.append((video_path, label))
        return video_paths

    def load_frames(self, video_path):
        frames = []
        cap = cv2.VideoCapture(video_path)
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  
            pil_image = Image.fromarray(frame)  
            frames.append(pil_image)
        cap.release()
        return frames

    def _process_frames(self, frames):
        if len(frames) > self.max_frames:
            # Trim frames if more than max_frames
            frames = frames[:self.max_frames]
        elif len(frames) < self.max_frames:
            # Pad frames if less than max_frames
            num_to_pad = self.max_frames - len(frames)
            pad_width = [(0, num_to_pad)] + [(0, 0)] * (len(frames[0].shape) - 1)
            frames.extend([F.pad(frame, pad_width, value=0) for frame in frames[-1:]])
        return frames


In [3]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  #
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 
])
# Define paths to your data directory
batch_size=2
data_dir = 'data'

# Load your custom video dataset
train_dataset = VideoDataset(data_dir, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)



In [4]:
a = next(iter(train_loader))

In [5]:
a[0].shape

torch.Size([2, 3, 16, 224, 224])

In [6]:
model = mvit_v2_s(pretrained=True, weights=MViT_V2_S_Weights)
model



MViT(
  (conv_proj): Conv3d(3, 96, kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3))
  (pos_encoding): PositionalEncoding()
  (blocks): ModuleList(
    (0): MultiscaleBlock(
      (norm1): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (norm2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (attn): MultiscaleAttention(
        (qkv): Linear(in_features=96, out_features=288, bias=True)
        (project): Sequential(
          (0): Linear(in_features=96, out_features=96, bias=True)
        )
        (pool_q): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          )
        )
        (pool_k): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 8, 8), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,

In [7]:
model.head = nn.Sequential(
    nn.Dropout(p=0.25, inplace=True),
    nn.Linear(in_features=768, out_features=2),  
    nn.Softmax(dim=1)  
)

In [8]:
model

MViT(
  (conv_proj): Conv3d(3, 96, kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3))
  (pos_encoding): PositionalEncoding()
  (blocks): ModuleList(
    (0): MultiscaleBlock(
      (norm1): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (norm2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (attn): MultiscaleAttention(
        (qkv): Linear(in_features=96, out_features=288, bias=True)
        (project): Sequential(
          (0): Linear(in_features=96, out_features=96, bias=True)
        )
        (pool_q): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          )
        )
        (pool_k): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 8, 8), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,

In [9]:
model.to("cuda")

MViT(
  (conv_proj): Conv3d(3, 96, kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3))
  (pos_encoding): PositionalEncoding()
  (blocks): ModuleList(
    (0): MultiscaleBlock(
      (norm1): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (norm2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (attn): MultiscaleAttention(
        (qkv): Linear(in_features=96, out_features=288, bias=True)
        (project): Sequential(
          (0): Linear(in_features=96, out_features=96, bias=True)
        )
        (pool_q): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          )
        )
        (pool_k): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 8, 8), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,

In [10]:
criterion = nn.CrossEntropyLoss(weights=torch.tensor([1,1.4]))
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [11]:
val_dir = "test"
val_dataset = VideoDataset(val_dir, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [12]:
from tqdm import tqdm
num_epochs = 10

In [13]:
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    correct_train = 0
    total_train = 0
    for videos, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Training'):
        videos, labels = videos.to("cuda"), labels.to("cuda")
        optimizer.zero_grad()
        outputs = model(videos)
        loss = criterion(outputs.to(), labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        correct_train += predicted.eq(labels).sum().item()
        total_train += labels.size(0)
    train_accuracy = 100 * correct_train / total_train
    train_loss /= len(train_loader)
    
    model.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for videos, labels in tqdm(val_loader, desc=f'Epoch {epoch + 1}/{num_epochs} - Validation'):
            videos, labels = videos.to("cuda"), labels.to("cuda")
            outputs = model(videos)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            correct_val += predicted.eq(labels).sum().item()
            total_val += labels.size(0)
    val_accuracy = 100 * correct_val / total_val
    val_loss /= len(val_loader)
    
    # Print epoch statistics
    print(f'Epoch {epoch + 1}/{num_epochs}, '
          f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')


Epoch 1/100 - Training: 100%|██████████| 145/145 [04:03<00:00,  1.68s/it]
Epoch 1/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.03s/it]


Epoch 1/100, Train Loss: 0.5834, Train Accuracy: 74.05%, Val Loss: 0.3254, Val Accuracy: 100.00%


Epoch 2/100 - Training: 100%|██████████| 145/145 [04:02<00:00,  1.67s/it]
Epoch 2/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.03s/it]


Epoch 2/100, Train Loss: 0.3373, Train Accuracy: 97.92%, Val Loss: 0.3149, Val Accuracy: 100.00%


Epoch 3/100 - Training: 100%|██████████| 145/145 [04:02<00:00,  1.67s/it]
Epoch 3/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 3/100, Train Loss: 0.3172, Train Accuracy: 100.00%, Val Loss: 0.3141, Val Accuracy: 100.00%


Epoch 4/100 - Training: 100%|██████████| 145/145 [03:59<00:00,  1.66s/it]
Epoch 4/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 4/100, Train Loss: 0.3567, Train Accuracy: 95.50%, Val Loss: 0.3147, Val Accuracy: 100.00%


Epoch 5/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 5/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.03s/it]


Epoch 5/100, Train Loss: 0.3344, Train Accuracy: 98.27%, Val Loss: 0.3139, Val Accuracy: 100.00%


Epoch 6/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 6/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 6/100, Train Loss: 0.3142, Train Accuracy: 100.00%, Val Loss: 0.3136, Val Accuracy: 100.00%


Epoch 7/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 7/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 7/100, Train Loss: 0.3140, Train Accuracy: 100.00%, Val Loss: 0.3135, Val Accuracy: 100.00%


Epoch 8/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 8/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 8/100, Train Loss: 0.3137, Train Accuracy: 100.00%, Val Loss: 0.3135, Val Accuracy: 100.00%


Epoch 9/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 9/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 9/100, Train Loss: 0.3137, Train Accuracy: 100.00%, Val Loss: 0.3135, Val Accuracy: 100.00%


Epoch 10/100 - Training: 100%|██████████| 145/145 [04:02<00:00,  1.67s/it]
Epoch 10/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.01s/it]


Epoch 10/100, Train Loss: 0.3137, Train Accuracy: 100.00%, Val Loss: 0.3134, Val Accuracy: 100.00%


Epoch 11/100 - Training: 100%|██████████| 145/145 [03:59<00:00,  1.66s/it]
Epoch 11/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.01s/it]


Epoch 11/100, Train Loss: 0.3333, Train Accuracy: 98.27%, Val Loss: 0.3884, Val Accuracy: 92.86%


Epoch 12/100 - Training: 100%|██████████| 145/145 [04:01<00:00,  1.66s/it]
Epoch 12/100 - Validation: 100%|██████████| 28/28 [00:29<00:00,  1.04s/it]


Epoch 12/100, Train Loss: 0.3980, Train Accuracy: 91.00%, Val Loss: 0.3850, Val Accuracy: 92.86%


Epoch 13/100 - Training: 100%|██████████| 145/145 [04:03<00:00,  1.68s/it]
Epoch 13/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 13/100, Train Loss: 0.3827, Train Accuracy: 93.08%, Val Loss: 0.3849, Val Accuracy: 92.86%


Epoch 14/100 - Training: 100%|██████████| 145/145 [03:59<00:00,  1.65s/it]
Epoch 14/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 14/100, Train Loss: 0.3825, Train Accuracy: 93.08%, Val Loss: 0.3848, Val Accuracy: 92.86%


Epoch 15/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 15/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 15/100, Train Loss: 0.3824, Train Accuracy: 93.08%, Val Loss: 0.3848, Val Accuracy: 92.86%


Epoch 16/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 16/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 16/100, Train Loss: 0.3825, Train Accuracy: 93.08%, Val Loss: 0.3848, Val Accuracy: 92.86%


Epoch 17/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 17/100 - Validation: 100%|██████████| 28/28 [00:29<00:00,  1.05s/it]


Epoch 17/100, Train Loss: 0.3824, Train Accuracy: 93.08%, Val Loss: 0.3848, Val Accuracy: 92.86%


Epoch 18/100 - Training: 100%|██████████| 145/145 [04:03<00:00,  1.68s/it]
Epoch 18/100 - Validation: 100%|██████████| 28/28 [00:29<00:00,  1.04s/it]


Epoch 18/100, Train Loss: 0.3824, Train Accuracy: 93.08%, Val Loss: 0.3848, Val Accuracy: 92.86%


Epoch 19/100 - Training: 100%|██████████| 145/145 [04:05<00:00,  1.69s/it]
Epoch 19/100 - Validation: 100%|██████████| 28/28 [00:29<00:00,  1.04s/it]


Epoch 19/100, Train Loss: 0.3823, Train Accuracy: 93.08%, Val Loss: 0.3848, Val Accuracy: 92.86%


Epoch 20/100 - Training: 100%|██████████| 145/145 [04:02<00:00,  1.67s/it]
Epoch 20/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 20/100, Train Loss: 0.3823, Train Accuracy: 93.08%, Val Loss: 0.3847, Val Accuracy: 92.86%


Epoch 21/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 21/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 21/100, Train Loss: 0.3823, Train Accuracy: 93.08%, Val Loss: 0.3847, Val Accuracy: 92.86%


Epoch 22/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 22/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.03s/it]


Epoch 22/100, Train Loss: 0.3823, Train Accuracy: 93.08%, Val Loss: 0.3847, Val Accuracy: 92.86%


Epoch 23/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 23/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 23/100, Train Loss: 0.3826, Train Accuracy: 93.08%, Val Loss: 0.3847, Val Accuracy: 92.86%


Epoch 24/100 - Training: 100%|██████████| 145/145 [03:59<00:00,  1.65s/it]
Epoch 24/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 24/100, Train Loss: 0.3823, Train Accuracy: 93.08%, Val Loss: 0.3847, Val Accuracy: 92.86%


Epoch 25/100 - Training: 100%|██████████| 145/145 [04:00<00:00,  1.66s/it]
Epoch 25/100 - Validation: 100%|██████████| 28/28 [00:28<00:00,  1.02s/it]


Epoch 25/100, Train Loss: 0.3858, Train Accuracy: 92.73%, Val Loss: 0.3847, Val Accuracy: 92.86%


Epoch 26/100 - Training: 100%|██████████| 145/145 [04:03<00:00,  1.68s/it]
Epoch 26/100 - Validation:  46%|████▋     | 13/28 [00:14<00:16,  1.08s/it]


KeyboardInterrupt: 

In [None]:
model.to("cpu")
torch.save({
    'model_state_dict': model.state_dict(),
    'model_architecture': model,
}, 'mvit2.pth')

In [None]:
model.to("cuda")

MViT(
  (conv_proj): Conv3d(3, 96, kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3))
  (pos_encoding): PositionalEncoding()
  (blocks): ModuleList(
    (0): MultiscaleBlock(
      (norm1): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (norm2): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
      (attn): MultiscaleAttention(
        (qkv): Linear(in_features=96, out_features=288, bias=True)
        (project): Sequential(
          (0): Linear(in_features=96, out_features=96, bias=True)
        )
        (pool_q): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,), eps=1e-06, elementwise_affine=True)
          )
        )
        (pool_k): Pool(
          (pool): Conv3d(96, 96, kernel_size=(3, 3, 3), stride=(1, 8, 8), padding=(1, 1, 1), groups=96, bias=False)
          (norm_act): Sequential(
            (0): LayerNorm((96,

In [None]:
import numpy as np

In [None]:
def predict_video(video_file_path, SEQUENCE_LENGTH):
    video_reader = cv2.VideoCapture(video_file_path)

    original_video_width = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))
    original_video_height = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))

    frames_list = []

    predicted_class_name = ''

    video_frames_count = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
    skip_frames_window = max(int(video_frames_count/SEQUENCE_LENGTH),1)

    for frame_counter in range(SEQUENCE_LENGTH):

        video_reader.set(cv2.CAP_PROP_POS_FRAMES, frame_counter * skip_frames_window)
        success, frame = video_reader.read()

        if not success:
            break

        resized_frame = cv2.resize(frame, (224, 224))
        normalized_frame = resized_frame / 255
        frames_list.append(normalized_frame)

    predicted_labels_probabilities = model(torch.tensor(np.transpose(np.expand_dims(frames_list, axis = 0), (0,4, 1, 2, 3))).float().to("cuda"))
    predicted_label = torch.argmax(predicted_labels_probabilities)
    print(predicted_label)
    video_reader.release()

for pth in os.listdir("data/not Shop Lifters"):
    input_video_file_path = "data/not Shop Lifters/"+pth

    predict_video(input_video_file_path, 16)
    

tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(1, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(0, device='cuda:0')
tensor(1, device='cuda:0')
tensor(0, device='cuda:0')
t