## Video Tracking and Behavior Recognition using Deep Learning

<hr />

#### Implemented in Python, pytorch, using HDMB51 dataset

Ensure that pytorch, torchvision, opencv and cuda are installed to execute this code

In [4]:
import torch
#check for pytorch version
print(f"PyTorch version -- {torch.__version__}")
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import random_split, DataLoader
from torch.optim.lr_scheduler import StepLR
import torchvision
from torchvision import get_video_backend
from torchvision.models.video import r3d_18 
from torchvision import transforms
import os
from tqdm.auto import tqdm
import numpy as np
import time
import av
import random
print(f"PyAV version -- {av.__version__}")

SEED = 491
torch.manual_seed(SEED)

from collections import OrderedDict
import warnings
warnings.filterwarnings('ignore')

PyTorch version -- 2.0.0
PyAV version -- 10.0.0


In [27]:
# diagnostics for PyAV installation.
def run_av_diagnostics():
    import av
    av.open("video_data/brush_hair/brush_hair/Aussie_Brunette_Brushing_Hair_II_brush_hair_u_nm_np1_ba_goo_4.avi")
    print(get_video_backend())
    av.logging.set_level(av.logging.ERROR)
    if not hasattr(av.video.frame.VideoFrame, 'pict_type'):
      print("Unavailable!")

run_av_diagnostics()

pyav


In [39]:
import torchvision.transforms as T

In [51]:
# set up a sample pytorch class to check if it works
class Model(nn.Module):
  def __init__(self):
      super(Model, self).__init__()
      self.base_model = nn.Sequential(*list(r3d_18(pretrained=False).children())[:-1])
      self.fc1 = nn.Linear(512, 51)

  def forward(self, x):
      out = self.base_model(x).squeeze(4).squeeze(3).squeeze(2)
      print("size after pretrained model ", out.size())
      out = torch.log_softmax(self.fc1(out), dim=1)
      return out

check = Model().cuda()
out = check(torch.randn(16, 3 , 8, 112,112).cuda())
out.size()

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [16]:
# Is pytorch working
class AverageMeter(object):
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def update_acc(self, val, n=1):
        self.val = val/n
        self.sum += val
        self.count += n
        self.avg = self.sum / self.count

In [17]:
# define model using PyTorch
class VideoRecognitionModel(nn.Module):
  def __init__(self):
      super(VideoRecognitionModel, self).__init__()
      self.base_model = nn.Sequential(*list(r3d_18(pretrained=True).children())[:-1])
      self.fc1 = nn.Linear(512, 51)
      self.fc2 = nn.Linear(51, 51) 
      self.dropout = nn.Dropout2d(0.3) 

  def forward(self, x):
      out = self.base_model(x).squeeze(4).squeeze(3).squeeze(2)
      out = F.relu(self.fc1(out)) 
      out = self.dropout(out) 
      out = torch.log_softmax(self.fc2(out), dim=1)
      return out

In [42]:
def train(config, model, loader, optimizer, epoch):
    model.train()
    config = {}
    config['log_interval'] = 100
    correct = 0
    total_loss = 0.0
    flag = 0
    Loss, Acc = AverageMeter(), AverageMeter()
    start = time.time()
    for batch_id, data in enumerate(loader):
        data, target = data[0], data[-1]
        # print("here")

        if torch.cuda.is_available():
           data = data.cuda()
           target = target.cuda()

        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        Loss.update(loss.item(), data.size(0))

        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        num_corrects = pred.eq(target.view_as(pred)).sum().item()
        correct += num_corrects

        Acc.update_acc(num_corrects, data.size(0))

        if flag!= 0 and batch_id%config['log_interval'] == 0:
           print('Train Epoch: {} Batch [{}/{} ({:.0f}%)]\tLoss: {:.6f} Accuracy: {}/{} ({:.0f})%'.format(
                epoch, batch_id * len(data), len(loader.dataset),
                100. * batch_id / len(loader), Loss.avg, correct, Acc.count, 100. * Acc.avg))
        flag = 1

    #total_loss /= len(loader.dataset) 
    print('Train Epoch: {} Average Loss: {:.6f} Average Accuracy: {}/{} ({:.0f})%'.format(
         epoch, Loss.avg, correct, Acc.count, 100. * Acc.avg ))
    print(f"Takes {time.time() - start}")

In [19]:
def test(config, model, loader, text='Validation'):
    model.eval()
    correct = 0
    total_loss = 0.0
    Loss, Acc = AverageMeter(), AverageMeter()
    with torch.no_grad():
         for batch_id, data in enumerate(loader):
             data, target = data[0], data[-1]

             if torch.cuda.is_available():
                data = data.cuda()
                target = target.cuda()

             output = model(data)
             loss = F.nll_loss(output, target)
             total_loss += loss.item()

             Loss.update(loss.item(), data.size(0))

             pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
             num_corrects = pred.eq(target.view_as(pred)).sum().item()
             correct += num_corrects

             Acc.update_acc(num_corrects, data.size(0))
           
    total_loss /= len(loader.dataset)
    print(text + ' Average Loss: {:.6f} Average Accuracy: {}/{} ({:.0f})%'.format(
         Loss.avg, correct, Acc.count , 100. * Acc.avg ))

In [53]:
# Datasets and Dataloaders for model training ..

val_split = 0.05
num_frames = 16 # 16
clip_steps = 50
num_workers = 8
pin_memory = True
train_tfms = torchvision.transforms.Compose([
                                 T.ToPILImage(),
                                 T.Resize((128, 171)),
                                 T.RandomCrop((112,112)),                                 
                                 T.RandomHorizontalFlip(),
                                 T.ToTensor(),
                                 T.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]),
                                 lambda x: np.rollaxis(x.numpy(), 0, 3)
                               ])  
test_tfms =  torchvision.transforms.Compose([
                                             T.ToPILImage(),
                                             T.Resize((128, 171)),
                                             T.CenterCrop((112,112)),
                                             T.ToTensor(),                                             
                                             T.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989])
                                             ])
hmdb51_train = torchvision.datasets.HMDB51('video_data/', 'test_train_splits/', num_frames,
                                                step_between_clips = clip_steps, fold=1, train=True,
                                                transform=train_tfms, num_workers=num_workers)


hmdb51_test = torchvision.datasets.HMDB51('video_data/', 'test_train_splits/', num_frames,
                                                step_between_clips = clip_steps, fold=1, train=False,
                                                transform=test_tfms, num_workers=num_workers)
      
total_train_samples = len(hmdb51_train)
total_val_samples = round(val_split * total_train_samples)

print(f"number of train samples {total_train_samples}")
print(f"number of validation samples {total_val_samples}")
print(f"number of test samples {len(hmdb51_test)}")

100%|██████████| 423/423 [15:25<00:00,  2.19s/it]  
100%|██████████| 423/423 [18:36<00:00,  2.64s/it]  


number of train samples 7754
number of validation samples 388
number of test samples 3234


In [54]:
bs = 4
lr = 1e-2
gamma = 0.7
total_epochs = 10
config = {}
num_workers = 0

kwargs = {'num_workers':num_workers, 'pin_memory':True} if torch.cuda.is_available() else {'num_workers':num_workers}
#kwargs = {'num_workers':num_workers}
#kwargs = {}

hmdb51_train_v1, hmdb51_val_v1 = random_split(hmdb51_train, [total_train_samples - total_val_samples,
                                                                       total_val_samples])

#hmdb51_train_v1.video_clips.compute_clips(16, 1, frame_rate=30)
#hmdb51_val_v1.video_clips.compute_clips(16, 1, frame_rate=30)
#hmdb51_test.video_clips.compute_clips(16, 1, frame_rate=30)

#train_sampler = RandomClipSampler(hmdb51_train_v1.video_clips, 5)
#test_sampler = UniformClipSampler(hmdb51_test.video_clips, 5)
  
train_loader = DataLoader(hmdb51_train_v1, batch_size=bs, shuffle=True, **kwargs)
val_loader   = DataLoader(hmdb51_val_v1, batch_size=bs, shuffle=True, **kwargs)
test_loader  = DataLoader(hmdb51_test, batch_size=bs, shuffle=False, **kwargs)

model = VideoRecognitionModel()

if torch.cuda.is_available():
   model = model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

print("Action Recognition Model Training")
for epoch in range(1, total_epochs + 1):
    train(config, model, train_loader, optimizer, epoch)
    test(config, model, val_loader, text="Validation")
    scheduler.step()

test(config, model, test_loader, text="Test")

Action Recognition Model Training


ValueError: pic should be 2/3 dimensional. Got 4 dimensions.