In [None]:
# Colab-only setup: mount Google Drive at /content/gdrive so the notebook can
# reach files stored there.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
cd "/content/gdrive/My Drive/fyp1"

/content/gdrive/My Drive/fyp1


In [None]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
#import torchvision
#import torchvision.models as models
#from torch.utils.data import Dataset
#import torch.optim as optim
#from torch.optim import lr_scheduler

from PIL import Image
import cv2 as cv
import json
import random

In [None]:
# Record the installed PyTorch version alongside the results for reproducibility.
print(torch.__version__)

1.12.1+cu113


---
# 1. I3D module

In [None]:
class MaxPool3dSamePadding(nn.MaxPool3d):
    """3D max pooling with TensorFlow-style 'SAME' padding computed per input.

    The padding is derived from the actual (t, h, w) size of each input so that
    the output size along every pooled axis is ceil(size / stride). The
    ``padding`` argument of ``nn.MaxPool3d`` should be left at 0.

    ``kernel_size`` and ``stride`` may be given either as a single int or as a
    3-element sequence, exactly as for ``nn.MaxPool3d``.

    Note: the input is padded with zeros (``F.pad`` default), so a padded
    position can only win the max when every real value in the window is
    negative.
    """

    @staticmethod
    def _along(value, dim):
        """Return the entry of ``value`` for axis ``dim`` (ints apply to all axes)."""
        return value if isinstance(value, int) else value[dim]

    def compute_pad(self, dim, s):
        """Total padding needed along pooled axis ``dim`` for an input of size ``s``.

        Args:
            dim: 0, 1 or 2 for the t, h or w axis.
            s: size of the input along that axis.

        Returns:
            Non-negative int: total number of padded elements (front + back).
        """
        kernel = self._along(self.kernel_size, dim)
        stride = self._along(self.stride, dim)
        remainder = s % stride
        if remainder == 0:
            return max(kernel - stride, 0)
        return max(kernel - remainder, 0)

    def forward(self, x):
        """Pad ``x`` on its last three axes for 'SAME' output size, then max-pool."""
        t, h, w = x.shape[-3:]
        pad_t = self.compute_pad(0, t)
        pad_h = self.compute_pad(1, h)
        pad_w = self.compute_pad(2, w)

        # Split each total into front/back; the extra element (odd totals) goes to the back.
        pad_t_f = pad_t // 2
        pad_h_f = pad_h // 2
        pad_w_f = pad_w // 2

        # F.pad expects pairs starting from the LAST dimension: (w, w, h, h, t, t).
        pad = (pad_w_f, pad_w - pad_w_f,
               pad_h_f, pad_h - pad_h_f,
               pad_t_f, pad_t - pad_t_f)
        x = F.pad(x, pad)

        return super().forward(x)
    
class Unit3D(nn.Module):
    """Conv3d -> optional BatchNorm3d -> optional activation, with 'SAME' padding.

    The convolution itself is always built with ``padding=0``; the padding that
    keeps the output size at ceil(size / stride) is computed from the actual
    input size in ``forward`` and applied with ``F.pad`` (zero padding).
    """

    def __init__(self, in_channels,
                 output_channels,
                 kernel_shape=(1, 1, 1),
                 stride=(1, 1, 1),
                 padding=0,
                 activation_fn=F.relu,
                 use_batch_norm=True,
                 use_bias=False,
                 name='unit_3d'):
        """Initializes Unit3D module.

        Args:
            in_channels: number of input channels.
            output_channels: number of output channels.
            kernel_shape: int or 3-element sequence (t, h, w) kernel size.
            stride: int or 3-element sequence (t, h, w) stride.
            padding: stored on the instance but NOT used for the convolution;
                padding is computed dynamically in ``forward``.
            activation_fn: callable applied last, or None for no activation.
            use_batch_norm: whether to add a BatchNorm3d after the convolution.
            use_bias: whether the convolution has a bias term.
            name: label stored on the instance.
        """
        super(Unit3D, self).__init__()

        self._output_channels = output_channels
        # Normalised to 3-tuples so compute_pad can index them even when an int was given.
        self._kernel_shape = self._to_triple(kernel_shape)
        self._stride = self._to_triple(stride)
        self._use_batch_norm = use_batch_norm
        self._activation_fn = activation_fn
        self._use_bias = use_bias
        self.name = name
        self.padding = padding

        self.conv3d = nn.Conv3d(in_channels=in_channels,
                                out_channels=self._output_channels,
                                kernel_size=self._kernel_shape,
                                stride=self._stride,
                                padding=0,  # always 0 here; forward() pads dynamically based on input size
                                bias=self._use_bias)

        if self._use_batch_norm:
            self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)

    @staticmethod
    def _to_triple(value):
        """Expand an int to (v, v, v); convert a sequence to a tuple."""
        if isinstance(value, int):
            return (value, value, value)
        return tuple(value)

    def compute_pad(self, dim, s):
        """Total 'SAME' padding along axis ``dim`` (0=t, 1=h, 2=w) for input size ``s``."""
        kernel = self._kernel_shape[dim]
        stride = self._stride[dim]
        remainder = s % stride
        if remainder == 0:
            return max(kernel - stride, 0)
        return max(kernel - remainder, 0)

    def forward(self, x):
        """Apply the padded convolution block to ``x`` of shape (b, c, t, h, w)."""
        (_, _, t, h, w) = x.size()
        pad_t = self.compute_pad(0, t)
        pad_h = self.compute_pad(1, h)
        pad_w = self.compute_pad(2, w)

        # Split each total into front/back; the extra element (odd totals) goes to the back.
        pad_t_f = pad_t // 2
        pad_h_f = pad_h // 2
        pad_w_f = pad_w // 2

        # F.pad expects pairs starting from the LAST dimension: (w, w, h, h, t, t).
        pad = (pad_w_f, pad_w - pad_w_f,
               pad_h_f, pad_h - pad_h_f,
               pad_t_f, pad_t - pad_t_f)
        x = F.pad(x, pad)

        x = self.conv3d(x)
        if self._use_batch_norm:
            x = self.bn(x)
        if self._activation_fn is not None:
            x = self._activation_fn(x)

        return x

class InceptionModule(nn.Module):
    """Four-branch Inception block whose branch outputs are concatenated on channels.

    ``out_channels`` is indexed as:
        [0] branch 0: 1x1x1 conv
        [1] branch 1: 1x1x1 conv feeding ...
        [2] branch 1: 3x3x3 conv
        [3] branch 2: 1x1x1 conv feeding ...
        [4] branch 2: 3x3x3 conv
        [5] branch 3: 1x1x1 conv applied after a 3x3x3 stride-1 max pool
    so the block outputs out_channels[0] + [2] + [4] + [5] channels.
    """

    def __init__(self, in_channels, out_channels, name):
        super().__init__()

        def pointwise(n_in, n_out, suffix):
            # 1x1x1 convolution unit
            return Unit3D(in_channels=n_in, output_channels=n_out,
                          kernel_shape=[1, 1, 1], padding=0, name=name + suffix)

        def cubic(n_in, n_out, suffix):
            # 3x3x3 convolution unit
            return Unit3D(in_channels=n_in, output_channels=n_out,
                          kernel_shape=[3, 3, 3], name=name + suffix)

        # Attribute names (b0, b1a, ...) are part of the checkpoint key names; keep them.
        self.b0 = pointwise(in_channels, out_channels[0], '/Branch_0/Conv3d_0a_1x1')
        self.b1a = pointwise(in_channels, out_channels[1], '/Branch_1/Conv3d_0a_1x1')
        self.b1b = cubic(out_channels[1], out_channels[2], '/Branch_1/Conv3d_0b_3x3')
        self.b2a = pointwise(in_channels, out_channels[3], '/Branch_2/Conv3d_0a_1x1')
        self.b2b = cubic(out_channels[3], out_channels[4], '/Branch_2/Conv3d_0b_3x3')
        self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3],
                                        stride=(1, 1, 1), padding=0)
        self.b3b = pointwise(in_channels, out_channels[5], '/Branch_3/Conv3d_0b_1x1')
        self.name = name

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them along dim 1."""
        branch_outputs = [
            self.b0(x),
            self.b1b(self.b1a(x)),
            self.b2b(self.b2a(x)),
            self.b3b(self.b3a(x)),
        ]
        return torch.cat(branch_outputs, dim=1)

class InceptionI3d(nn.Module):
    """Inception-v1 I3D architecture.
    The model is introduced in:
        Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
        Joao Carreira, Andrew Zisserman
        https://arxiv.org/pdf/1705.07750v1.pdf.
    See also the Inception architecture, introduced in:
        Going deeper with convolutions
        Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
        Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
        http://arxiv.org/pdf/1409.4842v1.pdf.
    """

    # Endpoints of the model in order. The model is built up to (and including)
    # the designated `final_endpoint`; the feature layers are kept in
    # `self.end_points` and registered as sub-modules under these names.
    VALID_ENDPOINTS = (
        'Conv3d_1a_7x7',
        'MaxPool3d_2a_3x3',
        'Conv3d_2b_1x1',
        'Conv3d_2c_3x3',
        'MaxPool3d_3a_3x3',
        'Mixed_3b',
        'Mixed_3c',
        'MaxPool3d_4a_3x3',
        'Mixed_4b',
        'Mixed_4c',
        'Mixed_4d',
        'Mixed_4e',
        'Mixed_4f',
        'MaxPool3d_5a_2x2',
        'Mixed_5b',
        'Mixed_5c',
        'Logits',
        'Predictions',
    )

    def __init__(self, num_classes=400, spatial_squeeze=True,
                 final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5):
        """Initializes I3D model instance.
        Args:
          num_classes: The number of outputs in the logit layer (default 400, which
              matches the Kinetics dataset).
          spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
              before returning (default True).
          final_endpoint: The last endpoint the model is built up to. Must be one
              of InceptionI3d.VALID_ENDPOINTS (default 'Logits'). When it names a
              feature layer (anything before 'Logits'), no classification head is
              created and `forward` returns that layer's output.
          name: A string (optional). Prefix for the `name` of each sub-unit.
          in_channels: Number of input channels (3 for RGB, 2 for optical flow).
          dropout_keep_prob: Probability of KEEPING an activation in the dropout
              layer before the logits (the layer is built with p = 1 - keep_prob).
        Raises:
          ValueError: if `final_endpoint` is not recognized.
        """

        if final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError('Unknown final endpoint %s' % final_endpoint)

        super(InceptionI3d, self).__init__()
        self._num_classes = num_classes
        self._spatial_squeeze = spatial_squeeze
        self._final_endpoint = final_endpoint
        self.logits = None

        # (endpoint name, factory) in network order. Each factory receives the
        # scoped unit name (`name + endpoint`); pooling layers ignore it.
        layer_factories = (
            ('Conv3d_1a_7x7', lambda n: Unit3D(in_channels=in_channels, output_channels=64,
                                               kernel_shape=[7, 7, 7], stride=(2, 2, 2),
                                               padding=(3, 3, 3), name=n)),
            ('MaxPool3d_2a_3x3', lambda n: MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2),
                                                                padding=0)),
            ('Conv3d_2b_1x1', lambda n: Unit3D(in_channels=64, output_channels=64,
                                               kernel_shape=[1, 1, 1], padding=0, name=n)),
            ('Conv3d_2c_3x3', lambda n: Unit3D(in_channels=64, output_channels=192,
                                               kernel_shape=[3, 3, 3], padding=1, name=n)),
            ('MaxPool3d_3a_3x3', lambda n: MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2),
                                                                padding=0)),
            ('Mixed_3b', lambda n: InceptionModule(192, [64, 96, 128, 16, 32, 32], n)),
            ('Mixed_3c', lambda n: InceptionModule(256, [128, 128, 192, 32, 96, 64], n)),
            ('MaxPool3d_4a_3x3', lambda n: MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2),
                                                                padding=0)),
            ('Mixed_4b', lambda n: InceptionModule(128+192+96+64, [192, 96, 208, 16, 48, 64], n)),
            ('Mixed_4c', lambda n: InceptionModule(192+208+48+64, [160, 112, 224, 24, 64, 64], n)),
            ('Mixed_4d', lambda n: InceptionModule(160+224+64+64, [128, 128, 256, 24, 64, 64], n)),
            ('Mixed_4e', lambda n: InceptionModule(128+256+64+64, [112, 144, 288, 32, 64, 64], n)),
            ('Mixed_4f', lambda n: InceptionModule(112+288+64+64, [256, 160, 320, 32, 128, 128], n)),
            ('MaxPool3d_5a_2x2', lambda n: MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2),
                                                                padding=0)),
            ('Mixed_5b', lambda n: InceptionModule(256+320+128+128, [256, 160, 320, 32, 128, 128], n)),
            ('Mixed_5c', lambda n: InceptionModule(256+320+128+128, [384, 192, 384, 48, 128, 128], n)),
        )

        self.end_points = {}
        for end_point, make_layer in layer_factories:
            self.end_points[end_point] = make_layer(name + end_point)
            if self._final_endpoint == end_point:
                # Truncated model: the layers must still be registered, otherwise
                # they are invisible to .to(), .eval(), state_dict() and parameters().
                self.build()
                return

        # 'Logits' / 'Predictions': classification head.
        self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7],
                                     stride=(1, 1, 1))
        # nn.Dropout takes the probability of ZEROING an element, hence 1 - keep_prob.
        self.dropout = nn.Dropout(1.0 - dropout_keep_prob)
        self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes,
                             kernel_shape=[1, 1, 1],
                             padding=0,
                             activation_fn=None,
                             use_batch_norm=False,
                             use_bias=True,
                             name='logits')

        self.build()

    def replace_logits(self, num_classes):
        """Replace the logit layer with a freshly initialised one for ``num_classes`` outputs."""
        self._num_classes = num_classes
        self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes,
                             kernel_shape=[1, 1, 1],
                             padding=0,
                             activation_fn=None,
                             use_batch_norm=False,
                             use_bias=True,
                             name='logits')

    def build(self):
        """Register every layer in ``self.end_points`` as a sub-module under its endpoint name."""
        for k in self.end_points.keys():
            self.add_module(k, self.end_points[k])

    def forward(self, x):
        """Run the network on ``x`` of shape (b, c, t, h, w).

        Returns:
            The logits. With ``spatial_squeeze`` the two spatial axes (dims 3 and
            4) are squeezed, giving (batch, classes, time) when they have size 1.
            For a model truncated before 'Logits', the output of the final
            feature endpoint is returned instead.
        """
        for end_point in self.VALID_ENDPOINTS:
            if end_point in self.end_points:
                x = self._modules[end_point](x)  # use _modules to work with dataparallel

        if self.logits is None:
            # Built with a final_endpoint before 'Logits': there is no head.
            return x

        x = self.logits(self.dropout(self.avg_pool(x)))
        if self._spatial_squeeze:
            # squeeze(3) twice removes dims 3 and 4 (h, w) when they have size 1
            x = x.squeeze(3).squeeze(3)
        return x

    def extract_features(self, x, myEndPoint):
        """Extract clip features at endpoint ``myEndPoint``.

        Layers are applied in order and the loop stops after ``myEndPoint``. If
        ``myEndPoint`` does not name a built feature layer, all built feature
        layers are applied.

        Args:
            x: clip tensor (b, c, nframe, h, w). The original author used
               (1, 3, 16, 240, 320) for full frames and (1, 3, 16, 210, 280) for
               crops, reporting (1, 1024, 2, 8, 10) / (1, 1024, 2, 7, 9) feature
               maps at 'Mixed_5c'.
            myEndPoint: name of the endpoint whose output is used.

        Returns:
            (spatial_fea, pooling_fea):
              spatial_fea: L2 norm of the feature map over the time axis, i.e.
                  (c, h, w) for a batch of one, (b, c, h, w) otherwise.
              pooling_fea: spatial_fea averaged over h and w, i.e. (c,) for a
                  batch of one, (b, c) otherwise.
        """
        for end_point in self.VALID_ENDPOINTS:
            if end_point in self.end_points:
                x = self._modules[end_point](x)
            if end_point == myEndPoint:
                break

        # L2 norm over the time axis (dim 2); the axis is removed: (b,c,t,h,w) -> (b,c,h,w)
        x = torch.norm(x, dim=2)

        # Drop only the batch axis (and only when it has size 1), so a spatial
        # axis of size 1 is never removed by accident.
        spatial_fea = x.squeeze(0)

        # Global average pooling over (h, w): (b,c,h,w) -> (b,c,1,1) -> (b,c) -> (c,) for b == 1
        pooling_fea = F.adaptive_avg_pool2d(x, (1, 1)).flatten(1).squeeze(0)

        return spatial_fea, pooling_fea

In [None]:
def custom_I3D(mode='rgb', load_model='./rgb_imagenet.pt'):
    """Build a 400-class InceptionI3d and load weights from ``load_model``.

    Args:
        mode: 'flow' builds a 2-channel (optical flow) network; any other value
            builds a 3-channel (RGB) network.
        load_model: path to a state-dict checkpoint matching the chosen mode.
            Note the default is an RGB checkpoint, so pass a flow checkpoint
            explicitly when ``mode='flow'``.

    Returns:
        The InceptionI3d instance with the checkpoint loaded (on CPU).
    """
    in_channels = 2 if mode == 'flow' else 3
    i3d = InceptionI3d(400, in_channels=in_channels)

    # map_location='cpu' lets a checkpoint saved from a GPU load on a CPU-only
    # machine; callers move the model to the desired device afterwards.
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    state_dict = torch.load(load_model, map_location='cpu')
    i3d.load_state_dict(state_dict)
    return i3d

In [None]:
def feature_extract(i3d, x, myEndPoint):
    """Run ``i3d.extract_features`` on ``x`` in eval mode without gradients.

    The model and the input are moved to CUDA when it is available, otherwise
    to the CPU. The model is switched to evaluation mode as a side effect.

    Args:
        i3d: model exposing ``to``, ``eval`` and ``extract_features(x, endpoint)``.
        x: input clip tensor, e.g. (b, c, nframe, h, w) = (1, 3, 16, 240, 320)
           for full frames or (1, 3, 16, 210, 280) for crops.
        myEndPoint: endpoint name forwarded to ``extract_features``.

    Returns:
        (spatial_fea, pooling_fea) exactly as returned by ``extract_features``;
        for the shapes above the author reports (1024, 8, 10) / (1024, 7, 9)
        and (1024,).
    """
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # put model and data on the same device
    i3d = i3d.to(target)
    x = x.to(target)

    # inference only: evaluation mode, no autograd bookkeeping
    i3d.eval()
    with torch.no_grad():
        spatial, pooled = i3d.extract_features(x, myEndPoint)

    return spatial, pooled

---
# 2. Helper functions

In [None]:
def getVideoPath(dir_path):
    """List every entry of ``dir_path`` as a path joined onto ``dir_path``.

    Example: a folder ./dataset/Robbery containing Robbery102_x264.mp4 yields
    ['./dataset/Robbery/Robbery102_x264.mp4'].

    Entries are returned in the order given by ``os.listdir`` and are not
    filtered, so sub-directories and non-video files are included as well.
    """
    return [os.path.join(dir_path, entry) for entry in os.listdir(dir_path)]

In [None]:
# Per-frame preprocessing applied to each PIL image before it is stacked into a clip.
transform = transforms.Compose([
    #transforms.Resize((240,320)),
    # PIL image -> float tensor (c, h, w) with values scaled to [0, 1]
    transforms.ToTensor(),
    # (x - 0.5) / 0.5 per channel, i.e. [0, 1] -> [-1, 1]
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

In [None]:
def _crop_window_for(category, randCrop_version):
    """Return (height_top, height_bottom, width_left, width_right, flip) for ``category``.

    Crops are 210x280 windows inside a 240x320 frame:
      * "random-crop": random top-left corner and random horizontal flip.
      * "ori": the full 240x320 frame, no flip.
      * "10-crop": ``randCrop_version`` 0-4 select top-left, top-right,
        bottom-left, bottom-right and centre; 5-9 are the same windows flipped.
    """
    if category == "random-crop":
        # randint is inclusive on both ends: offsets 0..30 / 0..40 are exactly
        # those that keep a 210x280 window inside a 240x320 frame.
        height_top = random.randint(0, 30)
        width_left = random.randint(0, 40)
        flip = random.randint(0, 1) == 1
        crop_h, crop_w = 210, 280
    elif category == "ori":
        height_top, width_left, flip = 0, 0, False
        crop_h, crop_w = 240, 320
    elif category == "10-crop":
        # centre window: top-left corner = (120-105, 160-140) = (15, 20)
        corners = [[0, 0], [0, 40], [30, 0], [30, 40], [15, 20]] * 2
        height_top, width_left = corners[randCrop_version]
        flip = randCrop_version > 4  # versions 5~9 are the flipped copies
        crop_h, crop_w = 210, 280
    else:
        raise ValueError(
            'Unknown category %r (expected "random-crop", "ori" or "10-crop")' % (category,))
    return height_top, height_top + crop_h, width_left, width_left + crop_w, flip


def extract_feature(video_path, i3d, category, randCrop_version):
    """Extract one I3D 'Mixed_5c' feature per 16-frame clip of a video.

    Frames are cropped (and optionally flipped) according to ``category``,
    converted to RGB, passed through the module-level ``transform`` and grouped
    into consecutive 16-frame clips. Videos shorter than 512 frames
    (32 segments x 16 frames) are padded with black frames up to 512. Trailing
    frames that do not fill a whole clip are dropped.

    Assumes the video frames are 240x320 (h x w); frames are not resized - TODO
    confirm against the dataset.

    Args:
        video_path: path of the video file.
        i3d: I3D model passed on to ``feature_extract``.
        category: "ori", "random-crop" or "10-crop".
        randCrop_version: index 0-9 of the crop, only used for "10-crop".

    Returns:
        (clips_spatial_fea, clips_pooling_fea) as CPU tensors of shape
        (nclip, c, h, w) and (nclip, c); c is 1024 at 'Mixed_5c'.

    Raises:
        ValueError: if ``category`` is not recognised.
        Exception: if the video cannot be opened, a feature contains NaN, the
            video reaches the 21000-clip limit, or the clip count is inconsistent.
    """
    height_top, height_bottom, width_left, width_right, random_flip = _crop_window_for(
        category, randCrop_version)

    min_total_frame = 512  # 32 segments * 16 frames
    max_nclip = 21000      # upper bound on clips per video (kept from the original implementation)

    clip_frames = []    # PIL frames of the clip being assembled
    spatial_clips = []  # one (c,h,w) tensor per finished clip
    pooling_clips = []  # one (c,) tensor per finished clip
    total_frame = 0
    curr_nclip = 0

    # Reused buffer for the 16 transformed frames of a clip: (nframe, c, h, w).
    # Every row is overwritten before each use, so it needs no initial values.
    sixteen_frames = torch.empty(16, 3, height_bottom - height_top, width_right - width_left)

    cap = cv.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise Exception("Error opening video file")

        while True:
            ret, frame = cap.read()
            if not ret:
                if total_frame >= min_total_frame:
                    break  # video finished and long enough
                # video finished but too short: pad with a black frame
                frame = np.zeros((240, 320, 3), dtype=np.uint8)

            total_frame += 1

            # crop, optional horizontal flip, BGR -> RGB, to PIL
            frame = frame[height_top:height_bottom, width_left:width_right]
            if random_flip:
                frame = cv.flip(frame, 1)
            frame = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
            clip_frames.append(Image.fromarray(frame))

            if len(clip_frames) < 16:
                continue

            # ---- a full 16-frame clip is ready: extract its I3D feature ----
            for i, pil_frame in enumerate(clip_frames):
                sixteen_frames[i] = transform(pil_frame)
            clip_frames = []

            # (nframe,c,h,w) -> (c,nframe,h,w) -> (b,c,nframe,h,w)
            frames = sixteen_frames.transpose(0, 1).unsqueeze(0)

            # feature_extract moves model and data to the available device itself
            with torch.no_grad():
                spatial_fea, pooling_fea = feature_extract(i3d, frames, myEndPoint="Mixed_5c")
            spatial_fea = spatial_fea.detach().to("cpu", torch.float32)
            pooling_fea = pooling_fea.detach().to("cpu", torch.float32)

            if torch.isnan(spatial_fea).any().item() or torch.isnan(pooling_fea).any().item():
                raise Exception("[!] nan value exist after forward to model")

            # Collect per-clip tensors and stack once at the end: memory grows
            # with the real clip count and repeated torch.cat is avoided.
            spatial_clips.append(spatial_fea.clone())
            pooling_clips.append(pooling_fea.clone())
            curr_nclip += 1

            if curr_nclip >= max_nclip:
                raise Exception("[!] total number of clips exceed defined maximum number of clips")
    finally:
        # always free the decoder, also when an exception is raised
        cap.release()

    if total_frame//16 != curr_nclip:
        raise Exception("[!] total_frame//16 != curr_nclip. total_frame", total_frame, "curr_nclip", curr_nclip)
    if curr_nclip < 32:
        raise Exception("[!] Required at least 32 clips but only having", curr_nclip)

    clips_spatial_fea = torch.stack(spatial_clips)  # (nclip,c,h,w)
    clips_pooling_fea = torch.stack(pooling_clips)  # (nclip,c)
    return clips_spatial_fea, clips_pooling_fea

In [None]:
#special extract step for long video (5~9 hour)
def hardcode_extract(video_path, i3d, category, randCrop_version):
    """Extract 32 segment-level I3D features for one of the known very long videos.

    Slower but lighter variant for long videos: per-clip features are only kept
    until the current segment is complete, then averaged into that segment. The
    segment boundaries (in clips) are hard-coded per video, so only the videos
    listed in ``known_gaps`` are supported.

    Assumes the video frames are 240x320 (h x w); frames are not resized - TODO
    confirm against the dataset.

    Args:
        video_path: path of the video; must contain one of the known file names.
        i3d: I3D model passed on to ``feature_extract``.
        category: "ori", "random-crop" or "10-crop".
        randCrop_version: index 0-9 of the crop, only used for "10-crop".

    Returns:
        (segm_spatial_fea, segm_pooling_fea) of shape (c, nsegment, h, w) and
        (c, nsegment) = (1024, 32, ?, ?) and (1024, 32).

    Raises:
        ValueError: if ``category`` or the video is not recognised.
        Exception: if the video cannot be opened, a feature contains NaN, a
            segment exceeds 2000 clips, the clip count is inconsistent, or the
            video ends before all 32 segments were filled.
    """
    #================= data augmentation based on category ===============
    if category == "random-crop":
        # cropped size = (h,w):(210,280); randint is inclusive, so the offsets
        # 0..30 / 0..40 keep the window inside a 240x320 frame
        height_top = random.randint(0, 30)
        width_left = random.randint(0, 40)
        random_flip = random.randint(0, 1) == 1
        crop_h, crop_w = 210, 280
    elif category == "ori":
        # original size = (h,w):(240,320)
        height_top, width_left, random_flip = 0, 0, False
        crop_h, crop_w = 240, 320
    elif category == "10-crop":
        # versions 0-4: top-left, top-right, bottom-left, bottom-right, centre
        # (centre top-left corner = (120-105, 160-140) = (15, 20)); 5-9: flipped copies
        tenCrop_loc = [[0, 0], [0, 40], [30, 0], [30, 40], [15, 20]] * 2
        height_top, width_left = tenCrop_loc[randCrop_version]
        random_flip = randCrop_version > 4
        crop_h, crop_w = 210, 280
    else:
        raise ValueError(
            'Unknown category %r (expected "random-crop", "ori" or "10-crop")' % (category,))
    height_bottom = height_top + crop_h
    width_right = width_left + crop_w

    #================ get gaps for 32 segment =========================
    # 33 cumulative clip counts per video: segment i covers clips gaps[i] .. gaps[i+1]-1.
    known_gaps = {
        "Normal_Videos308_x264.mp4": [0,  1907,  3814,  5722,  7629,  9536, 11443, 13351, 15258, 17165, 19072, 20979, 22887, 24794, 26701, 28608, 30516, 32423, 34330, 36237, 38144, 40052, 41959, 43866, 45773, 47680, 49588, 51495, 53402, 55309, 57217, 59124, 61031],
        "Normal_Videos307_x264.mp4": [0,  1227,  2453,  3680,  4906,  6133,  7360,  8586,  9813, 11039, 12266, 13493, 14719, 15946, 17172, 18399, 19626, 20852, 22079, 23305, 24532, 25758, 26985, 28212, 29438, 30665, 31891, 33118, 34345, 35571, 36798, 38024, 39251],
        "Normal_Videos633_x264.mp4": [0,   862,  1724,  2586,  3448,  4310,  5171,  6033,  6895,  7757, 8619,  9481, 10343, 11205, 12067, 12929, 13790, 14652, 15514, 16376, 17238, 18100, 18962, 19824, 20686, 21548, 22410, 23271, 24133, 24995, 25857, 26719, 27581],
    }
    gaps = None
    for video_name, video_gaps in known_gaps.items():
        if video_name in video_path:
            gaps = video_gaps
            break
    if gaps is None:
        raise ValueError("No hard-coded segment gaps for video %r" % (video_path,))

    # cumulative clip count at which a segment ends -> index of that segment
    segment_ending_at = {}
    for idx, clip_count in enumerate(gaps[1:]):
        segment_ending_at.setdefault(clip_count, idx)

    max_nclip = 2000       # assume maximum nclip for 1 segment is 2000
    clip_frames = []       # PIL frames of the clip being assembled
    segment_spatial = []   # per-clip (c,h,w) features of the current segment
    segment_pooling = []   # per-clip (c,) features of the current segment
    segm_spatial_fea = None  # (nsegment,c,h,w), allocated once the spatial size is known
    segm_pooling_fea = None  # (nsegment,c)
    segments_filled = 0
    total_frame = 0
    curr_nclip = 0

    # Reused buffer for the 16 transformed frames of a clip: (nframe, c, h, w).
    # Every row is overwritten before each use, so it needs no initial values.
    sixteen_frames = torch.empty(16, 3, crop_h, crop_w)

    #============== load video =============
    cap = cv.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise Exception("Error opening video file")

        while True:
            ret, frame = cap.read()
            if not ret:
                break  # video end
            total_frame += 1

            # crop, optional horizontal flip, BGR -> RGB, to PIL
            frame = frame[height_top:height_bottom, width_left:width_right]
            if random_flip:
                frame = cv.flip(frame, 1)
            frame = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
            clip_frames.append(Image.fromarray(frame))

            if len(clip_frames) < 16:
                continue

            # ---- a full 16-frame clip is ready: extract its I3D feature ----
            for i, pil_frame in enumerate(clip_frames):
                sixteen_frames[i] = transform(pil_frame)
            clip_frames = []

            # (nframe,c,h,w) -> (c,nframe,h,w) -> (b,c,nframe,h,w)
            frames = sixteen_frames.transpose(0, 1).unsqueeze(0)

            # feature_extract moves model and data to the available device itself
            with torch.no_grad():
                spatial_fea, pooling_fea = feature_extract(i3d, frames, myEndPoint="Mixed_5c")
            spatial_fea = spatial_fea.detach().to("cpu", torch.float32)
            pooling_fea = pooling_fea.detach().to("cpu", torch.float32)

            if torch.isnan(spatial_fea).any().item() or torch.isnan(pooling_fea).any().item():
                raise Exception("[!] nan value exist after forward to model")

            if segm_spatial_fea is None:
                # first clip: the spatial size of the feature map is now known
                segm_spatial_fea = torch.empty((32, 1024, spatial_fea.shape[1], spatial_fea.shape[2]))
                segm_pooling_fea = torch.empty((32, 1024))

            # checked BEFORE storing, so the limit is reported with the intended message
            if len(segment_spatial) >= max_nclip:
                raise Exception("[!] total number of clips exceed defined maximum number of clips")

            segment_spatial.append(spatial_fea.clone())
            segment_pooling.append(pooling_fea.clone())
            curr_nclip += 1

            #=============== when nclip is enough for 1 segment, convert clips to 1 segment ===================
            idx = segment_ending_at.get(curr_nclip)
            if idx is not None:
                segm_spatial_fea[idx] = torch.stack(segment_spatial).mean(dim=0)
                segm_pooling_fea[idx] = torch.stack(segment_pooling).mean(dim=0)
                segment_spatial = []
                segment_pooling = []
                segments_filled += 1
    finally:
        # always free the decoder, also when an exception is raised
        cap.release()

    if total_frame//16 != curr_nclip:
        raise Exception("[!] total_frame//16 != curr_nclip. total_frame", total_frame, "curr_nclip", curr_nclip)
    if segments_filled != 32:
        # the video ended before the last hard-coded boundary: the remaining
        # segments were never written and must not be returned
        raise Exception("[!] only", segments_filled, "of 32 segments were filled. curr_nclip", curr_nclip)
    print("total clips:", curr_nclip, end="\t")

    segm_spatial_fea = segm_spatial_fea.permute(1,0,2,3) #convert (nsegment,c,h,w) to (c,nsegment,h,w)
    segm_pooling_fea = segm_pooling_fea.permute(1,0)     #convert (nsegment,c) to (c,nsegment)
    return segm_spatial_fea, segm_pooling_fea


In [None]:
def to32_segments(x_spatial, x, nsegments=32):
    """Average per-clip features into a fixed number of temporal segments.

    The clip axis is cut at ``nsegments + 1`` evenly spaced (rounded) boundaries
    between 0 and ``nclip``; every segment is the mean of the clips that fall
    inside it.

    Args:
        x_spatial: per-clip spatial features, shape (nclip, c, h, w).
        x: per-clip pooled features, shape (nclip, c). Its first dimension
            defines the boundaries that are applied to both inputs.
        nsegments: number of output segments (default 32, the original
            hard-coded value).

    Returns:
        (segments_spatial_fea, segments_pooling_fea) with shapes
        (c, nsegment, h, w) and (c, nsegment). Both are freshly allocated
        tensors of the default dtype and carry no autograd history.

    Raises:
        Exception: if any segment mean contains NaN. This happens in particular
            when a segment receives no clip at all (fewer clips than segments).
    """
    nclips = x.shape[0]

    # nsegments + 1 boundaries delimit nsegments half-open intervals [gaps[i], gaps[i+1]).
    # .to() + .tolist() avoids the "copy construct" UserWarning that
    # torch.tensor(<tensor>) emits and yields plain Python ints for slicing.
    gaps = torch.round(torch.linspace(0, nclips, nsegments + 1)).to(torch.int64).tolist()

    def _segment_means(features):
        """Return a (nsegments, *features.shape[1:]) tensor of per-segment means."""
        # Every row is overwritten below, so uninitialised memory is sufficient.
        segments = torch.empty((nsegments,) + tuple(features.shape[1:]))
        for i in range(nsegments):
            segment = features[gaps[i]:gaps[i + 1]].detach().mean(dim=0)
            # The mean over an empty slice is NaN; fail loudly instead of saving it.
            if torch.isnan(segment).any().item():
                raise Exception("[!] nan value exist when seperating to 32 segments", segment)
            segments[i] = segment
        return segments

    #========== for pooling feature =================
    segments_pooling_fea = _segment_means(x).transpose(0, 1)  #(nsegment,c) -> (c,nsegment)

    #========== for spatial feature ===============
    segments_spatial_fea = _segment_means(x_spatial).permute(1, 0, 2, 3)  #(nsegment,c,h,w) -> (c,nsegment,h,w)

    return segments_spatial_fea, segments_pooling_fea
    #segments_pooling_fea = (c,nsegment)
    #segments_spatial_fea = (c,nsegment,h,w)

In [None]:
def createFolder(save_dir):
    """Create the output folder tree under ``save_dir`` if it does not exist yet.

    Layout (each feature category has a sibling ``*_spatial`` folder):
        save_dir/train/{ori, 10-crop, random-crop}[_spatial]
        save_dir/test/{10-crop}[_spatial]

    Prints one status line per folder: "created folder <path>" when it had to
    be created, "<path> ready" when it was already there.

    Args:
        save_dir: root output directory, e.g. "./feature_extracted/". Missing
            parent directories are created as well.

    Raises:
        FileExistsError: if one of the paths exists but is not a directory.
    """
    train_dir = os.path.join(save_dir, "train") #"./feature_extracted/train/"
    test_dir = os.path.join(save_dir, "test")   #"./feature_extracted/test/"

    paths = [save_dir, train_dir]
    for category in ("ori", "10-crop", "random-crop"):
        paths.append(os.path.join(train_dir, category))
        paths.append(os.path.join(train_dir, category + "_spatial"))
    paths.append(test_dir)
    paths.append(os.path.join(test_dir, "10-crop"))
    paths.append(os.path.join(test_dir, "10-crop_spatial"))

    for path in paths:
        if os.path.isdir(path):
            print(path, "ready")
        else:
            # makedirs (unlike mkdir) also works when the parent of save_dir is
            # missing; exist_ok guards against a concurrent creation.
            os.makedirs(path, exist_ok=True)
            print("created folder", path)


In [None]:
def get_save_path(video_path, save_dir, isTrain, category=None, randCrop_version=None):
    """Build the two output paths (pooled and spatial features) for one video.

    Both files share the name ``<video stem>_version<randCrop_version>.pt`` and
    live under ``save_dir/<train|test>/<category>`` and
    ``save_dir/<train|test>/<category>_spatial`` respectively.

    Examples:
        video_path = './dataset/Shoplifting/Shoplifting021_x264.mp4'
        save_dir = "./feature_extracted/", isTrain=True, category="random-crop", randCrop_version=1
        -> "./feature_extracted/train/random-crop/Shoplifting021_x264_version1.pt"
        -> "./feature_extracted/train/random-crop_spatial/Shoplifting021_x264_version1.pt"

    Returns:
        (save_path, spatial_save_path)
    """
    video_stem, _extension = os.path.splitext(os.path.basename(video_path)) #eg. Shoplifting021_x264
    feature_file = video_stem + "_version" + str(randCrop_version) + ".pt"

    # Only an isTrain that compares equal to True selects the train split.
    split = "train" if isTrain == True else "test"

    save_path = os.path.join(save_dir, split, category, feature_file)
    spatial_save_path = os.path.join(save_dir, split, category + "_spatial", feature_file)
    return save_path, spatial_save_path

In [None]:
def isTrainAnomaly(video_path, anomaly_train):
    """Return True if ``video_path`` is listed in the training split.

    Args:
        video_path: path to the video, e.g.
            './dataset/Shoplifting/Shoplifting021_x264.mp4'.
        anomaly_train: iterable of split entries of the form
            '<class folder>/<file name>', e.g. 'Shoplifting/Shoplifting021_x264.mp4'.
            Entries may be raw lines of the split file: surrounding whitespace
            (including a trailing '\\n' or '\\r\\n') is ignored, so the last line
            of a file without a final newline is matched as well.

    Returns:
        bool: True when '<parent folder name>/<file name>' of ``video_path``
        equals one of the entries, False otherwise.
    """
    dir_name = os.path.basename(os.path.dirname(video_path)) #eg. Shoplifting
    basename = os.path.basename(video_path) #eg. Shoplifting021_x264.mp4
    key = dir_name + "/" + basename #eg. Shoplifting/Shoplifting021_x264.mp4

    return any(entry.strip() == key for entry in anomaly_train)

In [None]:
def run(video_dir, save_dir, anomaly_train_txt=None, overwrite=False, category=None, randCrop_version=0): #category = ori, 10-crop, random-crop
    """Extract features for every video of ``video_dir`` and save them as .pt files.

    For each video two tensors are written with ``torch.save``: the pooled
    feature (to ``save_path``) and the spatial feature (to ``spatial_save_path``),
    both obtained from ``get_save_path``.

    * Train videos are reduced to 32 segments with ``to32_segments``.
    * Test videos keep one feature per clip, permuted to (c,nclip[,h,w]).
    * Three very long videos (names hard-coded below) go through
      ``hardcode_extract``, which already returns segment features.
    * Test videos are skipped for the "ori" and "random-crop" categories and
      for ``randCrop_version > 9``.

    Args:
        video_dir: directory handed to ``getVideoPath`` (defined elsewhere in
            this notebook; presumably returns the list of video file paths).
        save_dir: root of the output tree, see ``createFolder``.
        anomaly_train_txt: "Train_Normal" or "Test_Normal" to treat the whole
            directory as train/test data; any other value is opened as a text
            file listing the training videos ('<class>/<file>' per line), and
            each video is classified with ``isTrainAnomaly``.
        overwrite: when False, videos whose two output files both exist are skipped.
        category: "ori", "10-crop" or "random-crop".
        randCrop_version: version index forwarded to the extractors and used
            in the output file name.
    """
    #read path
    #eg. video_dir = "./dataset/"
    video_paths = getVideoPath(video_dir)
    print("===========", category, "version", randCrop_version, "=================")

    anomaly_train = [] # eg. anomaly_train = ['Testing_Normal_Videos_Anomaly/Normal_Videos_944_x264.mp4\n', 'Vandalism/Vandalism007_x264.mp4\n']

    isAnomaly = False
    isTrain = False
    if anomaly_train_txt == "Test_Normal":
        print("normal test")
    elif anomaly_train_txt == "Train_Normal":
        isTrain = True
        print("normal train")
    else:
        isAnomaly = True
        #read txt to know which video is train/test; the context manager closes
        #the file even if reading fails. Lines keep their trailing newline.
        with open(anomaly_train_txt, "r") as f:
            anomaly_train = f.readlines()

    i3d = custom_I3D()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    i3d = i3d.to(device)
    i3d.eval()

    # Very long videos (5~9 hours) that need the hard-coded clips-per-segment path.
    long_videos = ("Normal_Videos308_x264.mp4", "Normal_Videos307_x264.mp4", "Normal_Videos633_x264.mp4")

    for i, video_path in enumerate(video_paths):
        print(f"[{i+1}/{len(video_paths)}] {os.path.basename(video_path)}", end="\t")

        #====== if the video is anomaly, check whether it is train data ======
        if isAnomaly:
            isTrain = isTrainAnomaly(video_path, anomaly_train) == True
            print("anomaly train" if isTrain else "anomaly test", end="\t")

        # Test data only exists for 10-crop versions 0~9.
        if not isTrain and (category == "ori" or category == "random-crop" or randCrop_version > 9):
            print("is test data, skip")
            continue

        #================ get save path ================
        save_path, spatial_save_path = get_save_path(video_path, save_dir, isTrain, category, randCrop_version)

        #================ check whether exist ================
        if overwrite == False and os.path.exists(save_path) and os.path.exists(spatial_save_path):
            print(save_path, "existed")
            continue

        #======== read video & extract feature =============
        if any(name in video_path for name in long_videos):
            #hard-coded nclip per segment; returns (c,nsegment,h,w) and (c,nsegment)
            spatial_fea, pooling_fea = hardcode_extract(video_path, i3d, category, randCrop_version)
            label, first_end = "segments", "\t"
        elif isTrain:
            #clip features: (nclip,c,h,w) and (nclip,c)
            clips_spatial_fea, clips_pooling_fea = extract_feature(video_path, i3d, category, randCrop_version)
            #seperate to 32 segments: (c,nsegment,h,w) and (c,nsegment)
            spatial_fea, pooling_fea = to32_segments(clips_spatial_fea, clips_pooling_fea)
            label, first_end = "segments", "\t"
        else:
            #test data keeps every clip
            clips_spatial_fea, clips_pooling_fea = extract_feature(video_path, i3d, category, randCrop_version)
            spatial_fea = clips_spatial_fea.permute(1,0,2,3) #convert (nclip,c,h,w) to (c,nclip,h,w)
            pooling_fea = clips_pooling_fea.permute(1,0)     #convert (nclip,c) to (c,nclip)
            label, first_end = "clips", "\t\t"

        print(f"{label}_spatial_fea:", spatial_fea.shape, end=first_end)
        print(f"{label}_pooling_fea:", pooling_fea.shape, end="\t")
        print("spatial save path:", spatial_save_path, end="\t")
        print("save path:", save_path)
        #save feature extracted to save_path
        torch.save(spatial_fea, spatial_save_path)
        torch.save(pooling_fea, save_path)


---
# 3. Run

In [None]:
# Make sure the whole output tree exists before any feature is written.
createFolder("./feature_extracted/")

./feature_extracted/ ready
./feature_extracted/train ready
./feature_extracted/train/ori ready
./feature_extracted/train/ori_spatial ready
./feature_extracted/train/10-crop ready
./feature_extracted/train/10-crop_spatial ready
./feature_extracted/train/random-crop ready
./feature_extracted/train/random-crop_spatial ready
./feature_extracted/test ready
./feature_extracted/test/10-crop ready
./feature_extracted/test/10-crop_spatial ready


In [None]:
# One directory per anomaly class, grouped by the dataset part that contains it.
anomaly_dir = [
    # Part 1
    "./dataset/Anomaly-Videos-Part-1/Abuse/",
    "./dataset/Anomaly-Videos-Part-1/Arrest/",
    "./dataset/Anomaly-Videos-Part-1/Arson/",
    "./dataset/Anomaly-Videos-Part-1/Assault/",
    # Part 2
    "./dataset/Anomaly-Videos-Part-2/Burglary/",
    "./dataset/Anomaly-Videos-Part-2/Explosion/",
    "./dataset/Anomaly-Videos-Part-2/Fighting/",
    # Part 3
    "./dataset/Anomaly-Videos-Part-3/RoadAccidents/",
    "./dataset/Anomaly-Videos-Part-3/Robbery/",
    "./dataset/Anomaly-Videos-Part-3/Shooting/",
    # Part 4
    "./dataset/Anomaly-Videos-Part-4/Shoplifting/",
    "./dataset/Anomaly-Videos-Part-4/Stealing/",
    "./dataset/Anomaly-Videos-Part-4/Vandalism/",
]

# Normal (non-anomalous) videos used for training and for testing.
train_normal_dir = [
    "./dataset/Training-Normal-Videos-Part-1/",
    "./dataset/Training-Normal-Videos-Part-2/",
]
test_normal_dir = [
    "./dataset/Testing_Normal_Videos_Anomaly/",
]

----
## 10-crop feature extraction (for train & test data)

In [None]:
# 10-crop features for the anomaly videos (train and test):
# one run() per class directory and per version 0~9.
for class_dir in anomaly_dir:
    for crop_version in range(10):
        run(
            video_dir=class_dir,
            save_dir="./feature_extracted/",
            anomaly_train_txt="./dataset/Anomaly_Train.txt",
            overwrite=False,
            category="10-crop",
            randCrop_version=crop_version,
        )

In [None]:
# 10-crop features for the normal training videos, versions 0~9.
for normal_dir in train_normal_dir:
    for crop_version in range(10):
        run(
            video_dir=normal_dir,
            save_dir="./feature_extracted/",
            anomaly_train_txt="Train_Normal",
            overwrite=False,
            category="10-crop",
            randCrop_version=crop_version,
        )

In [None]:
# 10-crop features for the normal test videos, versions 0~9.
for normal_dir in test_normal_dir:
    for crop_version in range(10):
        run(
            video_dir=normal_dir,
            save_dir="./feature_extracted/",
            anomaly_train_txt="Test_Normal",
            overwrite=False,
            category="10-crop",
            randCrop_version=crop_version,
        )

---
## Original size feature extraction (for train data only)

In [None]:
# Original-size features for the anomaly videos; run() skips the test videos
# for the "ori" category.
for class_dir in anomaly_dir:
    run(
        video_dir=class_dir,
        save_dir="./feature_extracted/",
        anomaly_train_txt="./dataset/Anomaly_Train.txt",
        overwrite=False,
        category="ori",
        randCrop_version=0,
    )

In [None]:
# Original-size features for the normal training videos.
for normal_dir in train_normal_dir:
    run(
        video_dir=normal_dir,
        save_dir="./feature_extracted/",
        anomaly_train_txt="Train_Normal",
        overwrite=False,
        category="ori",
        randCrop_version=0,
    )

---
## random-crop feature extraction (for train data only)

In [None]:
# Random-crop features for the anomaly videos, 7 versions (0~6); run() skips
# the test videos for the "random-crop" category.
for class_dir in anomaly_dir:
    for crop_version in range(7):
        run(
            video_dir=class_dir,
            save_dir="./feature_extracted/",
            anomaly_train_txt="./dataset/Anomaly_Train.txt",
            overwrite=False,
            category="random-crop",
            randCrop_version=crop_version,
        )

In [None]:
# Random-crop features for the normal training videos, 7 versions (0~6).
for normal_dir in train_normal_dir:
    for crop_version in range(7):
        run(
            video_dir=normal_dir,
            save_dir="./feature_extracted/",
            anomaly_train_txt="Train_Normal",
            overwrite=False,
            category="random-crop",
            randCrop_version=crop_version,
        )

---
# 4. Check whether any feature files are missing

In [None]:
def checkFile(save_dir="./feature_extracted/train/10-crop/", anomaly_train_txt="./dataset/Anomaly_Train.txt", maxVersion=0):
    """Report which expected feature files are missing from ``save_dir``.

    For every video listed in ``anomaly_train_txt`` and every version in
    ``0..maxVersion`` a file named ``<video stem>_version<version>.<ext>`` is
    expected in ``save_dir``. The required and saved file counts are printed,
    followed by one "<name> is missing" line per absent file.

    Args:
        save_dir: directory that holds the saved feature files.
        anomaly_train_txt: text file with one '<class>/<video file>' entry per
            line, e.g. "Vandalism/Vandalism027_x264.mp4". Blank lines are ignored.
        maxVersion: highest version index expected for every video.
    """
    #================ get all video name in the split file ================
    train_lists = []
    with open(anomaly_train_txt, "r") as f:
        for line in f:
            entry = line.strip()
            if not entry:  # tolerate blank / trailing empty lines
                continue
            video_file = entry.rsplit("/", 1)[-1]                #"Vandalism027_x264.mp4"
            train_lists.append(os.path.splitext(video_file)[0])  #"Vandalism027_x264"
    print(f"require {len(train_lists)*(maxVersion+1)} files ({len(train_lists)}*(maxVersion+1))")

    #================ get all saved .pt file ==============
    saved_names = [os.path.splitext(save_file)[0] for save_file in os.listdir(save_dir)]
    print(f"saved {len(saved_names)} files")

    #================ check which file is missing ====================
    saved_lookup = set(saved_names)  # O(1) membership tests
    for video_name in train_lists:
        for version in range(maxVersion+1):
            # f-string formatting: concatenating str + int raised TypeError before.
            filename = f"{video_name}_version{version}" #Vandalism027_x264_version0
            if filename not in saved_lookup:
                print(filename, "is missing")

In [None]:
# Check the train features (anomaly & normal): versions 0~9 must exist for every listed video.
checkFile(
    save_dir="./feature_extracted/train/10-crop/",
    anomaly_train_txt="./dataset/Anomaly_Train.txt",
    maxVersion=9,
)

require 1610
saved  1610


In [None]:
# Check the test features (anomaly & normal): versions 0~9 must exist for every listed video.
checkFile(
    save_dir="./feature_extracted/test/10-crop/",
    anomaly_train_txt="./dataset/Anomaly_Test.txt",
    maxVersion=9,
)

require 290
saved  290


---
# Convert the ground truth of all test videos into a single 1D vector

In [None]:
def write_1D_gt(test_dir, groundtruth_txt, save_path="./groundtruths.json"):
    """Concatenate the frame-level ground truth of all annotated videos into one 1D vector.

    For each annotation line the saved feature ``<video>_version0.pt`` is loaded
    from ``test_dir``; the video contributes ``nclip * 16`` frames, where
    ``nclip`` is the size of the feature's last dimension. Frames inside an
    annotated interval are set to 1, all others to 0. The resulting list is
    written to ``save_path`` as JSON.

    Args:
        test_dir: directory with the saved test features.
        groundtruth_txt: annotation file with whitespace-separated fields
            ``<video file> <class> <start1> <end1> <start2> <end2>``; -1 marks
            an unused interval. Blank lines are ignored.
        save_path: destination JSON file.

    Raises:
        ValueError: if a non-blank line has fewer than 6 fields.
    """
    per_video_gt = []
    next_idx = 0
    with open(groundtruth_txt, "r") as f:
        for line in f:
            #eg. line = Abuse028_x264.mp4  Abuse  165  240  -1  -1
            annotation = line.split() #eg. annotation = ['Abuse028_x264.mp4', 'Abuse', '165', '240', '-1', '-1']
            if not annotation:
                continue
            if len(annotation) < 6:
                raise ValueError(f"malformed annotation line: {line!r}")
            video_name = os.path.splitext(annotation[0])[0] #Abuse028_x264
            print(video_name, end='\t')

            #load .pt to know how many frames that current video have
            path = os.path.join(test_dir, video_name+"_version0.pt")
            fea_clip = torch.load(path, map_location=torch.device('cpu'))

            #the clip axis is the last one, e.g. (c,nclip); every clip covers 16 frames
            nframe = fea_clip.shape[-1] * 16
            print(" fea_clip.shape", fea_clip.shape, end='\t')
            print("nframe", nframe)

            # One zero vector per video: an interval that runs past the counted
            # frames is clipped here instead of leaking into the next video.
            video_gt = np.zeros(nframe, dtype=int)
            for start, end in ((annotation[2], annotation[3]), (annotation[4], annotation[5])):
                start, end = int(start), int(end)
                if start != -1 and end != -1:
                    # NOTE(review): same slice as before, i.e. indices start-1 .. end-2;
                    # confirm whether the end frame itself should be included.
                    video_gt[max(start-1, 0):max(end-1, 0)] = 1
            per_video_gt.append(video_gt)

            next_idx += nframe

    groundtruths = np.concatenate(per_video_gt) if per_video_gt else np.zeros(0, dtype=int)
    print("Total nframes from all test video:", next_idx)
    print("groundtruths.shape", groundtruths.shape)
    #write list to json file
    with open(save_path, "w") as fp:
        json.dump(groundtruths.tolist(), fp)
        print("saved to", save_path)


In [None]:
# Build the frame-level ground-truth vector from the 10-crop test features.
write_1D_gt(
    test_dir="./feature_extracted/test/10-crop",
    groundtruth_txt="./Temporal_Anomaly_Annotation_for_Testing_Videos.txt",
    save_path="./groundtruths.json",
)

Abuse028_x264	 fea_clip.shape torch.Size([10, 1024, 88])	nframe 1408
Abuse030_x264	 fea_clip.shape torch.Size([10, 1024, 96])	nframe 1536
Arrest001_x264	 fea_clip.shape torch.Size([10, 1024, 148])	nframe 2368
Arrest007_x264	 fea_clip.shape torch.Size([10, 1024, 196])	nframe 3136
Arrest024_x264	 fea_clip.shape torch.Size([10, 1024, 226])	nframe 3616
Arrest030_x264	 fea_clip.shape torch.Size([10, 1024, 540])	nframe 8640
Arrest039_x264	 fea_clip.shape torch.Size([10, 1024, 989])	nframe 15824
Arson007_x264	 fea_clip.shape torch.Size([10, 1024, 390])	nframe 6240
Arson009_x264	 fea_clip.shape torch.Size([10, 1024, 46])	nframe 736
Arson010_x264	 fea_clip.shape torch.Size([10, 1024, 197])	nframe 3152
Arson011_x264	 fea_clip.shape torch.Size([10, 1024, 79])	nframe 1264
Arson016_x264	 fea_clip.shape torch.Size([10, 1024, 112])	nframe 1792
Arson018_x264	 fea_clip.shape torch.Size([10, 1024, 52])	nframe 832
Arson022_x264	 fea_clip.shape torch.Size([10, 1024, 540])	nframe 8640
Arson035_x264	 fea_cl

In [None]:
# Reload the saved vector to verify its length.
with open('./groundtruths.json', 'rb') as gt_file:
    groundtruths = json.load(gt_file)
print("groundtruths length:", len(groundtruths))

groundtruths length: 1113424


---
# demo

In [None]:
video_dir = "./dataset/Abuse/"

# Original size (train data only; test videos are skipped by run()).
run(
    video_dir=video_dir,
    save_dir="./feature_extracted/",
    anomaly_train_txt="./dataset/Anomaly_Train.txt",
    overwrite=True,
    category="ori",
    randCrop_version=0,
)

# 10-crop (train and test data), versions 0~9.
for crop_version in range(10):
    run(
        video_dir=video_dir,
        save_dir="./feature_extracted/",
        anomaly_train_txt="./dataset/Anomaly_Train.txt",
        overwrite=True,
        category="10-crop",
        randCrop_version=crop_version,
    )

# Random-crop (train data only), 7 versions (0~6).
for crop_version in range(7):
    run(
        video_dir=video_dir,
        save_dir="./feature_extracted/",
        anomaly_train_txt="./dataset/Anomaly_Train.txt",
        overwrite=True,
        category="random-crop",
        randCrop_version=crop_version,
    )

[1/2] Abuse002_x264.mp4	anomaly train	

  # This is added back by InteractiveShellApp.init_path()


segments_spatial_fea: torch.Size([1024, 32, 8, 10])	segments_pooling_fea: torch.Size([1024, 32])	spatial save path: ./feature_extracted/train/ori_spatial/Abuse002_x264.pt	save path: ./feature_extracted/train/ori/Abuse002_x264.pt
[2/2] Abuse028_x264.mp4	anomaly test	is test data, skip
[1/2] Abuse002_x264.mp4	anomaly train	segments_spatial_fea: torch.Size([1024, 32, 7, 9])	segments_pooling_fea: torch.Size([1024, 32])	spatial save path: ./feature_extracted/train/10-crop_spatial/Abuse002_x264_version0.pt	save path: ./feature_extracted/train/10-crop/Abuse002_x264_version0.pt
[2/2] Abuse028_x264.mp4	anomaly test	clips_spatial_fea: torch.Size([1024, 88, 7, 9])		clips_pooling_fea: torch.Size([1024, 88])	spatial save path: ./feature_extracted/test/10-crop_spatial/Abuse028_x264_version0.pt	save path: ./feature_extracted/test/10-crop/Abuse028_x264_version0.pt
[1/2] Abuse002_x264.mp4	anomaly train	segments_spatial_fea: torch.Size([1024, 32, 7, 9])	segments_pooling_fea: torch.Size([1024, 32])	spati