In [None]:
# Mount Google Drive at /content/gdrive so the project folder and the weight
# files referenced by relative paths later in the notebook are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Work from the project folder on the mounted Drive; the relative paths used
# below (e.g. ./rgb_imagenet.pt, ./saved_model/...) resolve from here.
# Explicit %cd: the bare `cd` form relies on automagic, which is only applied
# to single-line cells and stops working if a variable named `cd` exists.
%cd "/content/gdrive/My Drive/fyp1"

/content/gdrive/My Drive/fyp1


In [None]:
# Install Gradio into the running kernel's environment (%pip, not !pip) and pin
# it to the version this notebook was last executed with (see the output below).
%pip install -q gradio==3.28.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.28.1-py3-none-any.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting gradio-client>=0.1.3
  Downloading gradio_client-0.1.4-py3-none-any.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/286.7 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn
  Downloading uvicorn-0.22.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi
  Downloading fastapi-0.95.1-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image

In [None]:
class MaxPool3dSamePadding(nn.MaxPool3d):
    """3D max pooling that pads its input on the fly ('same'-style padding).

    The amount of padding is derived from the actual input size at call time,
    so the output length along each axis is ceil(input_length / stride).
    `kernel_size` and `stride` must be given as 3-element sequences because
    they are indexed per axis.
    """

    def compute_pad(self, dim, s):
        """Return the total padding needed along axis `dim` for input length `s`.

        Args:
            dim: axis index into kernel_size/stride (0 = t, 1 = h, 2 = w).
            s: size of the input along that axis.
        """
        remainder = s % self.stride[dim]
        if remainder == 0:
            return max(self.kernel_size[dim] - self.stride[dim], 0)
        return max(self.kernel_size[dim] - remainder, 0)

    def forward(self, x):
        """Pad `x` (batch, channel, t, h, w) and apply the parent max pooling."""
        (batch, channel, t, h, w) = x.size()
        pad_t = self.compute_pad(0, t)
        pad_h = self.compute_pad(1, h)
        pad_w = self.compute_pad(2, w)

        # Split each total padding into front/back; the extra element (odd
        # totals) goes to the back.
        pad_t_f = pad_t // 2
        pad_t_b = pad_t - pad_t_f
        pad_h_f = pad_h // 2
        pad_h_b = pad_h - pad_h_f
        pad_w_f = pad_w // 2
        pad_w_b = pad_w - pad_w_f

        # F.pad expects the pads for the last dimension first: (w, h, t).
        pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
        x = F.pad(x, pad)

        return super().forward(x)
    
class Unit3D(nn.Module):
    """Conv3d -> optional BatchNorm3d -> optional activation, with dynamic 'same' padding.

    The convolution itself is always created with padding=0; the input is
    padded in `forward` based on its actual size so that the output length
    along each axis is ceil(input_length / stride).
    """

    def __init__(self, in_channels,
                 output_channels,
                 kernel_shape=(1, 1, 1),
                 stride=(1, 1, 1),
                 padding=0,
                 activation_fn=F.relu,
                 use_batch_norm=True,
                 use_bias=False,
                 name='unit_3d'):
        """Initializes Unit3D module.

        Args:
            in_channels: number of input channels.
            output_channels: number of output channels of the convolution.
            kernel_shape: 3-element (t, h, w) kernel size.
            stride: 3-element (t, h, w) stride.
            padding: stored on the module but not used by the convolution;
                padding is always computed from the input size in `forward`.
            activation_fn: callable applied to the output, or None to skip.
            use_batch_norm: whether to apply BatchNorm3d after the convolution.
            use_bias: whether the convolution has a bias term.
            name: label stored on the module.
        """
        super(Unit3D, self).__init__()

        self._output_channels = output_channels
        self._kernel_shape = kernel_shape
        self._stride = stride
        self._use_batch_norm = use_batch_norm
        self._activation_fn = activation_fn
        self._use_bias = use_bias
        self.name = name
        self.padding = padding

        self.conv3d = nn.Conv3d(in_channels=in_channels,
                                out_channels=self._output_channels,
                                kernel_size=self._kernel_shape,
                                stride=self._stride,
                                padding=0,  # always 0: padding is applied dynamically in forward()
                                bias=self._use_bias)

        if self._use_batch_norm:
            self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)

    def compute_pad(self, dim, s):
        """Return the total padding needed along axis `dim` for input length `s`."""
        remainder = s % self._stride[dim]
        if remainder == 0:
            return max(self._kernel_shape[dim] - self._stride[dim], 0)
        return max(self._kernel_shape[dim] - remainder, 0)

    def forward(self, x):
        """Pad `x` (batch, channel, t, h, w), then apply conv, batch norm and activation."""
        (batch, channel, t, h, w) = x.size()
        pad_t = self.compute_pad(0, t)
        pad_h = self.compute_pad(1, h)
        pad_w = self.compute_pad(2, w)

        # Split each total padding into front/back; the extra element (odd
        # totals) goes to the back.
        pad_t_f = pad_t // 2
        pad_t_b = pad_t - pad_t_f
        pad_h_f = pad_h // 2
        pad_h_b = pad_h - pad_h_f
        pad_w_f = pad_w // 2
        pad_w_b = pad_w - pad_w_f

        # F.pad expects the pads for the last dimension first: (w, h, t).
        pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
        x = F.pad(x, pad)

        x = self.conv3d(x)
        if self._use_batch_norm:
            x = self.bn(x)
        if self._activation_fn is not None:
            x = self._activation_fn(x)

        return x

class InceptionModule(nn.Module):
    """Four parallel branches whose outputs are concatenated along the channel axis.

    `out_channels` is indexed as:
        [0] branch 0 1x1 conv
        [1] branch 1 1x1 conv, [2] branch 1 3x3 conv
        [3] branch 2 1x1 conv, [4] branch 2 3x3 conv
        [5] branch 3 1x1 conv (applied after a 3x3x3 stride-1 max pool)
    The output therefore has out_channels[0] + [2] + [4] + [5] channels.
    """

    def __init__(self, in_channels, out_channels, name):
        super().__init__()

        def scoped(suffix):
            # Layer labels are the module name followed by a branch-specific suffix.
            return name + suffix

        # Branch 0: a single pointwise convolution.
        self.b0 = Unit3D(in_channels=in_channels,
                         output_channels=out_channels[0],
                         kernel_shape=[1, 1, 1],
                         padding=0,
                         name=scoped('/Branch_0/Conv3d_0a_1x1'))

        # Branch 1: pointwise reduction followed by a 3x3x3 convolution.
        self.b1a = Unit3D(in_channels=in_channels,
                          output_channels=out_channels[1],
                          kernel_shape=[1, 1, 1],
                          padding=0,
                          name=scoped('/Branch_1/Conv3d_0a_1x1'))
        self.b1b = Unit3D(in_channels=out_channels[1],
                          output_channels=out_channels[2],
                          kernel_shape=[3, 3, 3],
                          name=scoped('/Branch_1/Conv3d_0b_3x3'))

        # Branch 2: same layout as branch 1 with its own channel widths.
        self.b2a = Unit3D(in_channels=in_channels,
                          output_channels=out_channels[3],
                          kernel_shape=[1, 1, 1],
                          padding=0,
                          name=scoped('/Branch_2/Conv3d_0a_1x1'))
        self.b2b = Unit3D(in_channels=out_channels[3],
                          output_channels=out_channels[4],
                          kernel_shape=[3, 3, 3],
                          name=scoped('/Branch_2/Conv3d_0b_3x3'))

        # Branch 3: size-preserving max pool followed by a pointwise convolution.
        self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3],
                                        stride=(1, 1, 1),
                                        padding=0)
        self.b3b = Unit3D(in_channels=in_channels,
                          output_channels=out_channels[5],
                          kernel_shape=[1, 1, 1],
                          padding=0,
                          name=scoped('/Branch_3/Conv3d_0b_1x1'))

        self.name = name

    def forward(self, x):
        """Run every branch on `x` and concatenate the results on dim 1."""
        branch_outputs = [
            self.b0(x),
            self.b1b(self.b1a(x)),
            self.b2b(self.b2a(x)),
            self.b3b(self.b3a(x)),
        ]
        return torch.cat(branch_outputs, dim=1)

class InceptionI3d(nn.Module):
    """Inception-v1 I3D architecture.
    The model is introduced in:
        Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
        Joao Carreira, Andrew Zisserman
        https://arxiv.org/pdf/1705.07750v1.pdf.
    See also the Inception architecture, introduced in:
        Going deeper with convolutions
        Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
        Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
        http://arxiv.org/pdf/1409.4842v1.pdf.
    """

    # Endpoints of the model in order. Layers are built up to (and including)
    # the designated `final_endpoint`.
    VALID_ENDPOINTS = (
        'Conv3d_1a_7x7',
        'MaxPool3d_2a_3x3',
        'Conv3d_2b_1x1',
        'Conv3d_2c_3x3',
        'MaxPool3d_3a_3x3',
        'Mixed_3b',
        'Mixed_3c',
        'MaxPool3d_4a_3x3',
        'Mixed_4b',
        'Mixed_4c',
        'Mixed_4d',
        'Mixed_4e',
        'Mixed_4f',
        'MaxPool3d_5a_2x2',
        'Mixed_5b',
        'Mixed_5c',
        'Logits',
        'Predictions',
    )

    def __init__(self, num_classes=400, spatial_squeeze=True,
                 final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5):
        """Initializes I3D model instance.
        Args:
          num_classes: The number of outputs in the logit layer (default 400, which
              matches the Kinetics dataset).
          spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
              before returning (default True).
          final_endpoint: The last endpoint for the model to be built up to.
              Must be one of InceptionI3d.VALID_ENDPOINTS (default 'Logits').
              For any endpoint before 'Logits' the pooling/dropout/logits
              layers are not created and `self.logits` stays None.
          name: A string (optional). Prefix used for the layer labels.
          in_channels: Number of channels of the input clip.
          dropout_keep_prob: Value handed to nn.Dropout.
              NOTE(review): nn.Dropout interprets its argument as the
              probability of zeroing an element, so this is effectively a
              drop probability despite its name; both readings coincide at
              the default 0.5.
        Raises:
          ValueError: if `final_endpoint` is not recognized.
        """

        if final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError('Unknown final endpoint %s' % final_endpoint)

        super(InceptionI3d, self).__init__()
        self._num_classes = num_classes
        self._spatial_squeeze = spatial_squeeze
        self._final_endpoint = final_endpoint
        self.logits = None

        self.end_points = {}
        for end_point, make_layer in self._backbone_spec(in_channels, name):
            self.end_points[end_point] = make_layer()
            if self._final_endpoint == end_point:
                # Truncated model: register what has been built so the layers
                # are real submodules (parameters, .to(), state_dict and the
                # `_modules` lookups in forward/extract_features all need this).
                self.build()
                return

        # Classification head ('Logits' / 'Predictions').
        self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7],
                                     stride=(1, 1, 1))
        self.dropout = nn.Dropout(dropout_keep_prob)
        self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes,
                             kernel_shape=[1, 1, 1],
                             padding=0,
                             activation_fn=None,
                             use_batch_norm=False,
                             use_bias=True,
                             name='logits')

        self.build()

    @staticmethod
    def _backbone_spec(in_channels, name):
        """Return the ordered (end_point, factory) pairs of the backbone.

        Factories are lazy so that layers after `final_endpoint` are never
        instantiated.
        """
        def same_pool(kernel_size, stride):
            return lambda: MaxPool3dSamePadding(kernel_size=kernel_size, stride=stride, padding=0)

        def mixed(end_point, in_ch, out_chs):
            return lambda: InceptionModule(in_ch, out_chs, name+end_point)

        return [
            ('Conv3d_1a_7x7',
             lambda: Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7],
                            stride=(2, 2, 2), padding=(3, 3, 3), name=name+'Conv3d_1a_7x7')),
            ('MaxPool3d_2a_3x3', same_pool([1, 3, 3], (1, 2, 2))),
            ('Conv3d_2b_1x1',
             lambda: Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0,
                            name=name+'Conv3d_2b_1x1')),
            ('Conv3d_2c_3x3',
             lambda: Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1,
                            name=name+'Conv3d_2c_3x3')),
            ('MaxPool3d_3a_3x3', same_pool([1, 3, 3], (1, 2, 2))),
            ('Mixed_3b', mixed('Mixed_3b', 192, [64, 96, 128, 16, 32, 32])),
            ('Mixed_3c', mixed('Mixed_3c', 256, [128, 128, 192, 32, 96, 64])),
            ('MaxPool3d_4a_3x3', same_pool([3, 3, 3], (2, 2, 2))),
            ('Mixed_4b', mixed('Mixed_4b', 128+192+96+64, [192, 96, 208, 16, 48, 64])),
            ('Mixed_4c', mixed('Mixed_4c', 192+208+48+64, [160, 112, 224, 24, 64, 64])),
            ('Mixed_4d', mixed('Mixed_4d', 160+224+64+64, [128, 128, 256, 24, 64, 64])),
            ('Mixed_4e', mixed('Mixed_4e', 128+256+64+64, [112, 144, 288, 32, 64, 64])),
            ('Mixed_4f', mixed('Mixed_4f', 112+288+64+64, [256, 160, 320, 32, 128, 128])),
            ('MaxPool3d_5a_2x2', same_pool([2, 2, 2], (2, 2, 2))),
            ('Mixed_5b', mixed('Mixed_5b', 256+320+128+128, [256, 160, 320, 32, 128, 128])),
            ('Mixed_5c', mixed('Mixed_5c', 256+320+128+128, [384, 192, 384, 48, 128, 128])),
        ]

    def replace_logits(self, num_classes):
        """Swap the logits layer for a freshly initialised one with `num_classes` outputs."""
        self._num_classes = num_classes
        self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes,
                             kernel_shape=[1, 1, 1],
                             padding=0,
                             activation_fn=None,
                             use_batch_norm=False,
                             use_bias=True,
                             name='logits')

    def build(self):
        """Register every layer in `self.end_points` as a submodule under its endpoint name."""
        for k in self.end_points.keys():
            self.add_module(k, self.end_points[k])

    def forward(self, x):
        """Run the backbone and, if present, the classification head.

        Args:
            x: input clip tensor, (batch, channel, t, h, w).
        Returns:
            The logits. With `spatial_squeeze` the two trailing (spatial)
            axes are squeezed, giving (batch, num_classes, time) when they
            have size 1; otherwise the un-squeezed logits are returned.
            If the model was truncated before 'Logits' (`self.logits` is
            None) the feature map of the last built endpoint is returned.
        """
        for end_point in self.VALID_ENDPOINTS:
            if end_point in self.end_points:
                x = self._modules[end_point](x)  # use _modules to work with dataparallel

        if self.logits is None:
            return x

        x = self.logits(self.dropout(self.avg_pool(x)))
        if self._spatial_squeeze:
            x = x.squeeze(3).squeeze(3)
        return x

    def extract_features(self, x, myEndPoint):
        """Return spatial and globally pooled features taken at `myEndPoint`.

        Args:
            x: input clip, (b, c, nframe, h, w). The original author's example
               shapes are (1,3,16,240,320) for a full frame and
               (1,3,16,210,280) for a crop.
            myEndPoint: name of the endpoint after which to stop. If it never
               matches, every built endpoint is applied.
        Returns:
            (spatial_fea, pooling_fea):
              spatial_fea - L2 norm of the feature map over the time axis,
                  with all size-1 axes squeezed; e.g. (1024, 8, 10) for the
                  full-frame example and (1024, 7, 9) for the crop example.
              pooling_fea - spatial_fea averaged over h and w, with all
                  size-1 axes squeezed; e.g. (1024,).
            Note that torch.squeeze removes *every* size-1 axis, so the batch
            axis is only dropped when the batch size is 1.
        """
        # I3D feature extraction up to the requested endpoint.
        for end_point in self.VALID_ENDPOINTS:
            if end_point in self.end_points:
                x = self._modules[end_point](x)
            if end_point == myEndPoint:
                break
        # x is (b, c, t, h, w) here, e.g. (1, 1024, 2, 8, 10).

        # L2 norm over the time axis; the axis is removed -> (b, c, h, w).
        x = torch.norm(x, dim=2)
        spatial_fea = torch.squeeze(x)

        # Global average pooling over h and w -> (b, c, 1, 1).
        x = F.adaptive_avg_pool2d(x, (1, 1))
        # Squeeze batch (when it is 1) and the pooled spatial axes -> (c,).
        pooling_fea = torch.squeeze(x)

        return spatial_fea, pooling_fea

def custom_I3D(mode='rgb', load_model='./rgb_imagenet.pt'):
    """Build a 400-class InceptionI3d and load its weights from `load_model`.

    Args:
        mode: 'flow' builds a 2-channel-input network; any other value builds
            a 3-channel-input network.
        load_model: path to the checkpoint holding the state dict.
    Returns:
        The InceptionI3d instance with the loaded weights (not moved to any
        device and not switched to eval mode here).
    """
    in_channels = 2 if mode == 'flow' else 3
    i3d = InceptionI3d(400, in_channels=in_channels)
    # Load onto the CPU so a checkpoint that was saved from a GPU can also be
    # restored on a CPU-only machine; load_state_dict copies the values into
    # the (CPU) parameters either way.
    i3d.load_state_dict(torch.load(load_model, map_location='cpu'))
    return i3d
def feature_extract(i3d, x, myEndPoint):
    """Run `i3d.extract_features` on `x` in eval mode without gradients.

    The model and the input are first moved to CUDA when it is available,
    otherwise to the CPU.

    Args:
        i3d: model exposing `extract_features(x, myEndPoint)`.
        x: input clip; per the original notes (b,c,nframe,h,w), e.g.
            (1,3,16,240,320) for a full frame or (1,3,16,210,280) for a crop.
        myEndPoint: endpoint name forwarded to `extract_features`.
    Returns:
        (spatial_fea, pooling_fea) exactly as produced by `extract_features`;
        per the original notes e.g. (1024,8,10) / (1024,) for a full frame
        and (1024,7,9) / (1024,) for a crop.
    """
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Place both the network and the clip on the selected device.
    i3d = i3d.to(target)
    x = x.to(target)

    # Inference only: evaluation mode and no autograd bookkeeping.
    i3d.eval()
    with torch.no_grad():
        features = i3d.extract_features(x, myEndPoint)

    spatial_fea, pooling_fea = features
    return spatial_fea, pooling_fea
# Per-frame preprocessing applied to each PIL frame before it is written into a clip tensor:
# ToTensor turns the image into a channel-first float tensor, and Normalize then
# applies (value - 0.5) / 0.5 to each of the three channels.
# The Resize step is disabled, so frames keep the size they are cropped to.
transform = transforms.Compose([
    #transforms.Resize((240,320)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


# Contrastive Unet model

In [None]:
class Contrastive_head(nn.Module):
    """Multi-center loss over learnable per-class feature centers.

    Two sets of `num_center` centers are learned (index 0 and index 1 of
    `Centers`). The loss is
        lambda_1 * (distance of features to their nearest center)
      + beta     * (hinge penalty on centers that are closer than margin m).
    """

    def __init__(self, num_center=2, in_channels=32, lambda_1=0.0001, beta=0.0001, m=1.25):
        super().__init__()

        self.num_class = 2
        self.num_center = num_center  # centers per class
        self.in_channels = in_channels
        self.lambda_1 = lambda_1
        self.beta = beta
        self.m = torch.tensor(m)
        # Learnable centers, laid out as (nclass, ncenter, c).
        self.Centers = nn.Parameter(torch.randn(2, num_center, self.in_channels), requires_grad=True)

    def forward(self, x, device):
        """Return the multi-center loss for `x`.

        `x` is (b*3, c, k) and is read as three equal consecutive groups:
        top-k segments of normal videos, last-k segments of anomaly videos
        (both compared with Centers[0]) and top-k segments of anomaly videos
        (compared with Centers[1]).
        """
        first_cut = int(x.shape[0]/3)
        second_cut = int(x.shape[0]/3*2)

        # Intra-class term: per-sample distances of the three groups, added
        # element-wise and averaged.
        group_a = self.compute_intra_dist(x[:first_cut], self.Centers[0])
        group_b = self.compute_intra_dist(x[first_cut:second_cut], self.Centers[0])
        group_c = self.compute_intra_dist(x[second_cut:], self.Centers[1])
        intra_distance = torch.mean(group_a + group_b + group_c)

        # Inter-center term.
        inter_distance = self.compute_inter_dist(self.Centers[0], self.Centers[1], self.m, device)

        # Weighted combination of both terms.
        return self.lambda_1 * intra_distance + self.beta * inter_distance

    def compute_intra_dist(self, x, centers):
        """Mean (over segments) distance of each sample to its nearest center.

        Args:
            x: (b, c, nsegment) features.
            centers: (ncenter, c) centers.
        Returns:
            Tensor of shape (b,).
        """
        n_batch, n_segment = x.shape[0], x.shape[2]
        # (ncenter, c) -> (b, nsegment, ncenter, c) -> (ncenter, b, c, nsegment)
        tiled = centers.repeat(n_batch, n_segment, 1, 1).permute(2, 0, 3, 1)
        # Euclidean distance over the channel axis -> (ncenter, b, nsegment)
        distances = (tiled - x).square().sum(dim=2).sqrt()
        # Keep the closest center per (sample, segment) -> (b, nsegment)
        nearest, _ = torch.min(distances, dim=0)
        # Average over segments -> (b,)
        return torch.mean(nearest, dim=1)

    def compute_inter_dist(self, norm_centers, anom_centers, m, device):
        """Hinge penalty max(0, m - distance) summed over all ordered center pairs.

        Args:
            norm_centers: (ncenter, c) centers of the first class.
            anom_centers: (ncenter, c) centers of the second class.
            m: 0-dim tensor holding the margin.
            device: device the margin and zero tensors are moved to.
        Returns:
            The summed penalty divided by total_ctr * (total_ctr - 1).
        """
        n_total = norm_centers.shape[0] + anom_centers.shape[0]
        # Stack both classes and add a batch axis for cdist -> (1, n_total, c)
        stacked = torch.cat((norm_centers, anom_centers), dim=0).unsqueeze(0)
        # Pairwise distances between all centers -> (n_total, n_total)
        pairwise = torch.cdist(stacked, stacked).squeeze()
        margin = m.repeat(pairwise.shape[0], pairwise.shape[1]).to(device)
        floor = torch.zeros_like(pairwise).to(device)
        hinge = torch.max(floor, margin-pairwise)
        return hinge.sum() / (n_total*(n_total-1))

# Reference: https://amaarora.github.io/2020/09/13/unet.html

class Block(nn.Module):
    """Residual 1-D convolution block.

    conv1 + ReLU produces the shortcut; conv2 + ReLU + conv3 produces the
    residual; the block returns ReLU(shortcut + residual). All convolutions
    use kernel size 5, stride 1 and padding 2, so the sequence length is
    preserved.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()

        self.in_ch = in_ch
        self.out_ch = out_ch

        # Shared settings of the three length-preserving convolutions.
        conv_kwargs = dict(kernel_size=5, stride=1, padding=2)
        self.conv1 = nn.Conv1d(in_ch, out_ch, **conv_kwargs)
        self.conv2 = nn.Conv1d(out_ch, out_ch, **conv_kwargs)
        self.conv3 = nn.Conv1d(out_ch, out_ch, **conv_kwargs)

    def forward(self, x):
        """Apply the block to `x` of shape (batch, in_ch, length)."""
        shortcut = F.relu(self.conv1(x))
        residual = self.conv3(F.relu(self.conv2(shortcut)))
        return F.relu(shortcut + residual)
        
class Encoder(nn.Module):
    """Stack of Blocks with 2x max pooling between consecutive blocks.

    `forward` returns the output of every block (before pooling), ordered
    from the first block to the last.
    """

    def __init__(self, in_channels, blk_channels):
        super().__init__()
        self.in_channels = in_channels
        self.blk_channels = blk_channels
        # Consecutive (input width, output width) pairs, starting from the input.
        widths = [in_channels, *blk_channels]
        self.enc_blocks = nn.ModuleList(
            [Block(c_in, c_out) for c_in, c_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x):
        last_index = len(self.enc_blocks) - 1
        features = []
        for index, block in enumerate(self.enc_blocks):
            x = block(x)
            features.append(x)
            # Halve the length before every block except after the last one.
            if index != last_index:
                x = F.max_pool1d(x, 2, 2)
        return features
    
class Decoder(nn.Module):
    """Up-sampling path of the 1-D U-Net.

    Starting from the deepest encoder output, each stage up-samples by 2 with
    a transposed convolution, concatenates the (cropped) encoder output of
    the matching level on the channel axis and applies a Block.
    """

    def __init__(self, blk_channels):
        super().__init__()
        self.channels = blk_channels[::-1]  # reverse the blk_channels
        n_stages = len(self.channels) - 1
        self.upconvs = nn.ModuleList([nn.ConvTranspose1d(self.channels[i], self.channels[i+1], kernel_size=2, stride=2)
                                      for i in range(n_stages)])
        # The block input is the concatenation of the up-sampled tensor and
        # the skip connection, both of which have channels[i+1] channels
        # (identical to channels[i]*2 when all widths are equal).
        self.dec_blocks = nn.ModuleList([Block(self.channels[i+1]*2, self.channels[i+1])
                                         for i in range(n_stages)])

    def crop(self, enc_fea, x):
        """Centre-crop `enc_fea` along the last axis to the length of `x`."""
        _, _, L = x.shape
        _, _, Le = enc_fea.shape
        start = (Le - L)//2
        return enc_fea[:, :, start:start+L]

    def forward(self, enc_out, inputs):
        """Decode the encoder outputs.

        Args:
            enc_out: list of encoder block outputs, first block first.
            inputs: the original network input; only its last-axis length is
                used, to resize the result to the input length.
        Returns:
            Tensor whose last axis has length `inputs.shape[-1]`.
        """
        enc_out = enc_out[::-1]  # deepest level first
        x = enc_out[0]
        skips = enc_out[1:]
        for i in range(len(self.channels)-1):
            upsampled = self.upconvs[i](x)
            skip = self.crop(skips[i], upsampled)
            x = torch.cat([upsampled, skip], dim=1)
            x = self.dec_blocks[i](x)
        # Pooling floors odd lengths, so resize back to the input length.
        x = F.interpolate(x, inputs.shape[-1])
        return x
    
class Head(nn.Module):
    """Per-segment scoring head built from 1x1 convolutions.

    Every layer but the last is followed by ReLU (and dropout when
    `isTraining` is True); the last layer is followed by a sigmoid.
    """

    def __init__ (self, channels):
        super().__init__()
        self.layers = nn.ModuleList([nn.Conv1d(channels[i], channels[i+1], kernel_size=1) for i in range(len(channels)-1)])
        self.dropout = nn.Dropout(p=0.7)

    def forward(self, x, isTraining):
        """Score every segment of `x`.

        Args:
            x: (batch, channels[0], nsegment) features.
            isTraining: dropout is applied to hidden activations only when
                this is True (the dropout module additionally follows the
                module's own train/eval mode).
        Returns:
            (last_fea, scores): the activations fed into the final layer and
            the sigmoid scores. When the head has a single layer (or none),
            `last_fea` is the input itself.
        """
        # Start from the input so the result is defined even without hidden layers.
        last_fea = x
        final_index = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i == final_index:
                x = torch.sigmoid(x)
            else:
                x = F.relu(x)
                if isTraining == True:
                    x = self.dropout(x)
                last_fea = x
        return last_fea, x
        
class Model(nn.Module):
    """1-D U-Net segment scorer with a multi-center contrastive regulariser.

    In inference mode `forward` returns the features before the final scoring
    layer together with the per-segment scores. In training mode it returns a
    single loss: the ranking/BCE cost from `myLossFunction` plus the
    multi-center loss of the contrastive head.
    """

    def __init__(self, in_channels=1024,
                 blk_channels  = (1024, 1024, 1024, 1024),
                 head_channels = (1024, 512, 32, 1)):
        super().__init__()
        self.in_channels  = in_channels
        self.blk_channels = blk_channels
        self.encoder = Encoder(in_channels, blk_channels)
        self.decoder = Decoder(blk_channels)
        self.head    = Head(head_channels)
        self.contrastive_head = Contrastive_head(num_center=16, in_channels=32, lambda_1=0.00224, beta=0.2304, m=1.25)

    @staticmethod
    def _gather_segments(features, idx):
        """Select segments `idx` (b, k) from `features` (c, b, nsegment) -> (c, b, k)."""
        # The same segment indices are used for every channel.
        return torch.gather(features, 2, idx.unsqueeze(0).expand(features.shape[0], -1, -1))

    def forward(self, inputs, isTraining=False, device="cpu"):
        """Score the segments of `inputs` or, in training mode, compute the loss.

        Args:
            inputs: (b*2, c, nsegment) features, e.g. (?, 1024, 32). In
                training mode the first half of the batch is treated as
                normal videos and the second half as anomaly videos.
            isTraining: when True the combined loss is returned.
            device: device the selected features are moved to before they are
                handed to the contrastive head (training mode only).
        Returns:
            Training: the combined loss.
            Otherwise: (last_fea, segment_scores) with shapes
            (b*2, c, nsegment) and (b*2, 1, nsegment).
        """
        enc_output = self.encoder(inputs)
        dec_output = self.decoder(enc_output, inputs)
        # last_fea: features just before the classification layer.
        last_fea, segment_scores = self.head(dec_output, isTraining)

        if isTraining == True:
            # ---- ranking / BCE cost and the selected segment indices, each (b, k) ----
            mil_cost, idx_topK_anom, idx_lastK_anom, idx_topK_norm = self.myLossFunction(segment_scores)

            # ---- gather the top-k / last-k segment features ----
            # The indices live on the CPU (see myLossFunction), so gather there.
            last_fea = last_fea.to("cpu").permute(1, 0, 2)  # (b*2, c, n) -> (c, b*2, n)
            n_normal = int(last_fea.shape[1]/2)
            normal_features = last_fea[:, :n_normal]    # (c, b, nsegment)
            anomaly_features = last_fea[:, n_normal:]   # (c, b, nsegment)

            total_topk_norm_feature = self._gather_segments(normal_features, idx_topK_norm)
            total_lastk_abn_feature = self._gather_segments(anomaly_features, idx_lastK_anom)
            total_topk_abn_feature = self._gather_segments(anomaly_features, idx_topK_anom)

            # Order along the batch axis: top-k normal, last-k anomaly, top-k
            # anomaly - the grouping the contrastive head expects.
            selected_feature = torch.cat(
                (total_topk_norm_feature, total_lastk_abn_feature, total_topk_abn_feature), dim=1)
            selected_feature = selected_feature.permute(1, 0, 2)  # (c, b*3, k) -> (b*3, c, k)
            selected_feature = selected_feature.to(device)

            # ---- contrastive regularisation ----
            mc_loss = self.contrastive_head(selected_feature, device)
            return mil_cost + mc_loss

        return last_fea, segment_scores

    def myLossFunction(self, segment_scores, k=3, lambda_1=0.00008, lambda_2=0.00008):
        """Top-k BCE cost with temporal-smoothness and sparsity terms.

        Args:
            segment_scores: (b*2, 1, nsegment) scores in [0, 1]; first half
                normal videos, second half anomaly videos.
            k: number of segments selected per video.
            lambda_1: weight of the temporal smoothness term.
            lambda_2: weight of the sparsity term.
        Returns:
            (cost, idx_topK_anom, idx_lastK_anom, idx_topK_norm); the index
            tensors are (b, k). All computations are carried out on the CPU.
        """
        # (b*2, 1, nsegment) -> (b*2, nsegment), on the CPU.
        segment_scores = torch.squeeze(segment_scores, 1).to("cpu")

        half = int(segment_scores.shape[0]/2)
        normal_scores = segment_scores[:half]    # (b, nsegment)
        anomaly_scores = segment_scores[half:]   # (b, nsegment)

        # ---- top-k / last-k segment scores ----
        topk_norm_scores, idx_topK_norm = torch.topk(normal_scores, k=k, dim=1, largest=True)
        topk_anom_scores, idx_topK_anom = torch.topk(anomaly_scores, k=k, dim=1, largest=True)
        _, idx_lastK_anom = torch.topk(anomaly_scores, k=k, dim=1, largest=False)
        topk_scores = torch.cat((topk_norm_scores, topk_anom_scores), dim=0)  # (b*2, k)

        # Labels: 0 for segments of normal videos, 1 for anomaly videos.
        label = torch.cat((torch.zeros_like(topk_norm_scores),
                           torch.ones_like(topk_anom_scores)), dim=0)  # (b*2, k)

        # ---- binary cross entropy ----
        bce = F.binary_cross_entropy(topk_scores, label)

        anomaly_scores = anomaly_scores.permute(1, 0)  # (nsegment, b)

        # ---- temporal smoothness: squared difference of neighbouring segments ----
        diff_neighbour = torch.pow(anomaly_scores[:-1] - anomaly_scores[1:], 2)  # (nsegment-1, b)
        temp_smoothness = lambda_1 * torch.sum(diff_neighbour, dim=0, keepdim=True)  # (1, b)

        # ---- sparsity: sum of the anomaly scores ----
        sparsity = lambda_2 * torch.sum(anomaly_scores, dim=0, keepdim=True)  # (1, b)

        cost = bce + torch.mean(temp_smoothness) + torch.mean(sparsity)

        return cost, idx_topK_anom, idx_lastK_anom, idx_topK_norm

In [None]:
import gradio as gr
import os
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt
def fea_extract(video, i3d, randCrop_version):
    """Extract one pooled I3D feature vector per 16-frame clip of `video`.

    Every frame is cropped to 210x280 at one of five offsets (versions 5-9
    reuse the offsets of 0-4 and additionally flip the crop horizontally),
    converted to RGB and normalised with `transform`. Each group of 16 frames
    is passed through `feature_extract` at endpoint "Mixed_5c". Videos with
    fewer than 512 frames are padded with black 240x320 frames up to 512
    frames; trailing frames that do not fill a clip are dropped.

    Args:
        video: path (or other source) accepted by cv.VideoCapture.
        i3d: I3D model passed through to `feature_extract`.
        randCrop_version: index 0..9 selecting the crop offset / flip variant.
    Returns:
        CPU tensor of shape (nclip, 1024).
    Raises:
        Exception: if the video cannot be opened, a NaN appears in the
            extracted features, or the clip count reaches `max_nclip`.
    """
    clip_len = 16      # frames per clip
    min_frames = 512   # 32 segments * 16 frames
    # WE ASSUME THE MAXIMUM #CLIP FOR A VIDEO IS 21000 CLIPS; the feature
    # buffer of that size is allocated once (about 86 MB with the default
    # float32 dtype) because growing it with torch.cat is slow.
    max_nclip = 21000

    # (top, left) offsets of the ten crop variants.
    tenCrop_loc = [[0,0], [0,40], [30,0], [30,40], [15,20],
                   [0,0], [0,40], [30,0], [30,40], [15,20]]
    height_top, width_left = tenCrop_loc[randCrop_version]
    height_bottom = height_top + 210
    width_right = width_left + 280
    random_flip = randCrop_version > 4  # versions 5~9 are the mirrored crops

    f = []            # PIL frames of the clip being assembled
    total_frame = 0   # frames consumed so far (including padding frames)
    curr_nclip = 0    # clips processed so far
    clips_pooling_fea = None  # (max_nclip, 1024) buffer, allocated on the first clip

    # Scratch tensor for one transformed clip, (nframe,c,h,w); every slot is
    # overwritten before it is read, so no initialisation is needed.
    sixteen_frames = torch.empty(clip_len, 3, 210, 280)

    cap = cv.VideoCapture(video)
    try:
        if not cap.isOpened():
            raise Exception("Error opening video file")

        while cap.isOpened():
            ret, frame = cap.read()

            # Keep going while the video delivers frames, or until the
            # minimum number of frames has been reached.
            if ret or total_frame < min_frames:
                if not ret:
                    # Video ended early: pad with a black frame.
                    frame = np.zeros((240, 320, 3), dtype=np.uint8)

                total_frame += 1

                frame = frame[height_top:height_bottom, width_left:width_right]
                if random_flip:
                    frame = cv.flip(frame, 1)  # flip horizontally

                # OpenCV delivers BGR; the transform expects an RGB PIL image.
                frame = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
                f.append(Image.fromarray(frame))

                if len(f) == clip_len:
                    for i, pil_frame in enumerate(f):
                        sixteen_frames[i] = transform(pil_frame)

                    frames = torch.transpose(sixteen_frames, 0, 1)  # (nframe,c,h,w) -> (c,nframe,h,w)
                    frames = torch.unsqueeze(frames, 0)              # -> (b,c,nframe,h,w)

                    # feature_extract moves the model and the clip to the
                    # available device itself.
                    with torch.no_grad():
                        spatial_fea, pooling_fea = feature_extract(i3d, frames, myEndPoint="Mixed_5c")
                        pooling_fea = pooling_fea.to("cpu")  # (1024,)
                        if torch.isnan(spatial_fea).any() or torch.isnan(pooling_fea).any():
                            raise Exception("[!] nan value exist after forward to model")
                        if clips_pooling_fea is None:
                            clips_pooling_fea = torch.empty((max_nclip, 1024))
                        clips_pooling_fea[curr_nclip] = pooling_fea.clone().detach()

                    curr_nclip += 1

                    if curr_nclip >= max_nclip:
                        raise Exception("[!] total number of clips exceed defined maximum number of clips")

                    f = []  # start the next clip
            else:
                # Video ended and at least `min_frames` frames were processed.
                if total_frame//clip_len != curr_nclip:
                    raise Exception("[!] total_frame//16 != curr_nclip. total_frame", total_frame, "curr_nclip", curr_nclip)
                if curr_nclip < 32:
                    raise Exception("[!] Required at least 32 clips but only having", curr_nclip)
                # Copy only the filled rows so the large buffer can be freed.
                return clips_pooling_fea[:curr_nclip, :].clone().detach()
    finally:
        # Release the capture on every exit path, including errors.
        cap.release()

def predict_10crop(video):
    """Predict clip-level anomaly scores for a video, averaged over 10 crop versions.

    Features are extracted with ``fea_extract`` once per crop version (0..9),
    stacked as (ncrop, c, nclip) and passed through the trained ``Model``.

    Args:
        video: video source forwarded unchanged to ``fea_extract``.

    Returns:
        torch.Tensor on the CPU holding the per-clip scores averaged over the
        10 crops -- expected shape (nclip,).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #================= setting =============================
    saved_weight = "./saved_model/top3pos_last3pos_top3neg_16centers_L0.00256_m1.25_b0.2304_1024c_B64/highestAUC.pt" #85.24
    #=======================================================
    model = Model()
    # Map the checkpoint onto the device actually selected above; a hard-coded
    # 'cuda:0' makes loading fail on a CPU-only runtime.
    model.load_state_dict(torch.load(saved_weight, map_location=device))
    model.to(device)
    model.eval()
    i3d = custom_I3D()
    i3d.to(device)
    i3d.eval()
    #========================================================

    tenCrop_fea = None
    for i in range(10):
        x = fea_extract(video, i3d, randCrop_version=i)
        #x = (nclip,c):(?, 1024)
        if tenCrop_fea is None:
            # Every slot is overwritten below, so uninitialised memory is enough.
            tenCrop_fea = torch.empty((10, 1024, x.shape[0]), device=device)
        tenCrop_fea[i] = x.transpose(0, 1)
        #tenCrop_fea = (ncrop,c,nclip):(10,1024,?)

    #=============== predict =============
    with torch.no_grad():
        _, scores = model(tenCrop_fea)
        #score = (ncrop,1,nclip)
    scores = torch.squeeze(scores.to("cpu"))
    #score = (ncrop,nclip)

    #================= mean ======================
    return torch.mean(scores, dim=0) #scores = (nclip,)

def _put_outlined_text(frame, text, origin, color):
    """Draw `text` on `frame` at `origin` in `color`, over a thicker black outline."""
    font = cv.FONT_HERSHEY_SIMPLEX
    font_scale = 0.6
    # Black backdrop first (thickness 2), then the coloured text on top (thickness 1).
    cv.putText(frame, text, origin, font, font_scale, (0, 0, 0), 2, cv.LINE_AA, False)
    cv.putText(frame, text, origin, font, font_scale, color, 1, cv.LINE_AA, False)


def writeVideo(video, scores):
    """Write a copy of `video` with the frame number and anomaly score drawn on each frame.

    Each clip-level score is repeated 16 times to obtain one score per frame.
    Frames are written to './outputVideo.mp4' until either the video ends or
    the frame-level scores run out. Scores above 0.5 are drawn in red, all
    others in green.

    Args:
        video: source passed to ``cv.VideoCapture`` (e.g. a file path).
        scores: clip-level scores; a torch tensor or any array-like.

    Returns:
        None. If the source cannot be opened a message is printed and nothing
        is written.
    """
    if torch.is_tensor(scores):
        scores = scores.detach().cpu().numpy()
    clip_scores = np.round(np.asarray(scores), 4)
    frame_scores = np.repeat(clip_scores, 16) #from clip-level score to frame-level score

    cap = cv.VideoCapture(video)
    writer = None
    try:
        if not cap.isOpened():
            print("Unable to read camera feed")
            return

        frame_width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))

        # Keep the source frame rate so playback speed matches the input;
        # fall back to 30 when the backend reports 0 or NaN.
        fps = cap.get(cv.CAP_PROP_FPS)
        if not fps > 0:
            fps = 30

        # Create the VideoWriter; the output is stored in './outputVideo.mp4'.
        # NOTE(review): the MJPG fourcc is normally paired with an .avi container --
        # confirm the installed OpenCV backend handles it for .mp4 as intended.
        writer = cv.VideoWriter('./outputVideo.mp4', cv.VideoWriter_fourcc('M','J','P','G'), fps, (frame_width,frame_height))

        for frame_no, score in enumerate(frame_scores, start=1):
            ret, frame = cap.read()
            if not ret:
                break  # video ended before the scores ran out

            _put_outlined_text(frame, "Frame:"+str(frame_no), (3, 15), (0,255,0))
            score_color = (0,0,255) if score > 0.5 else (0,255,0)
            _put_outlined_text(frame, "Score:"+str(score), (3, 30), score_color)

            writer.write(frame)
    finally:
        # Release the capture and the writer even if drawing or writing fails.
        cap.release()
        if writer is not None:
            writer.release()

def plot(pred_score):
    """Plot frame-level anomaly scores and return the rendered plot as an RGB image.

    Each clip-level score is repeated 16 times to obtain one score per frame.
    The plot is saved to './my_plot.png' and then read back as an array.

    Args:
        pred_score: clip-level predicted scores (anything ``np.array`` accepts).

    Returns:
        numpy.ndarray: the saved plot image with channels in RGB order.

    Raises:
        IOError: if the saved plot image cannot be read back.
    """
    frame_scores = np.repeat(np.array(pred_score), 16) #change predicted score from clip-level to frame-level
    frame_numbers = np.arange(1, frame_scores.shape[0] + 1)

    fig, ax = plt.subplots()
    try:
        ax.plot(frame_numbers, frame_scores)
        ax.set_ylim(0, 1)
        ax.set_ylabel("Anomaly Score")
        ax.set_title('Anomaly Score VS Frame Number')
        ax.set_xlabel("Frame Number")
        fig.savefig('./my_plot.png')
    finally:
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    plot_bgr = cv.imread('./my_plot.png')
    if plot_bgr is None:
        raise IOError("Unable to read back './my_plot.png'")
    # cv.imread returns BGR; convert so the colours display correctly as RGB.
    return cv.cvtColor(plot_bgr, cv.COLOR_BGR2RGB)

def video_identity(video):
    """Run the full pipeline on `video`: score it, plot the scores, write the annotated copy.

    Returns:
        tuple: (path of the annotated video, './outputVideo.mp4'; the value
        returned by ``plot`` for the predicted scores).
    """
    clip_scores = predict_10crop(video)
    score_plot = plot(clip_scores)
    writeVideo(video, clip_scores)  # writes ./outputVideo.mp4 as a side effect
    return "./outputVideo.mp4", score_plot


In [None]:
#NOTE: the Gradio interface may fail to predict anomaly scores for long videos. As a workaround, uncomment and run the line below to call the pipeline directly.
#video_identity("./Arson016_x264.mp4") #predict anomaly score without gradio interface

In [None]:
# Gradio UI: one uploaded video in; the annotated video and the score plot out.
demo = gr.Interface(
    fn=video_identity,
    inputs=gr.Video(),
    outputs=["playable_video", "image"],
    cache_examples=True,
    allow_flagging="never",
)

# Start the web app only when this code runs as the main module.
if __name__ == "__main__":
    demo.launch()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>