# In [2]:
import math
import os
import copy
import numpy as np
import torch
from torch.nn import ReplicationPad3d
import warnings
from collections import OrderedDict
#from torch._six import container_abcs
import collections.abc as container_abcs
from itertools import islice
import operator

import torch


class ModuleList(torch.nn.Module):
    r"""Holds submodules in a list.

    ModuleList can be indexed like a regular Python list, but modules it
    contains are properly registered, and will be visible by all Module methods.

    Arguments:
        modules (iterable, optional): an iterable of modules to add

    Example::

        class MyModule(nn.Module):
            def __init__(self):
                super(MyModule, self).__init__()
                self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])

            def forward(self, x):
                # ModuleList can act as an iterable, or be indexed using ints
                for i, l in enumerate(self.linears):
                    x = self.linears[i // 2](x) + l(x)
                return x
    """

    def __init__(self, modules=None):
        super(ModuleList, self).__init__()
        if modules is not None:
            self += modules

    def _get_abs_string_index(self, idx):
        """Get the absolute index for the list of modules"""
        idx = operator.index(idx)
        if not (-len(self) <= idx < len(self)):
            raise IndexError('index {} is out of range'.format(idx))
        if idx < 0:
            idx += len(self)
        return str(idx)

    def __getitem__(self, idx):
        # A slice yields a new container of the same class; an int yields the module.
        if isinstance(idx, slice):
            return self.__class__(list(self._modules.values())[idx])
        else:
            return self._modules[self._get_abs_string_index(idx)]

    def __setitem__(self, idx, module):
        idx = self._get_abs_string_index(idx)
        return setattr(self, str(idx), module)

    def __delitem__(self, idx):
        if isinstance(idx, slice):
            for k in range(len(self._modules))[idx]:
                delattr(self, str(k))
        else:
            delattr(self, self._get_abs_string_index(idx))
        # To preserve numbering, self._modules is being reconstructed with modules after deletion
        str_indices = [str(i) for i in range(len(self._modules))]
        self._modules = OrderedDict(list(zip(str_indices, self._modules.values())))

    def __len__(self):
        return len(self._modules)

    def __iter__(self):
        return iter(self._modules.values())

    def __iadd__(self, modules):
        return self.extend(modules)

    def __dir__(self):
        # Hide the numeric keys under which the modules are registered.
        keys = super(ModuleList, self).__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    def insert(self, index, module):
        r"""Insert a given module before a given index in the list.

        The index is normalised the way ``list.insert`` does it: negative
        values count from the end, and values outside the current range are
        clamped to the nearest end, so the keys of ``self._modules`` always
        stay the contiguous strings ``'0' .. str(len - 1)``.

        Arguments:
            index (int): index to insert.
            module (nn.Module): module to insert
        """
        length = len(self)
        index = operator.index(index)
        if index < 0:
            index = max(index + length, 0)
        elif index > length:
            index = length
        # Shift every module at or after `index` one slot towards the end.
        for i in range(length, index, -1):
            self._modules[str(i)] = self._modules[str(i - 1)]
        self._modules[str(index)] = module

    def append(self, module):
        r"""Appends a given module to the end of the list.

        Arguments:
            module (nn.Module): module to append
        """
        self.add_module(str(len(self)), module)
        return self

    def extend(self, modules):
        r"""Appends modules from a Python iterable to the end of the list.

        Arguments:
            modules (iterable): iterable of modules to append
        """
        if not isinstance(modules, container_abcs.Iterable):
            raise TypeError("ModuleList.extend should be called with an "
                            "iterable, but got " + type(modules).__name__)
        offset = len(self)
        for i, module in enumerate(modules):
            self.add_module(str(offset + i), module)
        return self


def get_padding_shape(filter_shape, stride):
    """Compute per-dimension (leading, trailing) padding for a kernel/stride pair.

    For every ``(filter_dim, stride_val)`` pair the total padding is
    ``max(filter_dim - stride_val, 0)``; the leading side receives the floored
    half and the trailing side the remainder.

    The pair computed for the first dimension is moved to the end of the
    result, so the returned flat tuple lists the pairs of the remaining
    dimensions first and the first dimension's pair last.
    """
    flat = []
    for kernel_extent, step in zip(filter_shape, stride):
        total = max(kernel_extent - step, 0)
        leading = total // 2
        flat.extend((leading, total - leading))

    # Rotate the first dimension's (leading, trailing) pair to the back.
    first_pair = [flat.pop(0), flat.pop(0)]
    return tuple(flat + first_pair)


def simplify_padding(padding_shapes):
    """Report whether every entry of ``padding_shapes`` equals the first one.

    Returns:
        tuple: ``(all_same, first_entry)`` where ``all_same`` is True when no
        later entry differs from ``padding_shapes[0]``.
    """
    reference = padding_shapes[0]
    mismatches = [pad != reference for pad in padding_shapes[1:]]
    return not any(mismatches), reference


class Unit3Dpy(torch.nn.Module):
    """3D convolution followed by optional batch norm and optional ReLU.

    With ``padding='SAME'`` the padding amounts come from
    ``get_padding_shape``; when they are all equal they are handed to
    ``Conv3d`` directly, otherwise a separate ``ConstantPad3d`` layer is
    applied in front of an unpadded convolution.  With ``padding='VALID'``
    the convolution uses zero padding.

    Raises:
        ValueError: if ``padding`` is neither ``'SAME'`` nor ``'VALID'``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=(1, 1, 1),
                 stride=(1, 1, 1),
                 activation='relu',
                 padding='SAME',
                 use_bias=False,
                 use_bn=True):
        super().__init__()

        self.padding = padding
        self.use_bn = use_bn
        # 'relu' is swapped for the functional op; any other value is stored as given.
        self.activation = (
            torch.nn.functional.relu if activation == 'relu' else activation)

        if padding not in ('SAME', 'VALID'):
            raise ValueError(
                'padding should be in [VALID|SAME] but got {}'.format(padding))

        conv_kwargs = {'stride': stride, 'bias': use_bias}
        if padding == 'VALID':
            conv_kwargs['padding'] = 0
        else:
            pads = get_padding_shape(kernel_size, stride)
            uniform, first_pad = simplify_padding(pads)
            self.simplify_pad = uniform
            if uniform:
                # Symmetric padding can be done by the convolution itself.
                conv_kwargs['padding'] = first_pad
            else:
                self.pad = torch.nn.ConstantPad3d(pads, 0)

        self.conv3d = torch.nn.Conv3d(
            in_channels, out_channels, kernel_size, **conv_kwargs)

        if use_bn:
            self.batch3d = torch.nn.BatchNorm3d(out_channels)

    def forward(self, inp):
        explicit_pad = self.padding == 'SAME' and self.simplify_pad is False
        features = self.conv3d(self.pad(inp) if explicit_pad else inp)
        if self.use_bn:
            features = self.batch3d(features)
        if self.activation is None:
            return features
        # Any non-None activation setting results in a ReLU.
        return torch.nn.functional.relu(features)


class MaxPool3dTFPadding(torch.nn.Module):
    """3D max pooling with an optional explicit zero-padding layer in front.

    Arguments:
        kernel_size: pooling window, forwarded to ``torch.nn.MaxPool3d``.
        stride: pooling stride; ``None`` means "same as ``kernel_size``",
            which is how ``torch.nn.MaxPool3d`` treats it.
        padding: ``'SAME'`` pads the input with zeros using the amounts from
            ``get_padding_shape`` before pooling; ``'VALID'`` pools the input
            unpadded.

    Raises:
        ValueError: if ``padding`` is neither ``'SAME'`` nor ``'VALID'``.
    """

    def __init__(self, kernel_size, stride=None, padding='SAME'):
        super(MaxPool3dTFPadding, self).__init__()
        if padding == 'SAME':
            # Mirror MaxPool3d's own default so the padding matches the stride
            # that the pooling layer will actually use.
            effective_stride = kernel_size if stride is None else stride
            padding_shape = get_padding_shape(kernel_size, effective_stride)
            self.padding_shape = padding_shape
            self.pad = torch.nn.ConstantPad3d(padding_shape, 0)
        elif padding == 'VALID':
            # No padding layer; forward() skips the padding step.
            self.pad = None
        else:
            raise ValueError(
                'padding should be in [VALID|SAME] but got {}'.format(padding))
        self.pool = torch.nn.MaxPool3d(kernel_size, stride, ceil_mode=True)

    def forward(self, inp):
        if self.pad is not None:
            inp = self.pad(inp)
        out = self.pool(inp)
        return out


class Mixed(torch.nn.Module):
    """Block of four parallel branches whose outputs are concatenated on dim 1.

    ``out_channels`` is indexed 0..5:
        [0]      branch 0: 1x1x1 conv
        [1], [2] branch 1: 1x1x1 conv followed by 3x3x3 conv
        [3], [4] branch 2: 1x1x1 conv followed by 3x3x3 conv
        [5]      branch 3: 3x3x3 max pool (stride 1) followed by 1x1x1 conv
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        pointwise = (1, 1, 1)
        cube = (3, 3, 3)

        self.branch_0 = Unit3Dpy(
            in_channels, out_channels[0], kernel_size=pointwise)

        self.branch_1 = torch.nn.Sequential(
            Unit3Dpy(in_channels, out_channels[1], kernel_size=pointwise),
            Unit3Dpy(out_channels[1], out_channels[2], kernel_size=cube))

        self.branch_2 = torch.nn.Sequential(
            Unit3Dpy(in_channels, out_channels[3], kernel_size=pointwise),
            Unit3Dpy(out_channels[3], out_channels[4], kernel_size=cube))

        self.branch_3 = torch.nn.Sequential(
            MaxPool3dTFPadding(
                kernel_size=cube, stride=(1, 1, 1), padding='SAME'),
            Unit3Dpy(in_channels, out_channels[5], kernel_size=pointwise))

    def forward(self, inp):
        branch_outputs = (
            self.branch_0(inp),
            self.branch_1(inp),
            self.branch_2(inp),
            self.branch_3(inp),
        )
        return torch.cat(branch_outputs, 1)


class I3D(torch.nn.Module):
    """Inception-style 3D convolutional classifier built from Unit3Dpy / Mixed blocks.

    Arguments:
        num_classes: number of output channels of the final 1x1x1 convolution.
        modality: ``'rgb'`` (3 input channels) or ``'flow'`` (2 input channels).
        dropout_prob: probability handed to ``torch.nn.Dropout`` before the
            final convolution.
        name: stored on ``self.name``.

    Raises:
        ValueError: if ``modality`` is not ``'rgb'`` or ``'flow'``.
    """

    def __init__(self,
                 num_classes,
                 modality='rgb',
                 dropout_prob=0,
                 name='inception'):
        super(I3D, self).__init__()

        self.name = name
        self.num_classes = num_classes
        if modality == 'rgb':
            in_channels = 3
        elif modality == 'flow':
            in_channels = 2
        else:
            raise ValueError(
                '{} not among known modalities [rgb|flow]'.format(modality))
        self.modality = modality

        # 1st conv-pool
        self.conv3d_1a_7x7 = Unit3Dpy(
            out_channels=64,
            in_channels=in_channels,
            kernel_size=(7, 7, 7),
            stride=(2, 2, 2),
            padding='SAME')
        self.maxPool3d_2a_3x3 = MaxPool3dTFPadding(
            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME')

        # conv conv
        self.conv3d_2b_1x1 = Unit3Dpy(
            out_channels=64,
            in_channels=64,
            kernel_size=(1, 1, 1),
            padding='SAME')
        self.conv3d_2c_3x3 = Unit3Dpy(
            out_channels=192,
            in_channels=64,
            kernel_size=(3, 3, 3),
            padding='SAME')
        self.maxPool3d_3a_3x3 = MaxPool3dTFPadding(
            kernel_size=(1, 3, 3), stride=(1, 2, 2), padding='SAME')

        # Mixed 3
        self.mixed_3b = Mixed(192, [64, 96, 128, 16, 32, 32])
        self.mixed_3c = Mixed(256, [128, 128, 192, 32, 96, 64])

        self.maxPool3d_4a_3x3 = MaxPool3dTFPadding(
            kernel_size=(3, 3, 3), stride=(2, 2, 2), padding='SAME')

        # Mixed 4
        self.mixed_4b = Mixed(480, [192, 96, 208, 16, 48, 64])
        self.mixed_4c = Mixed(512, [160, 112, 224, 24, 64, 64])
        self.mixed_4d = Mixed(512, [128, 128, 256, 24, 64, 64])
        self.mixed_4e = Mixed(512, [112, 144, 288, 32, 64, 64])
        self.mixed_4f = Mixed(528, [256, 160, 320, 32, 128, 128])

        self.maxPool3d_5a_2x2 = MaxPool3dTFPadding(
            kernel_size=(2, 2, 2), stride=(2, 2, 2), padding='SAME')

        # Mixed 5
        self.mixed_5b = Mixed(832, [256, 160, 320, 32, 128, 128])
        self.mixed_5c = Mixed(832, [384, 192, 384, 48, 128, 128])

        self.avg_pool = torch.nn.AvgPool3d((2, 7, 7), (1, 1, 1))
        self.dropout = torch.nn.Dropout(dropout_prob)
        # Classification head: plain 1x1x1 conv with bias, no batch norm, no ReLU.
        self.conv3d_0c_1x1 = Unit3Dpy(
            in_channels=1024,
            out_channels=self.num_classes,
            kernel_size=(1, 1, 1),
            activation=None,
            use_bias=True,
            use_bn=False)
        self.softmax = torch.nn.Softmax(1)

    def forward(self, inp):
        """Run the network and return the softmax over dim 1 of the averaged logits."""
        out = self.conv3d_1a_7x7(inp)
        out = self.maxPool3d_2a_3x3(out)
        out = self.conv3d_2b_1x1(out)
        out = self.conv3d_2c_3x3(out)
        out = self.maxPool3d_3a_3x3(out)
        out = self.mixed_3b(out)
        out = self.mixed_3c(out)
        out = self.maxPool3d_4a_3x3(out)
        out = self.mixed_4b(out)
        out = self.mixed_4c(out)
        out = self.mixed_4d(out)
        out = self.mixed_4e(out)
        out = self.mixed_4f(out)
        out = self.maxPool3d_5a_2x2(out)
        out = self.mixed_5b(out)
        out = self.mixed_5c(out)
        out = self.avg_pool(out)

        out = self.dropout(out)
        out = self.conv3d_0c_1x1(out)

        # Drop dims 3 and then (new) 3; squeeze(dim) only removes size-1 dims.
        # NOTE(review): assumes both trailing dims are 1 after avg_pool
        # (presumably 224x224 input frames) - confirm against callers.
        out = out.squeeze(3)
        out = out.squeeze(3)
        # Average the remaining dim 2.
        out = out.mean(2)

        out_logits = out
        out = self.softmax(out_logits)
        return out

    def load_tf_weights(self, sess):
        """Copy weights from a TensorFlow session into this module.

        Builds a state dict through ``load_conv3d`` / ``load_mixed`` and loads
        it with ``self.load_state_dict``.

        Arguments:
            sess: session object handed through to the loader helpers.

        Raises:
            ValueError: if ``self.modality`` is not ``'rgb'`` or ``'flow'``.
        """
        # TF scope names always use '/', so join them with posixpath rather
        # than os.path (which would emit backslashes on Windows).
        import posixpath

        if self.modality == 'rgb':
            prefix = 'RGB/inception_i3d'
        elif self.modality == 'flow':
            prefix = 'Flow/inception_i3d'
        else:
            raise ValueError(
                '{} not among known modalities [rgb|flow]'.format(
                    self.modality))

        state_dict = {}

        for name_pt, name_tf in (('conv3d_1a_7x7', 'Conv3d_1a_7x7'),
                                 ('conv3d_2b_1x1', 'Conv3d_2b_1x1'),
                                 ('conv3d_2c_3x3', 'Conv3d_2c_3x3')):
            load_conv3d(state_dict, name_pt, sess,
                        posixpath.join(prefix, name_tf))

        for block in ('3b', '3c', '4b', '4c', '4d', '4e', '4f', '5b', '5c'):
            # Mixed_5b is the one block loaded with fix_typo=True.
            load_mixed(
                state_dict,
                'mixed_' + block,
                sess,
                posixpath.join(prefix, 'Mixed_' + block),
                fix_typo=(block == '5b'))

        load_conv3d(
            state_dict,
            'conv3d_0c_1x1',
            sess,
            posixpath.join(prefix, 'Logits', 'Conv3d_0c_1x1'),
            bias=True,
            bn=False)
        self.load_state_dict(state_dict)


def get_conv_params(sess, name, bias=False):
    """Read a convolution's weights and attributes from a TensorFlow graph.

    Arguments:
        sess: session exposing ``graph`` and ``run``.
        name: TF scope of the convolution; ``w:0``, optionally ``b:0`` and the
            ``convolution`` op are looked up beneath it.
        bias: when True the bias tensor is fetched too and appended last.

    Returns:
        list: ``[conv_weights, kernel_shape, in_channels, out_channels,
        strides, padding]`` plus ``conv_bias`` as a seventh item when
        ``bias`` is True.  ``kernel_shape`` is the first three dims of the
        weight shape, ``in_channels`` / ``out_channels`` are dims 3 and 4.
    """
    # TF names are '/'-separated on every platform; os.path.join would use
    # backslashes on Windows.
    import posixpath

    graph = sess.graph
    conv_weights = sess.run(
        graph.get_tensor_by_name(posixpath.join(name, 'w:0')))
    conv_shape = conv_weights.shape

    kernel_shape = conv_shape[0:3]
    in_channels = conv_shape[3]
    out_channels = conv_shape[4]

    conv_op = graph.get_operation_by_name(posixpath.join(name, 'convolution'))
    padding = _get_padding(conv_op.get_attr('padding'), kernel_shape)
    # Entries 1..3 of the op's 'strides' attribute are kept.
    strides = conv_op.get_attr('strides')[1:4]

    conv_params = [
        conv_weights, kernel_shape, in_channels, out_channels, strides, padding
    ]
    if bias:
        conv_bias = sess.run(
            graph.get_tensor_by_name(posixpath.join(name, 'b:0')))
        conv_params.append(conv_bias)
    return conv_params


def get_bn_params(sess, name):
    """Fetch batch-norm statistics stored under the TF scope ``name``.

    Arguments:
        sess: session exposing ``graph.get_tensor_by_name`` and ``run``.
        name: TF scope containing ``moving_mean:0``, ``moving_variance:0``
            and ``beta:0``.

    Returns:
        tuple: ``(moving_mean, moving_var, beta)`` as returned by ``sess.run``.
    """
    # TF names are '/'-separated on every platform; os.path.join would use
    # backslashes on Windows.
    import posixpath

    def fetch(leaf):
        return sess.run(
            sess.graph.get_tensor_by_name(posixpath.join(name, leaf)))

    moving_mean = fetch('moving_mean:0')
    moving_var = fetch('moving_variance:0')
    beta = fetch('beta:0')
    return moving_mean, moving_var, beta


def _get_padding(padding_name, conv_shape):
    padding_name = padding_name.decode("utf-8")
    if padding_name == "VALID":
        return [0, 0]
    elif padding_name == "SAME":
        #return [math.ceil(int(conv_shape[0])/2), math.ceil(int(conv_shape[1])/2)]
        return [
            math.floor(int(conv_shape[0]) / 2),
            math.floor(int(conv_shape[1]) / 2),
            math.floor(int(conv_shape[2]) / 2)
        ]
    else:
        raise ValueError('Invalid padding name ' + padding_name)


def load_conv3d(state_dict, name_pt, sess, name_tf, bias=False, bn=True):
    """Copy one TF conv (+ optional batch norm) into ``state_dict``.

    Arguments:
        state_dict: dict that receives the PyTorch tensors (mutated in place).
        name_pt: key prefix of the target ``Unit3Dpy`` module.
        sess: session handed to ``get_conv_params`` / ``get_bn_params``.
        name_tf: TF scope of the unit; ``conv_3d`` and ``batch_norm`` are
            looked up beneath it.
        bias: also transfer the convolution bias.
        bn: also transfer the batch-norm parameters.
    """
    # TF names are '/'-separated on every platform; os.path.join would use
    # backslashes on Windows.
    import posixpath

    # Transfer convolution params
    conv_params = get_conv_params(
        sess, posixpath.join(name_tf, 'conv_3d'), bias=bias)
    conv_weights = conv_params[0]

    # Move the last two axes to the front:
    # to pt format (out_c, in_c, depth, height, width)
    conv_weights_rs = np.transpose(conv_weights, (4, 3, 0, 1, 2))
    state_dict[name_pt + '.conv3d.weight'] = torch.from_numpy(conv_weights_rs)
    if bias:
        # get_conv_params appends the bias as the last item.
        state_dict[name_pt + '.conv3d.bias'] = torch.from_numpy(
            conv_params[-1])

    # Transfer batch norm params
    if bn:
        moving_mean, moving_var, beta = get_bn_params(
            sess, posixpath.join(name_tf, 'batch_norm'))

        out_planes = conv_weights_rs.shape[0]
        # No scale is read from TF; the batch-norm weight is set to ones.
        state_dict[name_pt + '.batch3d.weight'] = torch.ones(out_planes)
        state_dict[name_pt + '.batch3d.bias'] = torch.from_numpy(beta)
        state_dict[name_pt
                   + '.batch3d.running_mean'] = torch.from_numpy(moving_mean)
        state_dict[name_pt
                   + '.batch3d.running_var'] = torch.from_numpy(moving_var)


def load_mixed(state_dict, name_pt, sess, name_tf, fix_typo=False):
    """Copy the weights of one ``Mixed`` block from TF into ``state_dict``.

    Arguments:
        state_dict: dict that receives the PyTorch tensors (mutated in place).
        name_pt: key prefix of the target ``Mixed`` module.
        sess: session handed through to ``load_conv3d``.
        name_tf: TF scope of the block.
        fix_typo: when True the second conv of branch 2 is read from
            ``Branch_2/Conv3d_0a_3x3`` instead of ``Branch_2/Conv3d_0b_3x3``.
    """
    # TF names are '/'-separated on every platform; os.path.join would use
    # backslashes on Windows.
    import posixpath

    branch_2_second = (
        'Branch_2/Conv3d_0a_3x3' if fix_typo else 'Branch_2/Conv3d_0b_3x3')

    # (PyTorch sub-module suffix, TF scope beneath name_tf).  Only index 1 of
    # branch_3 is loaded; index 0 is the pooling layer in Mixed.
    layer_map = (
        ('.branch_0', 'Branch_0/Conv3d_0a_1x1'),
        ('.branch_1.0', 'Branch_1/Conv3d_0a_1x1'),
        ('.branch_1.1', 'Branch_1/Conv3d_0b_3x3'),
        ('.branch_2.0', 'Branch_2/Conv3d_0a_1x1'),
        ('.branch_2.1', branch_2_second),
        ('.branch_3.1', 'Branch_3/Conv3d_0b_1x1'),
    )
    for suffix_pt, scope_tf in layer_map:
        load_conv3d(state_dict, name_pt + suffix_pt, sess,
                    posixpath.join(name_tf, scope_tf))



# In [3]:
import torch
import numpy as np
from random import randint, seed
import math
import os
import PIL
import numbers
import cv2



def add_text_to_image(img, text, font = cv2.FONT_ITALIC, bottomLeftCornerOfText = (10,20), fontScale = 0.4,fontColor = (200,200,200),lineType = 1):
    """Draw ``text`` onto ``img`` with ``cv2.putText`` in a random colour.

    Returns whatever ``cv2.putText`` returns.

    NOTE(review): ``fontColor`` is accepted but never used; the colour is
    drawn from ``np.random.randint(0, 255)`` on every call.
    NOTE(review): ``lineType`` is passed as the 7th positional argument of
    ``cv2.putText``, which is presumably the thickness slot - confirm against
    the OpenCV signature.
    """
    channels = np.random.randint(0, 255, size=(3, ))
    # OpenCV wants plain Python ints, not NumPy scalars.
    random_color = tuple(int(channel) for channel in channels)
    return cv2.putText(
        img, text, bottomLeftCornerOfText, font, fontScale, random_color,
        lineType)




def extractFilesFromDirWhichMatchList(folder, string_match_filenames, string_not_match_filenames=None, full_path=False):
    """Recursively collect files whose name contains/excludes given substrings.

    Arguments:
        folder: root directory walked with ``os.walk``.
        string_match_filenames: substrings that must ALL occur.
        string_not_match_filenames: substrings of which NONE may occur
            (``None`` means no exclusions).
        full_path: when True the substrings are tested against the joined
            ``dirpath/filename``; otherwise against the bare file name.

    Returns:
        list: joined paths (``os.path.join(dirpath, filename)``) of the
        matching files, in ``os.walk`` order.
    """
    # None instead of a shared mutable [] default.
    excluded = () if string_not_match_filenames is None else string_not_match_filenames

    result_files = []
    print(folder)
    for dirpath, _dirnames, filenames in os.walk(folder):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            candidate = file_path if full_path else filename

            has_all_required = all(s in candidate for s in string_match_filenames)
            has_excluded = any(s in candidate for s in excluded)
            if has_all_required and not has_excluded:
                result_files.append(file_path)

    return result_files


def resize_clip(clip, size, interpolation='bilinear'):
    """Resize every frame of a clip.

    Arguments:
        clip: sequence of frames, either ``numpy.ndarray`` (resized with
            ``cv2.resize``) or ``PIL.Image.Image`` (resized with
            ``Image.resize``); the type of ``clip[0]`` decides.
        size: a number - the smaller image side is scaled to it and the other
            side follows from ``get_resize_sizes`` - or a 2-item sequence
            whose items are swapped before being passed to the resize call.
        interpolation: ``'bilinear'``; any other value selects nearest
            neighbour.

    Returns:
        numpy.ndarray: ``np.asarray`` of the resized frames.  When ``size`` is
        a number that the smaller side already equals, the frames are returned
        unresized (also through ``np.asarray``, so the type is the same on
        every path).

    Raises:
        TypeError: if ``clip[0]`` is neither an ndarray nor a PIL image.
    """
    first = clip[0]
    if isinstance(first, np.ndarray):
        frames_are_numpy = True
    elif isinstance(first, PIL.Image.Image):
        frames_are_numpy = False
    else:
        raise TypeError('Expected numpy.ndarray or PIL.Image ' +
                        'but got list of {0}'.format(type(first)))

    if isinstance(size, numbers.Number):
        if frames_are_numpy:
            # Leading two dims are taken as rows, columns.
            im_h, im_w = first.shape[:2]
        else:
            im_w, im_h = first.size
        # Min spatial dim already matches minimal size
        if min(im_h, im_w) == size:
            return np.asarray(clip)
        new_h, new_w = get_resize_sizes(im_h, im_w, size)
        target = (new_w, new_h)
    else:
        target = (size[1], size[0])

    bilinear = interpolation == 'bilinear'
    if frames_are_numpy:
        np_inter = cv2.INTER_LINEAR if bilinear else cv2.INTER_NEAREST
        scaled = [
            cv2.resize(img, target, interpolation=np_inter) for img in clip
        ]
    else:
        pil_inter = PIL.Image.BILINEAR if bilinear else PIL.Image.NEAREST
        scaled = [img.resize(target, pil_inter) for img in clip]
    return np.asarray(scaled)


def get_resize_sizes(im_h, im_w, size):
    """Scale ``(im_h, im_w)`` so that the smaller side becomes ``size``.

    The other side is scaled by the same ratio and truncated with ``int``.

    Returns:
        tuple: ``(new_height, new_width)``.
    """
    if im_w < im_h:
        # Width is the short side.
        return int(size * im_h / im_w), size
    # Height is the short side (or both are equal).
    return size, int(size * im_w / im_h)

def normalize_color_input_zero_center_unit_range(frames, max_val = 255.0):
    """Map values from ``[0, max_val]`` onto ``[-1, 1]`` via ``x / max_val * 2 - 1``."""
    unit_scaled = frames / max_val
    return unit_scaled * 2 - 1


class normalizeColorInputZeroCenterUnitRange(object):
    """Callable wrapper around ``normalize_color_input_zero_center_unit_range``.

    ``max_val`` is fixed at construction time and passed on at every call.
    """

    def __init__(self, max_val = 255.0):
        self.max_val = max_val

    def __call__(self, input_tensor):
        return normalize_color_input_zero_center_unit_range(
            input_tensor, max_val = self.max_val)


def random_select(frames, n, seed = None):
    """
    Takes multiple frames as ndarray with shape
    (frame id, height, width, channels) and selects
    randomly n-frames. If n is greater than the number
    of overall frames, placeholder frames (zeros) will
    be added.

    frames: numpy
        all frames (e.g. video) with shape
        (frame id, height, width, channels)
    n: int
        number of desired randomly picked frames
    seed: optional
        when not None, ``random.seed(seed)`` is called first so the
        selection is reproducible

    Returns
    -------
    Numpy: frames
        randomly picked frames, in ascending frame-id order, with shape
        (frame id, height, width, channels)
    """
    # The `seed` argument hides any module-level `seed` function, so the RNG
    # is reached through the module itself.
    import random

    if seed is not None:
        random.seed(seed)

    frames_shape = np.shape(frames)
    number_of_frames = frames_shape[0]

    if number_of_frames < n:
        # Keep all frames, then fill up with 'placeholder' images.  The
        # placeholder shape comes from the clip shape so that an empty clip
        # works as well.
        placeholder = np.zeros(frames_shape[1:])
        selected_frames = [frames[i, :, :, :] for i in range(number_of_frames)]
        selected_frames.extend([placeholder] * (n - number_of_frames))
        return np.array(selected_frames)

    # Draw distinct frame ids until n of them are collected.
    frame_ids = set()
    while len(frame_ids) < n:
        frame_ids.add(random.randint(0, number_of_frames - 1))

    return np.array([frames[frame_id, :, :, :] for frame_id in sorted(frame_ids)])


def center_crop(frames, height, width, pad_zeros_if_too_small = True):
    """
    Takes multiple frames as ndarray with shape
    (frame id, height, width, channels) and crops all
    frames centered to desired width and height.

    frames: numpy
        all frames (e.g. video) with shape
        (frame id, height, width, channels)
    height: int
        height of the resulting crop
    width: int
        width of the resulting crop
    pad_zeros_if_too_small: bool
        when a frame side is smaller than the requested crop, first copy
        the frames into the top-left corner of a zero-filled array that is
        large enough. When False, a too-small side is returned uncropped.

    Returns
    -------
    Numpy: frames
        centered cropped frames with shape
        (frame id, height, width, channels)
    """
    shape = np.shape(frames)
    t, frame_height, frame_width, channels = shape[0], shape[1], shape[2], shape[3]

    if pad_zeros_if_too_small and (height > frame_height or width > frame_width):
        # Grow only the side(s) that are too small; old data goes top-left.
        frames_new = np.zeros((t, max(frame_height, height), max(frame_width, width), channels))
        frames_new[0:t, 0:frame_height, 0:frame_width, 0:channels] = frames
        frames = frames_new
        frame_height = np.shape(frames)[1]
        frame_width = np.shape(frames)[2]

    # Floor division handles odd differences (e.g. input width of 171 and
    # crop width 112).  Clamp at 0: a negative origin would be read by NumPy
    # as "count from the end" and return the wrong rows/columns.
    origin_x = max((frame_width - width) // 2, 0)
    origin_y = max((frame_height - height) // 2, 0)

    return frames[:,
                  origin_y: origin_y + height,
                  origin_x: origin_x + width,
                  :]



class Rescale(object):
    """Callable transform that forwards a clip to ``resize_clip``.

    ``size`` and ``interpolation`` are stored at construction time and passed
    on unchanged at every call.
    """

    def __init__(self, size, interpolation='bilinear'):
        self.interpolation = interpolation
        self.size = size

    def __call__(self, clip):
        return resize_clip(clip, self.size, interpolation=self.interpolation)


class CenterCrop(object):
    """Callable transform that forwards its input to ``center_crop``.

    The target ``height`` and ``width`` are fixed at construction time.
    """

    def __init__(self, height, width):
        self.height, self.width = height, width

    def __call__(self, input_tensor):
        return center_crop(input_tensor, self.height, self.width)




class RandomSelect(object):
    """Callable transform that forwards its input to ``random_select``.

    ``n`` (the number of frames to pick) is fixed at construction time.
    """

    def __init__(self, n):
        self.n = n

    def __call__(self, input_tensor):
        return random_select(input_tensor, self.n)



class ToTensor(object):
    """Callable transform turning a NumPy clip into a float32 torch tensor."""

    def __call__(self, input_tensor):
        # Move the last axis to the front:
        #   numpy frames: Frame ID x Height x Width x Channels
        #   torch frames: Channels x Frame ID x Height x Width
        channels_first = input_tensor.transpose(3, 0, 1, 2)
        return torch.from_numpy(np.float32(channels_first))




# In [4]:
import cv2
import numpy as np
import torch
import torchvision
import pickle as pkl
from torch.autograd import Variable
import os
from collections import OrderedDict

def run_inference_on_video_chunk(images, network, annotation_converter, cuda_active=True, reset_transform=None):
    """
    Performs a single inference forward pass given a video chunk (array of images)

    The forward pass runs under ``torch.no_grad()``.  This function neither
    moves ``network`` to a device nor changes its train/eval mode; with
    ``cuda_active=True`` only the input is moved to the GPU.

    Arguments:
        images: numpy array of images (video chunk), dimensionality TxHxWxC, first dimension is the time
        network: a pytorch model
        annotation_converter: an array of strings for mapping predicted IDs to class names (see load_model function, which reads the annotation converter)
        cuda_active (optional): default - True (the input is moved to the GPU)
        reset_transform (torchvision.transform, optional): default - None. Replaces the default transform with the specified reset_transform, if it is not None.

    Returns: top1_class, top1_class_conf, all_class_conf
        top1_class (str): predicted top1 class
        top1_class_conf (float): confidence of the predicted top1 class
        all_class_conf (dict): confidences for all classes as a dict (key is the class name, value is the confidence)
    """
    if reset_transform:
        transform = reset_transform
    else:  # default transform
        transform = torchvision.transforms.Compose([
            Rescale(size=(252, 256)),  # resize frames
            RandomSelect(n=32),  # pick 32 frames (zero frames are added if fewer are given)
            CenterCrop(height=224, width=224),  # crop frames to (224, 224)
            normalizeColorInputZeroCenterUnitRange(),  # scale pixel values from [0, 255] to [-1, 1]
            ToTensor()  # NumPy (T, H, W, C) -> float32 tensor (C, T, H, W)
        ])

    # Preprocess and add a leading batch dimension of size 1.
    clip = transform(np.asarray(images)).unsqueeze(0)
    if cuda_active:
        clip = clip.cuda()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = network(clip)
    outputs = np.squeeze(outputs.detach().cpu().numpy())

    out_class_id = np.argmax(outputs)   # index of the class with the highest score
    top1_class_conf = np.max(outputs)   # the highest score itself

    top1_class = annotation_converter[out_class_id]
    # Pair class names with scores position by position.
    all_class_conf = dict(zip(annotation_converter, outputs))

    return top1_class, top1_class_conf, all_class_conf


"""
Loads n_frames from video file at filepath as a numpy array, starting at start_frame and returns it as a numpy array.
n_frames is 0 if all the frames should be taken
Note, the video segment should not be too large (be careful with the default parameters, when n_frames = 0, meanining all frames are loaded into memory).
"""


def load_video_segment(filepath, start_frame=0, n_frames=0, visualize=False, waitKey=100):
    """Read consecutive frames from a video file into a list.

    Args:
        filepath: Path passed to ``cv2.VideoCapture``.
        start_frame: Frame index to seek to before reading (only applied when > 0).
        n_frames: Number of frames to read. If not positive, the target is the
            frame count reported by OpenCV minus ``start_frame`` minus 1, so the
            whole remaining segment is held in memory at once.
        visualize: If True, show every frame in an OpenCV window while loading.
        waitKey: Delay in milliseconds handed to ``cv2.waitKey`` when visualizing.

    Returns:
        list: The frames in reading order. The list is empty when the capture
        cannot be opened and shorter than requested when the video runs out of
        frames; it never contains ``None`` entries.
    """
    cap = cv2.VideoCapture(filepath)
    images = []

    try:
        if start_frame > 0:
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        if n_frames > 0:
            video_length = n_frames
        else:
            video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - start_frame - 1

        while cap.isOpened():
            ret, image = cap.read()
            if not ret:
                # No frame was delivered (end of stream or decode failure);
                # storing the placeholder would break imshow and later consumers.
                break
            images.append(image)

            if visualize:
                cv2.imshow('Video Frame ', image)
                cv2.waitKey(waitKey)

            # Checked after the read, so at least one frame is attempted even
            # when video_length is not positive.
            if len(images) > video_length - 1:
                break
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()

    return images


"""
Iterate through a video file frame by frame, compute activity predictions and visualize them in the video.
Maintains a buffer of frames and uses a pretrained I3D model to predict activities for the buffered frames.
Annotates the frames.
Arguments:
    filepath: path for the video file
    network: a pytorch model
    annotation_converter: an array of strings for mapping predicted IDs to class names (see the load_model function, which reads the annotation converter)
    start_frame (default 0): first frame of the video segment, for which the predictions should be computed
    n_frames (default 0, depicting the complete video): number of frames, for which the predictions should be computed (0 if the complete video should be used)
    visualize : default - False. Whether the video with the prediction should be visualized using opencv
    waitKey : default - 100. Parameter for visualization. 100 means, visualization with 100 ms pause between the frames
    buffer_size : default - 32.  Size of the stored frame buffer. The prediction is done on the chunk of this size. This means, if the size is 32, the network prediction is computed from the last 32 frames.
    cuda_active : default - True
    frequency : default - 1. Number of frames, after which a new prediction is computed. 1 means, a prediction is done after every frame.
    video_path_out: default - None. If not None, the original video is re-written together with the prediction to video_path_out
    out_fps: default - 15, output fps
    vidwriter: None -  optionally, one can already provide a vidwriter for the output video. If None, and video_path_out is not None, a new one is created.

"""


# n_frames is 0 if all the frames should be taken
def interate_video_and_predict(filepath, network, annotation_converter, start_frame=0, n_frames=0, visualize=False,
                               waitKey=100,
                               buffer_size=32, cuda_active=True, frequency=1, video_path_out=None, out_fps=15,
                               vidwriter=None):
    """Stream a video, classify the most recent frames and draw the result on each frame.

    Frames are read one at a time into a rolling buffer of at most
    ``buffer_size`` frames. Every ``frequency``-th frame the buffer is passed to
    ``run_inference_on_video_chunk`` and the top prediction is drawn on a copy
    of the current frame, which is optionally displayed and/or written out.
    Frames that are not predicted on are neither displayed nor written.

    Args:
        filepath: Path of the input video.
        network: Model handed to ``run_inference_on_video_chunk``.
        annotation_converter: Class-name lookup handed to ``run_inference_on_video_chunk``.
        start_frame: Frame index to seek to before reading (only applied when > 0).
        n_frames: Number of frames to process. If not positive, the frame count
            reported by OpenCV minus ``start_frame`` minus 1 is used.
        visualize: If True, show each annotated frame in an OpenCV window.
        waitKey: Delay in milliseconds handed to ``cv2.waitKey`` when visualizing.
        buffer_size: Maximum number of most recent frames kept for inference.
        cuda_active: Forwarded to ``run_inference_on_video_chunk``.
        frequency: A prediction is computed whenever the frame counter is a
            multiple of this value.
        video_path_out: If set, annotated frames are written to this path.
        out_fps: Frame rate of a writer created by this function.
        vidwriter: Optional existing ``cv2.VideoWriter``. If None and
            ``video_path_out`` is set, an MJPG writer is created on first use.
            Either way the writer is released before returning.

    Returns:
        None
    """
    cap = cv2.VideoCapture(filepath)
    images = []  # rolling buffer of the most recent frames
    count = 0

    try:
        if start_frame > 0:
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        if n_frames > 0:
            video_length = n_frames
        else:
            video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - start_frame - 1

        if video_path_out:
            # dirname is '' for a bare file name, and os.makedirs('') raises.
            out_dir = os.path.dirname(video_path_out)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)

        while cap.isOpened():
            ret, image = cap.read()
            if not ret:
                # End of stream or decode failure: there is no frame to buffer.
                break

            images.append(image)
            if len(images) > buffer_size:
                images = images[len(images) - buffer_size:]

            if count % frequency == 0:
                top1_class, top1_class_conf, _ = run_inference_on_video_chunk(
                    images, network, annotation_converter, cuda_active=cuda_active)

                top1_class_conf_percent = round(100 * top1_class_conf, 2)
                fontColor = (60 + 100 - top1_class_conf_percent, 1.8 * top1_class_conf_percent, 0)

                # Annotate a copy: the buffer holds this very frame object, and
                # add_text_to_image (defined elsewhere) may draw in place, which
                # would put the overlay text into later network inputs.
                annotated = image.copy()
                annotated = add_text_to_image(annotated, "Frame nr: {}".format(count),
                                              bottomLeftCornerOfText=(10, 20), fontScale=0.85)
                annotated = add_text_to_image(annotated, "{}%".format(top1_class_conf_percent),
                                              bottomLeftCornerOfText=(10, np.shape(annotated)[0] - 35),
                                              fontColor=fontColor, lineType=1, fontScale=0.85,
                                              font=cv2.FONT_HERSHEY_DUPLEX)
                annotated = add_text_to_image(annotated, "{}".format(top1_class),
                                              bottomLeftCornerOfText=(10, np.shape(annotated)[0] - 15),
                                              fontColor=fontColor,
                                              lineType=1, fontScale=0.85, font=cv2.FONT_HERSHEY_DUPLEX)

                if visualize:
                    cv2.imshow('Video Frame ', annotated)
                    cv2.waitKey(waitKey)

                if video_path_out:
                    if vidwriter is None:
                        # Created lazily because the frame size is only known here.
                        fourcc = cv2.VideoWriter_fourcc(*'MJPG')
                        vidwriter = cv2.VideoWriter(video_path_out, fourcc, out_fps,
                                                    (np.shape(annotated)[1], np.shape(annotated)[0]))

                    vidwriter.write(annotated)

            count = count + 1
            # Report against the effective target; n_frames is 0 for "whole video".
            print("Current frame number: {}/{}".format(count, video_length))

            if count > video_length - 1:
                break
    finally:
        # Release on every exit path, including exceptions raised by inference.
        cap.release()
        if vidwriter is not None:
            vidwriter.release()

def load_model(trained_model_path, annotation_converter_path, cuda_active=True):
    """Build the I3D network, load its weights and read the class-name lookup.

    Side effect: gradient tracking is switched off globally via
    ``torch.set_grad_enabled(False)``.

    Args:
        trained_model_path: Path of the state dict to load into the network.
        annotation_converter_path: Path of the pickled annotation converter; its
            length determines the number of classes of the network.
        cuda_active: If True, load the weights with torch's default device
            mapping and move the network to the GPU; otherwise map to the CPU.

    Returns:
        tuple: ``(network, annotation_converter)`` with the network in eval mode.
    """
    # NOTE(security): both pickle and torch.load can execute arbitrary code while
    # deserializing; only point this function at trusted files.
    with open(annotation_converter_path, 'rb') as converter_file:
        annotation_converter = pkl.load(converter_file)

    network = I3D(len(annotation_converter), modality='rgb')

    print("Loading trained model: %s" % (trained_model_path))
    # map_location=None is torch.load's default behavior.
    map_location = None if cuda_active else 'cpu'
    network.load_state_dict(torch.load(trained_model_path, map_location=map_location))
    print("Loading trained model done. Number of classes: {}".format(len(annotation_converter)))

    if cuda_active:
        network = network.cuda()

    torch.set_grad_enabled(False)
    network.eval()

    return network, annotation_converter


def test_run_inference_on_video_chunk():
    """Manual smoke test: classify one 32-frame chunk and print all classes by confidence.

    Uses hard-coded model, annotation and video paths and runs on the GPU.
    """
    cuda_active = True

    # Model weights and the matching class-name lookup
    trained_model_path = "./demo_models/MidLevel/AllActions/view_ids_1/supervised_models/I3D/n_input_frames_32balance_by_sampling_True_affine_True_pretrained_True/2019-03-27-13-10-39/best_model.pth"
    annotation_converter_path = "./demo_models/MidLevel/AllActions/view_ids_1/supervised_models/I3D/n_input_frames_32balance_by_sampling_True_affine_True_pretrained_True/2019-03-27-13-10-39/annotation_converter.pkl"
    network, annotation_converter = load_model(trained_model_path, annotation_converter_path, cuda_active=cuda_active)

    # Frames of the chunk to classify
    filepath_video = "/cvhci/data/activity/Pakos/final_dataset/pakos_videos/vp1/run1b_2018-05-29-14-02-47.ids_1.mp4"
    start_frame = 30000
    n_frames = 32
    video_chunk = load_video_segment(filepath_video, start_frame=start_frame, n_frames=n_frames)

    print("Computing prediction for video {}, video chunk from frame {} to frame {}".format(filepath_video, start_frame,
                                                                                            start_frame + n_frames))
    _, _, all_class_conf = run_inference_on_video_chunk(video_chunk, network,
                                                        annotation_converter,
                                                        cuda_active=cuda_active)

    # Rank the classes from most to least confident
    class_names = np.asarray(list(all_class_conf.keys()))
    confidences = np.asarray(list(all_class_conf.values()))
    order = np.argsort(confidences)[::-1]

    ranked = zip(class_names[order], confidences[order])
    for position, (class_name, confidence) in enumerate(ranked, start=1):
        print("{}) {} - {}".format(position, class_name, confidence))


def test_interate_video_and_predict(filepath_video="./test_data/run1b_2018-05-29-14-02-47.kinect_color.mp4",
                                    video_path_out="./test_data/output.mp4",
                                    start_frame=100,
                                    n_frames=200,
                                    cuda_active=True):
    """
    Run frame-by-frame prediction over a video with the demo model and save the annotated video.
    Important: if n_frames = 0, the whole video is processed!

    Parameters:
    filepath_video (str): Path to the video file to be processed.
    video_path_out (str): Path the annotated video is written to.
    start_frame (int): The frame number from where to start the prediction.
    n_frames (int): The number of frames to process and predict. Important: if 0, the whole video is processed!
    cuda_active (bool): Whether the model is loaded and run on the GPU.
    Returns:
    None - the annotated video is written to video_path_out; on-screen visualization is switched off.
    The model is loaded from ./demo_models/best_model.pth together with ./demo_models/annotation_converter.pkl.
    """
    # Demo weights and the matching class-name lookup
    network, annotation_converter = load_model("./demo_models/best_model.pth",
                                               "./demo_models/annotation_converter.pkl",
                                               cuda_active=cuda_active)

    # Note: out_fps must be 15 for RGB videos!
    interate_video_and_predict(
        filepath_video,
        network=network,
        annotation_converter=annotation_converter,
        start_frame=start_frame,
        n_frames=n_frames,
        visualize=False,
        waitKey=100,
        buffer_size=32,
        cuda_active=cuda_active,
        frequency=1,
        video_path_out=video_path_out,
        out_fps=15,
    )


In [5]:
import logging
import argparse
import cv2

# Configure logging
def configure_logger(log_file):
    """
    Route root-logger messages of level INFO and above to ``log_file``.

    The file is opened in write mode, so existing content is discarded, and
    records are formatted as ``"<asctime> - <message>"``. Calling the function
    again switches logging to the new file: the handler installed by the
    previous call is removed and closed, while handlers installed by anyone
    else are left alone.

    ``logging.basicConfig`` is deliberately not used: it does nothing when the
    root logger already has handlers, which is the case after a first call and
    in environments that pre-install their own handlers.

    Args:
        log_file (str): Path of the log file to (re)create.
    """
    root_logger = logging.getLogger()

    # Detach handlers from earlier calls so each log file only receives the
    # records emitted while it was the active target.
    for handler in list(root_logger.handlers):
        if getattr(handler, "_installed_by_configure_logger", False):
            root_logger.removeHandler(handler)
            handler.close()

    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
    # Marker used above to recognise this function's own handlers.
    file_handler._installed_by_configure_logger = True

    root_logger.addHandler(file_handler)
    root_logger.setLevel(logging.INFO)

def log_prediction(frame_idx, top1_class, confidence):
    """
    Emit one INFO record on the root logger describing a frame's prediction.

    Args:
        frame_idx (int): Index of the frame.
        top1_class (str): Predicted activity.
        confidence (float): Confidence of the prediction; rendered with two
            decimals and followed by a percent sign.
    """
    message = f"Frame {frame_idx}: Predicted Activity: {top1_class}, Confidence: {confidence:.2f}%"
    logging.info(message)

def overlay_prediction(frame, prediction, confidence):
    """
    Write a "Prediction: <prediction> (<confidence>%)" label onto a frame.

    The text is passed to ``cv2.putText`` at position (10, 30) in green and the
    same ``frame`` object that was passed in is returned.

    Args:
        frame: Image handed to ``cv2.putText``.
        prediction: Label to display.
        confidence (float): Value shown with two decimals and a percent sign.

    Returns:
        The ``frame`` argument.
    """
    label = f"Prediction: {prediction} ({confidence:.2f}%)"
    origin = (10, 30)
    green = (0, 255, 0)
    cv2.putText(frame, label, origin, cv2.FONT_HERSHEY_SIMPLEX, 0.8, green, 2, cv2.LINE_AA)
    return frame

def sliding_window_inference(video_path, model_path, annotation_path, log_file, window_size=16, stride=1, resize=(224, 224), cuda_active=True):
    """Classify every sliding window of a video and write one prediction per frame.

    All frames of the video are loaded into memory and resized once. Each
    window of ``window_size`` consecutive frames (advancing by ``stride``) is
    classified, and every frame keeps the most confident prediction among the
    windows that contain it (the earliest window wins ties). The result is
    written to ``log_file`` as lines of the form
    ``Frame <i>: Predicted Activity: <class>, Confidence: <xx.xx>%``.
    Frames not covered by any window produce no line.

    ``log_file`` is written only by this function. It is intentionally not
    attached to the ``logging`` module as well: a second writer on the same
    path would overwrite the beginning of the prediction lines.

    Args:
        video_path: Path of the input video.
        model_path: Path of the model weights, passed to ``load_model``.
        annotation_path: Path of the annotation converter, passed to ``load_model``.
        log_file: Path of the prediction file; overwritten if it exists.
        window_size: Number of consecutive frames per inference call.
        stride: Step between the start frames of consecutive windows.
        resize: Target size handed to ``cv2.resize`` for every frame.
        cuda_active: Whether the model is loaded and run on the GPU.

    Raises:
        ValueError: If no frame could be read from the video.
    """
    print("Loading model...")
    model, annotation_converter = load_model(model_path, annotation_path, cuda_active)
    print("Model loaded successfully.")

    print(f"Loading video from path: {video_path}")
    frames = load_video_segment(video_path, start_frame=0, n_frames=0)
    # A failed read can leave a None placeholder in the list; it cannot be resized.
    frames = [frame for frame in frames if frame is not None]
    print(f"Number of frames loaded: {len(frames)}")
    if not frames:
        raise ValueError("No frames were loaded from the video. Check the video file or path.")

    num_frames = len(frames)

    # Resize each frame once instead of once per window that contains it.
    frames = [cv2.resize(frame, resize) for frame in frames]

    # Best (class, confidence) seen so far for each frame; None while uncovered.
    best_per_frame = [None] * num_frames

    if num_frames < window_size:
        print(f"Video has fewer frames ({num_frames}) than the window size ({window_size}); no predictions are made.")

    print("Starting sliding window inference...")
    for start in range(0, num_frames - window_size + 1, stride):
        print(f"Processing window: Frames {start} to {start + window_size - 1}")
        end = start + window_size

        # cuda_active is passed by keyword, like at the other call sites, so it
        # cannot be bound to a different positional parameter by accident.
        top1_class, top1_class_conf, _ = run_inference_on_video_chunk(
            frames[start:end], model, annotation_converter, cuda_active=cuda_active)
        print(f"Window Prediction: {top1_class} ({top1_class_conf * 100:.2f}%)")

        for i in range(start, end):
            current = best_per_frame[i]
            # Strict '>' keeps the earliest window on ties.
            if current is None or top1_class_conf > current[1]:
                best_per_frame[i] = (top1_class, top1_class_conf)

    print("Combining predictions for overlapping frames...")
    logs = []
    for i, best in enumerate(best_per_frame):
        if best is not None:
            top1_class, top1_class_conf = best
            logs.append(f"Frame {i}: Predicted Activity: {top1_class}, Confidence: {top1_class_conf * 100:.2f}%")

        if i % 50 == 0:
            print(f"Processed frame {i}/{num_frames}")

    with open(log_file, "w") as f:
        f.writelines(line + "\n" for line in logs)

    print("Inference complete. Predictions logged successfully.")

In [6]:
import pandas as pd
import argparse

class Metric:
    """Evaluate per-frame activity predictions of one recording against annotated ground truth.

    The ground-truth CSV is expected to provide the columns used below:
    ``file_id``, ``activity``, ``frame_start``, ``frame_end``,
    ``annotation_id`` and ``chunk_id``.
    """

    # Explicit column layouts so that empty tables still expose their columns.
    _PREDICTION_COLUMNS = ['frame', 'activity', 'confidence']
    _SEGMENT_COLUMNS = ['frame_start', 'frame_end', 'activity']

    def __init__(self, ground_truth_csv, prediction_log, file_id):
        """Load ground truth and predictions and derive the segment view.

        Args:
            ground_truth_csv: Path of the annotation CSV.
            prediction_log: Path of the per-frame prediction log.
            file_id: Value of the ``file_id`` column selecting the recording to evaluate.
        """
        self.file_id = file_id
        self.ground_truth = pd.read_csv(ground_truth_csv)
        self.predictions = self.parse_predictions(prediction_log)
        self.segmented_predictions = self.convert_predictions_to_segments(self.predictions)
        self.activity_classes = list(self.ground_truth['activity'].dropna().unique())
        self.filtered_ground_truth = self.ground_truth[self.ground_truth['file_id'] == file_id]

    @staticmethod
    def parse_predictions(log_file):
        """Read a prediction log into a DataFrame.

        Accepted line formats::

            Frame <n>: Predicted Activity: <name>, Confidence: <xx.xx>%
            <timestamp> - Frame <n>: Predicted Activity: <name>, Confidence: <xx.xx>%

        Lines that do not match are reported on stdout and skipped. A missing
        confidence is recorded as 0.0.

        Args:
            log_file: Path of the log file.

        Returns:
            pandas.DataFrame: Columns ``frame``, ``activity`` and ``confidence``
            (present even when no line could be parsed).
        """
        predictions = []
        with open(log_file, 'r') as file:
            for line in file:
                text = line.strip()
                try:
                    # Lines written without a logging prefix start with the
                    # payload; otherwise drop everything up to the first ' - '.
                    message = text if text.startswith('Frame ') else text.partition(' - ')[2]

                    header, separator, body = message.partition(': ')
                    if not separator:
                        raise ValueError("missing ': ' after the frame header")
                    frame = int(header.split()[1])

                    activity = body.split(', Confidence')[0].replace("Predicted Activity: ", "").strip()

                    confidence = 0.0  # default when the line carries no confidence
                    if "Confidence" in body:
                        confidence = float(body.split('Confidence: ')[1].strip('%'))

                    predictions.append({'frame': frame, 'activity': activity, 'confidence': confidence})

                except (IndexError, ValueError) as e:
                    print(f"Skipping line due to error: {line.strip()} -> {e}")

        return pd.DataFrame(predictions, columns=Metric._PREDICTION_COLUMNS)

    @staticmethod
    def convert_predictions_to_segments(predictions):
        """Collapse consecutive rows with the same activity into segments.

        A segment starts at the frame where the activity changes and ends one
        frame before the next change; the last segment ends at the last
        predicted frame.

        Args:
            predictions: DataFrame with ``frame`` and ``activity`` columns, in frame order.

        Returns:
            pandas.DataFrame: Columns ``frame_start``, ``frame_end`` and
            ``activity`` (present even when there are no segments).
        """
        if predictions.empty:
            return pd.DataFrame(columns=Metric._SEGMENT_COLUMNS)

        segments = []
        current_activity = None
        current_start = None

        for frame, activity in zip(predictions['frame'], predictions['activity']):
            if activity != current_activity:
                if current_activity is not None:
                    # Close the previous segment right before the change.
                    segments.append({
                        'frame_start': current_start,
                        'frame_end': frame - 1,
                        'activity': current_activity
                    })
                current_activity = activity
                current_start = frame

        if current_activity is not None:
            segments.append({
                'frame_start': current_start,
                'frame_end': predictions['frame'].iloc[-1],
                'activity': current_activity
            })

        return pd.DataFrame(segments, columns=Metric._SEGMENT_COLUMNS)

    def evaluate_multiclass(self):
        """Compute per-class and overall precision and recall in percent.

        Counting rules, as implemented:
          * A ground-truth row is a hit when a predicted segment of the same
            activity covers the row's midpoint frame. The first hit per
            ``(annotation_id, chunk_id)`` counts as a true positive, further
            hits for the same key count as false positives.
          * A ground-truth row without such a segment is a false negative.
          * Every predicted segment whose activity has no true positive adds
            one false positive for that activity.

        Returns:
            tuple: ``(precision, recall, overall_precision, overall_recall)``
            where the first two are dicts keyed by activity.
        """
        ground_truth = self.filtered_ground_truth
        segments = self.segmented_predictions
        all_activities = set(ground_truth['activity']).union(set(segments['activity']))

        metrics = {cls: {'tp': 0, 'fp': 0, 'fn': 0} for cls in all_activities}
        matched_chunks = set()

        for _, gt in ground_truth.iterrows():
            gt_activity = gt['activity']
            chunk_key = (gt['annotation_id'], gt['chunk_id'])
            gt_midpoint = (gt['frame_start'] + gt['frame_end']) // 2

            matched_prediction = segments[
                (segments['frame_start'] <= gt_midpoint) &
                (segments['frame_end'] >= gt_midpoint) &
                (segments['activity'] == gt_activity)
            ]

            if matched_prediction.empty:
                metrics[gt_activity]['fn'] += 1
            elif chunk_key not in matched_chunks:
                metrics[gt_activity]['tp'] += 1
                matched_chunks.add(chunk_key)
            else:
                metrics[gt_activity]['fp'] += 1

        # Predicted segments of activities that never produced a true positive.
        for pred_activity in segments['activity']:
            if pred_activity not in metrics or metrics[pred_activity]['tp'] > 0:
                continue
            metrics[pred_activity]['fp'] += 1

        precision, recall = {}, {}
        for cls in all_activities:
            tp, fp, fn = metrics[cls]['tp'], metrics[cls]['fp'], metrics[cls]['fn']
            precision[cls] = tp / (tp + fp) * 100 if (tp + fp) > 0 else 0
            recall[cls] = tp / (tp + fn) * 100 if (tp + fn) > 0 else 0

        overall_tp = sum(metrics[cls]['tp'] for cls in all_activities)
        overall_fp = sum(metrics[cls]['fp'] for cls in all_activities)
        overall_fn = sum(metrics[cls]['fn'] for cls in all_activities)

        overall_precision = overall_tp / (overall_tp + overall_fp) * 100 if (overall_tp + overall_fp) > 0 else 0
        overall_recall = overall_tp / (overall_tp + overall_fn) * 100 if (overall_tp + overall_fn) > 0 else 0

        return precision, recall, overall_precision, overall_recall

    def midpoint_hit_criteria(self):
        """Fraction of ground-truth windows whose midpoint frame is predicted correctly.

        For each ground-truth row of the evaluated recording, the prediction at
        the row's midpoint frame is looked up; the row is a hit when the
        predicted activity equals the annotated one. Rows whose midpoint has no
        prediction count as misses.

        Returns:
            float: correct_hits / total_instances, or 0.0 without ground-truth rows.
        """
        # Per-frame lookup. The predictions table is a DataFrame, so testing
        # `frame in self.predictions` would check column names, not frames.
        activity_by_frame = dict(zip(self.predictions['frame'], self.predictions['activity']))

        correct_hits = 0
        total_windows = 0

        for _, row in self.filtered_ground_truth.iterrows():
            midpoint_frame = (row['frame_start'] + row['frame_end']) // 2

            if midpoint_frame in activity_by_frame and activity_by_frame[midpoint_frame] == row['activity']:
                correct_hits += 1

            total_windows += 1

        return correct_hits / total_windows if total_windows > 0 else 0.0

    def iou(self):
        """
        Calculate the mean Intersection over Union (IoU) over the ground-truth rows.

        For each ground-truth window, the predicted window spans from the first
        to the last frame inside the ground-truth window whose predicted
        activity matches the annotation. Intersection is the overlap of both
        windows, union the range covered by either. Rows without any matching
        frame score 0.

        Returns:
            float: The average IoU score over the ground-truth rows, or 0.0 if there are none.
        """
        iou_scores = []

        for _, row in self.filtered_ground_truth.iterrows():
            gt_start = row['frame_start']
            gt_end = row['frame_end']
            activity = row['activity']

            # Correctly predicted frames inside the ground-truth window
            predicted_frames = self.predictions[
                (self.predictions["activity"] == activity) &
                (self.predictions["frame"] >= gt_start) &
                (self.predictions["frame"] <= gt_end)
            ]["frame"].tolist()

            if not predicted_frames:
                iou_scores.append(0)
                continue

            pred_start = min(predicted_frames)
            pred_end = max(predicted_frames)

            intersection_start = max(gt_start, pred_start)
            intersection_end = min(gt_end, pred_end)
            intersection = max(0, intersection_end - intersection_start + 1)

            union_start = min(gt_start, pred_start)
            union_end = max(gt_end, pred_end)
            union = max(0, union_end - union_start + 1)

            iou = intersection / union if union > 0 else 0
            iou_scores.append(iou)

        return sum(iou_scores) / len(iou_scores) if iou_scores else 0.0

    def evaluate(self):
        """Print per-class and overall precision/recall and the mean IoU."""
        mean_iou = self.iou()
        precision, recall, overall_precision, overall_recall = self.evaluate_multiclass()

        print("Precision per class (%):", {cls: f"{precision[cls]:.2f}%" for cls in precision})
        print("Recall per class (%):", {cls: f"{recall[cls]:.2f}%" for cls in recall})
        print(f"Overall Precision: {overall_precision:.2f}%")
        print(f"Overall Recall: {overall_recall:.2f}%")
        print(f"Mean IoU: {mean_iou*100:.2f}%")

In [None]:
# Recordings to process, given relative to the data folder and without the
# ".mp4" extension.
BASE_VIDEO_PATHS = [
    "vp9/run1b_2018-05-23-16-19-17.kinect_color",
    "vp10/run1_2018-05-24-13-14-41.kinect_color",
    "vp10/run2_2018-05-24-14-08-46.kinect_color",
    "vp12/run1_2018-05-24-15-44-28.kinect_color",
    "vp12/run2_2018-05-24-16-21-35.kinect_color",
]

# Model weights and the matching pickled annotation converter.
MODEL_PATH = "/content/best_model.pth"
ANNOTATION_PATH = "/content/annotation_converter.pkl"

# Ground-truth annotation CSV.
GROUND_TRUTH_CSV = "/content/midlevel.chunks_90.csv"

# Sliding-window lengths (in frames) to run for every recording.
WINDOW_SIZES = [8, 16, 32]

for BASE_VIDEO in BASE_VIDEO_PATHS:
    DATA_FOLDER="/content/data"
    VIDEO_PATH=f"{DATA_FOLDER}/{BASE_VIDEO}.mp4"

    for WINDOW_SIZE in WINDOW_SIZES:
        LOG_FILE=f"{DATA_FOLDER}/{BASE_VIDEO}_predictions_w{WINDOW_SIZE}.log"
        METRICS_OUTPUT=f"{DATA_FOLDER}/{BASE_VIDEO}_metrics_results_w{WINDOW_SIZE}.txt"

        print(f"Processing video: {VIDEO_PATH} with window size {WINDOW_SIZE}")

        sliding_window_inference(video_path=VIDEO_PATH,
                             model_path=MODEL_PATH,
                             annotation_path=ANNOTATION_PATH,
                             log_file=LOG_FILE,
                             window_size=WINDOW_SIZE,  # Use the user-specified window size
                             stride=1,
                             resize=(224, 224),
                             cuda_active=True)