In [5]:
# Mount Google Drive into the Colab filesystem so the dataset archive stored
# under MyDrive can be read by the following cell.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
!unzip "/content/gdrive/MyDrive/data(1).zip"


Archive:  /content/gdrive/MyDrive/data(1).zip
replace data/features/P03_cam01_P03_cereals.npy? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [15]:
import torch
from torchvision.io.video import read_video
from PIL import Image
import numpy as np

from os import listdir
from os.path import isfile, join
import torchvision.transforms.functional as tf



class TCNDataset(torch.utils.data.Dataset):
    """Dataset of pre-extracted per-video features with frame-wise labels.

    Expected layout under ``path``:
      * ``mapping.txt``             - one ``<index> <class name>`` pair per line
      * ``train.bundle`` / ``test.bundle`` - one ``<video>.txt`` entry per line
      * ``features/<video>.npy``    - feature array loaded with ``np.load``
      * ``groundTruth/<video>.txt`` - one class name per line

    Args:
        path: root directory of the dataset.
        training: if True the videos listed in ``train.bundle`` are used,
            otherwise those listed in ``test.bundle``.
    """

    def __init__(self,path='/content/data' , training = True):

        self.path = path
        # Load the class mapping. Both dictionaries keep the index as the
        # *string* read from the file (callers convert with int() when needed).
        self.class2index = {}
        self.index2class = {}
        with open(path+'/mapping.txt','r') as classes_file:
            for line in classes_file:
                line = line.rstrip('\n')
                if not line.strip():
                    # Tolerate blank lines instead of failing on splitted[1].
                    continue
                splitted = line.split(' ')
                self.class2index[splitted[1]] = splitted[0]
                self.index2class[splitted[0]] = splitted[1]

        # Load the list of video names belonging to the requested split.
        video_list_location = '/train.bundle' if training else '/test.bundle'
        self.video_list = []
        with open(path+video_list_location) as video_list_file:
            for line in video_list_file:
                line = line.rstrip('\n')
                if not line.strip():
                    # A blank line would otherwise become an empty video name
                    # that inflates len() and breaks __getitem__.
                    continue
                name = line.split('.txt')[0]
                self.video_list.append(name)

        self.size = len(self.video_list)
        self.training = training

    def __len__(self):
        """Number of videos in the selected split."""
        return self.size


    def __getitem__(self, index):
        """Return ``(features, labels)`` for the video at position ``index``.

        ``features`` is the tensor stored in ``features/<video>.npy``.
        ``labels`` is a float tensor with ``features.shape[1]`` entries holding
        the class index of each ground-truth line (entries without a
        corresponding ground-truth line stay 0).

        Raises:
            KeyError: if a ground-truth line is not a known class name.
            IndexError: if the ground-truth file has more lines than
                ``features.shape[1]``.
        """
        video_name = self.video_list[index]
        features = torch.from_numpy(np.load(self.path+'/features/'+video_name+'.npy'))
        labels = torch.zeros((features.shape[1],))
        # The context manager guarantees the file is closed even when a label
        # lookup raises.
        with open(self.path+'/groundTruth/'+video_name+'.txt') as labels_file:
            for idx, line in enumerate(labels_file):
                line = line.rstrip('\n')
                label = self.class2index[line]
                labels[idx] = int(label)
        return features, labels

                

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# From figure 2 in paper
class DilatedResidualLayer(torch.nn.Module):
    """Residual block: dilated 3-tap conv -> ReLU -> 1x1 conv, plus skip connection.

    Args:
        dilation_factor: dilation of the 3-tap convolution; also used as its
            padding so the temporal length is preserved.
        in_channels: channels of the input (must equal ``out_channels`` for the
            residual addition in ``forward`` to be valid).
        out_channels: channels produced by both convolutions.
    """

    def __init__(self, dilation_factor, in_channels, out_channels):
        super().__init__()
        dilated_conv = nn.Conv1d(
            in_channels,
            out_channels,
            3,
            padding=dilation_factor,
            dilation=dilation_factor,
        )
        activation = nn.ReLU(inplace = True)
        pointwise_conv = nn.Conv1d(out_channels, out_channels, 1)
        # Kept as a Sequential named `block` so parameter names stay
        # block.0.* / block.2.*.
        self.block = nn.Sequential(dilated_conv, activation, pointwise_conv)

    def forward(self, x, mask):
        """Apply the block, add the input back and zero the masked positions.

        Only the first channel of ``mask`` is used; it is broadcast over all
        feature channels.
        """
        frame_mask = mask[:, 0:1, :]
        transformed = self.block(x)
        return (transformed + x) * frame_mask

    

class TCN(torch.nn.Module):
    """Single-stage temporal convolutional network.

    A 1x1 convolution projects the ``dim`` input channels to ``num_f_maps``,
    followed by ``num_layers`` dilated residual layers and a final 1x1
    convolution producing ``num_classes`` channels per frame.
    """

    def __init__(self, num_layers = 10, num_f_maps=64, dim = 2048, num_classes = 48):
        super().__init__()
        self.conv_1x1 = nn.Conv1d(dim, num_f_maps, 1)
        # Dilation grows linearly with depth: 1, 2, ..., num_layers.
        self.layers = nn.ModuleList(
            [DilatedResidualLayer(dilation, num_f_maps, num_f_maps)
             for dilation in range(1, num_layers + 1)]
        )
        self.last_conv = nn.Conv1d(num_f_maps, num_classes, 1)

    def forward(self, x , mask):
        """Return per-frame class scores, zeroed where ``mask`` is zero."""
        frame_mask = mask[:, 0:1, :]
        hidden = self.conv_1x1(x)
        for residual_layer in self.layers:
            hidden = residual_layer(hidden, mask)
        return self.last_conv(hidden) * frame_mask
    
    
class MultiStageTCN(torch.nn.Module):
    """Stack of ``num_stages`` TCNs where each later stage refines the previous one.

    The first stage sees only the input features. Every following stage sees
    the input features concatenated (along the channel axis) with the masked
    softmax of the previous stage's output.

    Args:
        num_stages: total number of stages (first + intermediate + last);
            must be at least 2.
        num_layers: dilated residual layers per stage.
        num_f_maps: hidden channels inside each stage.
        dim: channels of the input features.
        num_classes: channels produced by each stage.

    Raises:
        ValueError: if ``num_stages`` is smaller than 2.
    """

    def __init__(self, num_stages = 4, num_layers = 10, num_f_maps=64, dim = 2048, num_classes = 48):
        super(MultiStageTCN, self).__init__()
        if num_stages < 2:
            raise ValueError("MultiStageTCN needs at least 2 stages, got {}".format(num_stages))
        self.first_TCN = TCN(num_layers, num_f_maps, dim, num_classes)
        # The number of intermediate stages is driven by num_stages (it was
        # previously derived from num_layers, leaving num_stages unused).
        self.TCNs = nn.ModuleList(
            [TCN(num_layers, num_f_maps, num_classes + dim, num_classes)
             for _ in range(num_stages - 2)]
        )
        self.last_TCN =  TCN(num_layers, num_f_maps, num_classes + dim, num_classes)

    def _refinement_input(self, x, logits, mask):
        """Build the input of a refinement stage from features and previous scores."""
        probabilities = F.softmax(logits, dim=1) * mask[:, 0:1, :]
        return torch.cat((x, probabilities), dim=1)

    def forward(self, x , mask):
        """Return the scores of the last stage.

        Each stage is applied exactly once.
        """
        out = self.first_TCN(x,mask)
        for stage in self.TCNs:
            out = stage(self._refinement_input(x, out, mask), mask)
        return self.last_TCN(self._refinement_input(x, out, mask), mask)
    

# For question 4, supporting down and upsampling  
class SampledTCN(torch.nn.Module):
    """TCN branch that shrinks and later restores the channel dimension.

    A 1x1 convolution first reduces the ``dim`` input channels to
    ``dim // sampling_factor``; after the dilated residual layers another 1x1
    convolution maps the ``num_f_maps`` hidden channels back to ``dim``.
    ``num_classes`` is accepted for signature parity but is not used here.
    """

    def __init__(self, sampling_factor, num_layers = 10, num_f_maps=64, dim = 2048, num_classes = 48 ):
        super().__init__()
        reduced_dim = dim // sampling_factor
        # Downsampling is done with a 1x1 convolution over channels.
        self.downsamble_conv = nn.Conv1d(dim, reduced_dim, 1)
        self.conv_1x1 = nn.Conv1d(reduced_dim, num_f_maps, 1)
        self.layers = nn.ModuleList(
            [DilatedResidualLayer(dilation, num_f_maps, num_f_maps)
             for dilation in range(1, num_layers + 1)]
        )
        self.upsample_conv = nn.Conv1d(num_f_maps, dim, 1)

    def forward(self, x , mask):
        """Return ``dim``-channel features, zeroed where ``mask`` is zero."""
        hidden = self.conv_1x1(self.downsamble_conv(x))
        for residual_layer in self.layers:
            hidden = residual_layer(hidden, mask)
        return self.upsample_conv(hidden) * mask[:, 0:1, :]
    
    
    
class ParallelTCNs(torch.nn.Module):
    """Three SampledTCN branches run in parallel on the same input.

    The branches use channel reduction factors 1, 4 and 8. Each branch output,
    as well as the element-wise mean of the three, is turned into per-frame
    class scores by its own 1x1 convolution.
    """

    def __init__(self, num_layers = 10, num_f_maps=64, dim = 2048, num_classes = 48):
        super().__init__()
        # No reduction.
        self.TCN1 = SampledTCN(1, num_layers, num_f_maps, dim, num_classes)
        # Channels reduced by a factor of 4 (2048 -> 512 with the default dim).
        self.TCN2 = SampledTCN(4, num_layers, num_f_maps, dim, num_classes)
        # Channels reduced by a factor of 8 (2048 -> 256 with the default dim).
        self.TCN3 = SampledTCN(8, num_layers, num_f_maps, dim, num_classes)

        self.prediction_conv1 = nn.Conv1d(dim, num_classes, 1)
        self.prediction_conv2 = nn.Conv1d(dim, num_classes, 1)
        self.prediction_conv3 = nn.Conv1d(dim, num_classes, 1)

        self.prediction_conv_average = nn.Conv1d(dim, num_classes, 1)

    def forward(self, x, mask):
        """Return ``(out1, out2, out3, average_out)`` masked class scores."""
        frame_mask = mask[:, 0:1, :]
        branch_features = [branch(x, mask) for branch in (self.TCN1, self.TCN2, self.TCN3)]
        # Fuse the branches before any prediction head is applied.
        fused_features = torch.stack(branch_features).mean(dim=0)

        heads = (self.prediction_conv1, self.prediction_conv2, self.prediction_conv3)
        out1, out2, out3 = [
            head(features) * frame_mask
            for head, features in zip(heads, branch_features)
        ]
        average_out = self.prediction_conv_average(fused_features) * frame_mask
        return out1, out2, out3, average_out
            
            



In [31]:
# coding: utf-8
import numpy as np
import sys
import torch

def get_labels_start_end_time(frame_wise_labels, bg_class=["background"]):
    """Collapse a frame-wise label sequence into non-background segments.

    Args:
        frame_wise_labels: indexable sequence with one label per frame.
        bg_class: labels treated as background; segments with these labels
            are not reported.

    Returns:
        ``(labels, starts, ends)``: three parallel lists with the label of
        each segment, its first frame index and its end index (exclusive).
        All three are empty for an empty input.
    """
    labels = []
    starts = []
    ends = []
    num_frames = len(frame_wise_labels)
    if num_frames == 0:
        # Nothing to segment; previously this raised IndexError.
        return labels, starts, ends

    last_label = frame_wise_labels[0]
    if last_label not in bg_class:
        labels.append(last_label)
        starts.append(0)
    for i in range(1, num_frames):
        current = frame_wise_labels[i]
        if current != last_label:
            if current not in bg_class:
                labels.append(current)
                starts.append(i)
            if last_label not in bg_class:
                ends.append(i)
            last_label = current
    # Close the segment that runs until the end of the sequence.
    if last_label not in bg_class:
        ends.append(num_frames)
    return labels, starts, ends


def levenstein(p, y, norm=False):
    """Levenshtein (edit) distance between the sequences ``p`` and ``y``.

    Args:
        p: first sequence (anything supporting ``len`` and indexing).
        y: second sequence.
        norm: if True, return a similarity score in percent,
            ``(1 - distance / max(len(p), len(y))) * 100``, instead of the raw
            distance.

    Returns:
        The raw edit distance, or the normalised score when ``norm`` is True.
        Two empty sequences are identical, so their normalised score is 100.0
        (previously this was a 0/0 division yielding NaN).
    """
    m_row = len(p)
    n_col = len(y)
    D = np.zeros([m_row+1, n_col+1], np.float64)
    # Distance from/to the empty prefix is the prefix length.
    for i in range(m_row+1):
        D[i, 0] = i
    for j in range(n_col+1):
        D[0, j] = j

    for j in range(1, n_col+1):
        for i in range(1, m_row+1):
            if y[j-1] == p[i-1]:
                D[i, j] = D[i-1, j-1]
            else:
                D[i, j] = min(D[i-1, j] + 1,
                              D[i, j-1] + 1,
                              D[i-1, j-1] + 1)

    if not norm:
        return D[-1, -1]

    longest = max(m_row, n_col)
    if longest == 0:
        return 100.0
    return (1 - D[-1, -1]/longest) * 100


def edit_score(recognized, ground_truth, norm=True, bg_class=["background"]):
    """Edit score between the segment label sequences of two frame-wise labelings.

    Both inputs are first collapsed into their ordered non-background segment
    labels; the result is ``levenstein`` applied to those two label lists.
    """
    predicted_segments = get_labels_start_end_time(recognized, bg_class)[0]
    reference_segments = get_labels_start_end_time(ground_truth, bg_class)[0]
    return levenstein(predicted_segments, reference_segments, norm)


def f_score(recognized, ground_truth, overlap, bg_class=["background"]):
    """Segment-level true/false positive and false negative counts at an IoU threshold.

    A predicted segment is a true positive when its best-overlapping
    ground-truth segment with the same label reaches an IoU of at least
    ``overlap`` and has not been matched before; otherwise it is a false
    positive. Unmatched ground-truth segments are false negatives.

    Args:
        recognized: frame-wise predicted labels.
        ground_truth: frame-wise reference labels.
        overlap: IoU threshold.
        bg_class: labels ignored when extracting segments.

    Returns:
        ``(tp, fp, fn)`` as floats.
    """
    p_label, p_start, p_end = get_labels_start_end_time(recognized, bg_class)
    y_label, y_start, y_end = get_labels_start_end_time(ground_truth, bg_class)

    if len(y_label) == 0:
        # With no ground-truth segment there is nothing to match against:
        # every prediction is a false positive. (argmax over an empty IoU
        # array used to raise ValueError here.)
        return 0.0, float(len(p_label)), 0.0

    tp = 0
    fp = 0

    hits = np.zeros(len(y_label))

    for j in range(len(p_label)):
        intersection = np.minimum(p_end[j], y_end) - np.maximum(p_start[j], y_start)
        union = np.maximum(p_end[j], y_end) - np.minimum(p_start[j], y_start)
        # Zero out the IoU of ground-truth segments with a different label.
        same_label = [p_label[j] == y_label[x] for x in range(len(y_label))]
        IoU = (1.0*intersection / union)*same_label
        # Get the best scoring segment
        idx = np.array(IoU).argmax()

        if IoU[idx] >= overlap and not hits[idx]:
            tp += 1
            hits[idx] = 1
        else:
            fp += 1
    fn = len(y_label) - hits.sum()
    return float(tp), float(fp), float(fn)




def read_file(path):
    """Return the full text content of the file at ``path``."""
    # The context manager closes the file; no explicit close() is required.
    with open(path, 'r') as handle:
        return handle.read()


def get_results(model,features, labels, masks): # Input  = features, labels and masks for 1 video
    """Run ``model`` on a single video and return flat label arrays.

    Args:
        model: callable returning four outputs; only the last one (the
            averaged prediction) is used.
        features: model input for one video (batch size 1).
        labels: ground-truth tensor whose second axis has one entry per frame.
        masks: mask passed through to the model.

    Returns:
        ``(gt_content, recog_content)``: 1-D numpy arrays with the ground-truth
        and the predicted class index of every frame.
    """
    # Inference only: skip building the autograd graph to save memory/time.
    with torch.no_grad():
        _,_,_,out = model(features,masks)
    # Softmax is monotonic, so taking the arg-max of the raw scores gives the
    # same class as taking it after a softmax.
    out_hot = torch.max(out,1).indices
    recog_content = out_hot.cpu().numpy()
    recog_content = recog_content.reshape(recog_content.shape[1])
    gt_content = labels.cpu().numpy()
    gt_content = gt_content.reshape(gt_content.shape[1])
    return(gt_content,recog_content)




def eval_(model, test_dataloader):# Batch size 1
    """Evaluate ``model`` and print frame accuracy, edit score and F1@{0.10, 0.25, 0.50}.

    Args:
        model: network returning four outputs (see ``get_results``).
        test_dataloader: iterable yielding ``(features, labels, masks)`` with
            a batch size of 1 (one video per batch).

    The edit score is averaged over the videos actually yielded by
    ``test_dataloader``. Inputs are moved to the device holding the model's
    parameters, so evaluation also works on CPU.
    """
    model.eval()
    try:
        device = next(model.parameters()).device
    except StopIteration:
        # Parameter-free model: keep everything on the CPU.
        device = torch.device("cpu")

    overlap = [.1, .25, .5]
    tp, fp, fn = np.zeros(3), np.zeros(3), np.zeros(3)

    correct = 0
    total = 0
    edit = 0
    num_videos = 0

    for features, labels, masks in test_dataloader:
        features, labels, masks = features.to(device), labels.to(device), masks.to(device)
        gt_content,recog_content = get_results(model,features, labels, masks)
        num_videos += 1

        total += len(gt_content)
        correct += int(np.sum(gt_content == recog_content))

        edit += edit_score(recog_content, gt_content)

        for s in range(len(overlap)):
            tp1, fp1, fn1 = f_score(recog_content, gt_content, overlap[s])
            tp[s] += tp1
            fp[s] += fp1
            fn[s] += fn1

    if num_videos == 0 or total == 0:
        # Avoid dividing by zero when the loader yields nothing.
        print("No frames to evaluate.")
        return

    print ("Acc: %.4f" % (100*float(correct)/total))
    print ('Edit: %.4f' % ((1.0*edit)/num_videos))

    for s in range(len(overlap)):
        # Explicit zero guards replace the former NaN + nan_to_num round trip;
        # the reported value (0) is the same.
        predicted = float(tp[s]+fp[s])
        actual = float(tp[s]+fn[s])
        precision = tp[s] / predicted if predicted > 0 else 0.0
        recall = tp[s] / actual if actual > 0 else 0.0

        if precision + recall > 0:
            f1 = 2.0 * (precision*recall) / (precision+recall)
        else:
            f1 = 0.0

        f1 = f1*100
        print ('F1@%0.2f: %.4f' % (overlap[s], f1))



In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

    
def train_parallel(model, dataloader,optimizer):
    """Train ``model`` for one epoch and return the mean combined loss per batch.

    Args:
        model: network returning four score tensors ``(out1, out2, out3,
            out_average)`` for ``model(features, masks)``.
        dataloader: sized iterable yielding ``(features, labels, masks)``.
        optimizer: optimizer over the model's parameters.

    Returns:
        Sum of the per-batch combined losses divided by ``len(dataloader)``.

    The combined loss is the sum of the cross-entropy of each of the four
    outputs. Inputs are moved to the device holding the model's parameters.
    """
    model.train()
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")
    criterion = nn.CrossEntropyLoss()
    running_loss = 0
    for i, (features, labels, masks) in enumerate(dataloader):
        features , labels , masks = features.to(device) , labels.to(device) , masks.to(device)
        out1, out2, out3 ,out_average = model(features,masks)
        optimizer.zero_grad()
        loss = (criterion(out1, labels)
                + criterion(out2, labels)
                + criterion(out3, labels)
                + criterion(out_average, labels))
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        if i % 10 == 0:
            print("    Batch {}: combined loss = {} , average loss: {}".format(i ,batch_loss, batch_loss/4 ))
        # Accumulate (the original overwrote the value, so the returned
        # "mean" was just the last batch loss divided by the batch count).
        running_loss += batch_loss
    return running_loss / len(dataloader)



# Collate function for the DataLoader: zero-pads every video in a batch to the
# length of the longest one, because video lengths vary.
# Inspired by the code accompanying the paper.
def collate_fn_padd(batch):
    """Pad a list of ``(features, labels)`` samples to a common length.

    Returns:
        ``(inputs, targets, mask)`` where ``inputs`` is zero-padded float
        features, ``targets`` is a long tensor padded with -100, and ``mask``
        is a float tensor with ``num_classes`` channels that is 1 on real
        frames and 0 on padding. ``num_classes`` is read from module scope.
    """
    sample_inputs, sample_targets = [list(column) for column in zip(*batch)]
    n_samples = len(sample_inputs)
    longest = max([len(target) for target in sample_targets])
    n_channels = np.shape(sample_inputs[0])[0]

    padded_inputs = torch.zeros(n_samples, n_channels, longest, dtype=torch.float)
    # -100 marks padded frames in the targets.
    padded_targets = torch.zeros(n_samples, longest, dtype=torch.long) - 100
    mask = torch.zeros(n_samples, num_classes, longest, dtype=torch.float)

    for row, (sample_input, sample_target) in enumerate(zip(sample_inputs, sample_targets)):
        padded_inputs[row, :, :np.shape(sample_input)[1]] = sample_input
        n_frames = np.shape(sample_target)[0]
        padded_targets[row, :n_frames] = sample_target
        mask[row, :, :n_frames] = 1.0

    return padded_inputs, padded_targets, mask
            
            

# Preferred device. NOTE(review): the code visible in this notebook calls
# .cuda() directly and never reads `device` - confirm whether it is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 4  # videos per training batch
epochs = 50  # number of passes over the training set
num_classes = 48  # read as a global by collate_fn_padd to size the mask


# DATA LOADERS
# Training split; collate_fn_padd pads the variable-length videos of a batch.

training_dataset = TCNDataset(training=True)
training_dataloader = torch.utils.data.DataLoader(training_dataset,collate_fn=collate_fn_padd,  batch_size=batch_size, shuffle=True, drop_last=False)

In [19]:
# Build the parallel-branch model with its default hyper-parameters on the GPU
# and an Adam optimizer over all of its parameters.
parallel_TCNs = ParallelTCNs().cuda()
parallel_TCNs_optimizer = torch.optim.Adam(parallel_TCNs.parameters(),lr=0.001)

In [20]:
# Train for `epochs` epochs; the value returned by train_parallel is discarded.
for epoch in range(epochs):
    print("RUNNING EPOCH: {}".format(epoch+1))
    train_parallel(parallel_TCNs,training_dataloader , parallel_TCNs_optimizer )

    # Checkpoint the whole model object after every epoch. The file name uses
    # the zero-based epoch index, while the log line above is one-based.
    torch.save(parallel_TCNs, "./parallel_model_after_epoch_{}".format(epoch))

RUNNING EPOCH: 1
    Batch 0: combined loss = 15.486039161682129 , average loss: 3.8715097904205322
    Batch 10: combined loss = 13.135170936584473 , average loss: 3.283792734146118
    Batch 20: combined loss = 13.925765991210938 , average loss: 3.4814414978027344
    Batch 30: combined loss = 11.123424530029297 , average loss: 2.780856132507324
    Batch 40: combined loss = 9.680692672729492 , average loss: 2.420173168182373
    Batch 50: combined loss = 10.125143051147461 , average loss: 2.5312857627868652
    Batch 60: combined loss = 9.382369041442871 , average loss: 2.3455922603607178
    Batch 70: combined loss = 9.931092262268066 , average loss: 2.4827730655670166
    Batch 80: combined loss = 10.402716636657715 , average loss: 2.6006791591644287
    Batch 90: combined loss = 13.472755432128906 , average loss: 3.3681888580322266
    Batch 100: combined loss = 7.836481094360352 , average loss: 1.959120273590088
    Batch 110: combined loss = 9.887517929077148 , average loss: 2.

In [21]:
# Persist the final trained model object to disk.
torch.save(parallel_TCNs, "./parallel_tcn")

# Evaluation



In [23]:
# Test split, one video per batch and in file order: get_results/eval_ reshape
# the outputs assuming a batch size of 1.
test_dataset = TCNDataset(training=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset,collate_fn=collate_fn_padd,  batch_size=1, shuffle=False, drop_last=False)

In [32]:
# Print frame accuracy, edit score and F1@{0.10, 0.25, 0.50} on the test split.
eval_(parallel_TCNs,test_dataloader)

Acc: 53.2346
Edit: 5.6930
F1@0.10: 4.2908
F1@0.25: 3.4816
F1@0.50: 2.0822
