**Core modules**

In [1]:
"""
CONFIG
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import yaml
from easydict import EasyDict as edict

config = edict()

config.WORKERS = 16
config.LOG_DIR = ''
config.MODEL_DIR = ''
config.RESULT_DIR = ''
config.DATA_DIR = ''
config.VERBOSE = False
config.TAG = ''

# CUDNN related params
config.CUDNN = edict()
config.CUDNN.BENCHMARK = True
config.CUDNN.DETERMINISTIC = False
config.CUDNN.ENABLED = True

# TAN related params
config.TAN = edict()
config.TAN.FRAME_MODULE = edict()
config.TAN.FRAME_MODULE.NAME = ''
config.TAN.FRAME_MODULE.PARAMS = None
config.TAN.PROP_MODULE = edict()
config.TAN.PROP_MODULE.NAME = ''
config.TAN.PROP_MODULE.PARAMS = None
config.TAN.FUSION_MODULE = edict()
config.TAN.FUSION_MODULE.NAME = ''
config.TAN.FUSION_MODULE.PARAMS = None
config.TAN.MAP_MODULE = edict()
config.TAN.MAP_MODULE.NAME = ''
config.TAN.MAP_MODULE.PARAMS = None
config.TAN.PRED_INPUT_SIZE = 512

# common params for NETWORK
config.MODEL = edict()
config.MODEL.NAME = ''
config.MODEL.CHECKPOINT = '' # The checkpoint for the best performance

# DATASET related params
config.DATASET = edict()
config.DATASET.ROOT = ''
config.DATASET.NAME = ''
config.DATASET.MODALITY = ''
config.DATASET.VIS_INPUT_TYPE = ''
config.DATASET.NO_VAL = False
config.DATASET.BIAS = 0
config.DATASET.NUM_SAMPLE_CLIPS = 256
config.DATASET.TARGET_STRIDE = 16
config.DATASET.DOWNSAMPLING_STRIDE = 16
config.DATASET.SPLIT = ''
config.DATASET.NORMALIZE = False
config.DATASET.RANDOM_SAMPLING = False

# train
config.TRAIN = edict()
config.TRAIN.LR = 0.001
config.TRAIN.WEIGHT_DECAY = 0
config.TRAIN.FACTOR = 0.8
config.TRAIN.PATIENCE = 20
config.TRAIN.MAX_EPOCH = 20
config.TRAIN.BATCH_SIZE = 4
config.TRAIN.SHUFFLE = True
config.TRAIN.CONTINUE = False

config.LOSS = edict()
config.LOSS.NAME = 'bce_loss'
config.LOSS.PARAMS = None

# test
config.TEST = edict()
config.TEST.RECALL = []
config.TEST.TIOU = []
config.TEST.NMS_THRESH = 0.4
config.TEST.INTERVAL = 1
config.TEST.EVAL_TRAIN = False
config.TEST.BATCH_SIZE = 1
config.TEST.TOP_K = 10

def _update_dict(cfg, value):
    for k, v in value.items():
        if k in cfg:
            if k == 'PARAMS':
                cfg[k] = v
            elif isinstance(v, dict):
                _update_dict(cfg[k],v)
            else:
                cfg[k] = v
        else:
            raise ValueError("{} not exist in config.py".format(k))

def update_config(config_file):
    with open(config_file) as f:
        exp_config = edict(yaml.load(f, Loader=yaml.FullLoader))
        for k, v in exp_config.items():
            if k in config:
                if isinstance(v, dict):
                    _update_dict(config[k], v)
                else:
                    config[k] = v
            else:
                raise ValueError("{} not exist in config.py".format(k))


In [2]:
"""
Eval: only things needed
"""
import json
import argparse
import numpy as np
#from terminaltables import AsciiTable #!pip install terminaltables
'''
from core.config import config, update_config
'''

def iou(pred, gt): # require pred and gt is numpy
    assert isinstance(pred, list) and isinstance(gt,list)
    pred_is_list = isinstance(pred[0],list)
    gt_is_list = isinstance(gt[0],list)
    if not pred_is_list: pred = [pred]
    if not gt_is_list: gt = [gt]
    pred, gt = np.array(pred), np.array(gt)
    inter_left = np.maximum(pred[:,0,None], gt[None,:,0])
    inter_right = np.minimum(pred[:,1,None], gt[None,:,1])
    inter = np.maximum(0.0, inter_right - inter_left)
    union_left = np.minimum(pred[:,0,None], gt[None,:,0])
    union_right = np.maximum(pred[:,1,None], gt[None,:,1])
    union = np.maximum(0.0, union_right - union_left)
    overlap = 1.0 * inter / union
    if not gt_is_list:
        overlap = overlap[:,0]
    if not pred_is_list:
        overlap = overlap[0]
    return overlap


def rank(pred, gt):
    return pred.index(gt) + 1

def nms(dets, thresh=0.4, top_k=-1):
    """Pure Python NMS baseline."""
    if len(dets) == 0: return []
    order = np.arange(0,len(dets),1)
    dets = np.array(dets)
    x1 = dets[:, 0]
    x2 = dets[:, 1]
    lengths = x2 - x1
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        if len(keep) == top_k:
            break
        xx1 = np.maximum(x1[i], x1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1)
        ovr = inter / (lengths[i] + lengths[order[1:]] - inter)
        inds = np.where(ovr <= thresh)[0]
        order = order[inds + 1]

    return dets[keep]

def eval(segments, data):
    tious = [float(i) for i in config.TEST.TIOU.split(',')] if isinstance(config.TEST.TIOU,str) else [config.TEST.TIOU]
    recalls = [int(i) for i in config.TEST.RECALL.split(',')] if isinstance(config.TEST.RECALL,str) else [config.TEST.RECALL]

    eval_result = [[[] for _ in recalls] for _ in tious]
    max_recall = max(recalls)
    average_iou = []
    for seg, dat in zip(segments, data):
        seg = nms(seg, thresh=config.TEST.NMS_THRESH, top_k=max_recall).tolist()
        overlap = iou(seg, [dat['times']])
        average_iou.append(np.mean(np.sort(overlap[0])[-3:]))

        for i,t in enumerate(tious):
            for j,r in enumerate(recalls):
                eval_result[i][j].append((overlap > t)[:r].any())
    eval_result = np.array(eval_result).mean(axis=-1)
    miou = np.mean(average_iou)


    return eval_result, miou

def eval_predictions(segments, data, verbose=True):
    eval_result, miou = eval(segments, data)
    if verbose:
        print(display_results(eval_result, miou, ''))

    return eval_result, miou

def display_results(eval_result, miou, title=None):
    tious = [float(i) for i in config.TEST.TIOU.split(',')] if isinstance(config.TEST.TIOU,str) else [config.TEST.TIOU]
    recalls = [int(i) for i in config.TEST.RECALL.split(',')] if isinstance(config.TEST.RECALL,str) else [config.TEST.RECALL]

    display_data = [['Rank@{},mIoU@{}'.format(i,j) for i in recalls for j in tious]+['mIoU']]
    eval_result = eval_result*100
    miou = miou*100
    display_data.append(['{:.02f}'.format(eval_result[j][i]) for i in range(len(recalls)) for j in range(len(tious))]
                        +['{:.02f}'.format(miou)])
    print(display_data)
    #table = AsciiTable(display_data, title)
    #for i in range(len(tious)*len(recalls)):
    #    table.justify_columns[i] = 'center'
    return table.table


def parse_args():
    parser = argparse.ArgumentParser(description='Train localization network')

    # general
    parser.add_argument('--cfg', help='experiment configure file name', required=True, type=str)
    args, rest = parser.parse_known_args()

    # update config
    update_config(args.cfg)

    parser.add_argument('--verbose', default=False, action="store_true", help='print progress bar')
    args = parser.parse_args()

    return args

def reset_config(config, args):
    if args.verbose:
        config.VERBOSE = args.verbose

if __name__ == '__main__':
    args = parse_args()
    reset_config(config, args)
    train_data = json.load(open('/data/home2/hacker01/Data/DiDeMo/train_data.json', 'r'))
    val_data = json.load(open('/data/home2/hacker01/Data/DiDeMo/val_data.json', 'r'))

    moment_frequency_dict = {}
    for d in train_data:
        times = [t for t in d['times']]
        for time in times:
            time = tuple(time)
            if time not in moment_frequency_dict.keys():
                moment_frequency_dict[time] = 0
            moment_frequency_dict[time] += 1

    prior = sorted(moment_frequency_dict, key=moment_frequency_dict.get, reverse=True)
    prior = [list(item) for item in prior]
    prediction = [prior for d in val_data]

    eval_predictions(prediction, val_data)

usage: ipykernel_launcher.py [-h] --cfg CFG
ipykernel_launcher.py: error: the following arguments are required: --cfg


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [3]:
"""
Engine class
"""
class Engine(object):
    def __init__(self):
        self.hooks = {}

    def hook(self, name, state):

        if name in self.hooks:
            self.hooks[name](state)

    def train(self, network, iterator, maxepoch, optimizer, scheduler):
        state = {
            'network': network,
            'iterator': iterator,
            'maxepoch': maxepoch,
            'optimizer': optimizer,
            'scheduler': scheduler,
            'epoch': 0,
            't': 0,
            'train': True,
        }

        self.hook('on_start', state)
        while state['epoch'] < state['maxepoch']:
            print(f"EPOCH {state['epoch']}/{state['maxepoch']}")
            self.hook('on_start_epoch', state)
            for sample in state['iterator']:
                state['sample'] = sample
                self.hook('on_sample', state)

                def closure():
                    loss, output = state['network'](state['sample'])
                    state['output'] = output
                    state['loss'] = loss
                    loss.backward()
                    self.hook('on_forward', state)
                    # to free memory in save_for_backward
                    state['output'] = None
                    state['loss'] = None
                    return loss

                state['optimizer'].zero_grad()
                state['optimizer'].step(closure)
                self.hook('on_update', state)
                state['t'] += 1
            state['epoch'] += 1
            self.hook('on_end_epoch', state)
        self.hook('on_end', state)
        return state

    def test(self, network, iterator, split):
        state = {
            'network': network,
            'iterator': iterator,
            'split': split,
            't': 0,
            'train': False,
        }

        self.hook('on_test_start', state)
        for sample in state['iterator']:
            state['sample'] = sample
            self.hook('on_test_sample', state)

            def closure():
                loss, output = state['network'](state['sample'])
                state['output'] = output
                state['loss'] = loss
                self.hook('on_test_forward', state)
                # to free memory in save_for_backward
                state['output'] = None
                state['loss'] = None

            closure()
            state['t'] += 1
        self.hook('on_test_end', state)
        return state

In [4]:
"""
utils modules
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import logging
import time
from pathlib import Path

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def create_logger(cfg, cfg_name, tag='train'):
    root_log_dir = Path(cfg.LOG_DIR)
    # set up logger
    if not root_log_dir.exists():
        print('=> creating {}'.format(root_log_dir))
        root_log_dir.mkdir()

    dataset = cfg.DATASET.NAME
    cfg_name = os.path.basename(cfg_name).split('.yaml')[0]

    final_log_dir = root_log_dir / dataset / cfg_name

    print('=> creating {}'.format(final_log_dir))
    final_log_dir.mkdir(parents=True, exist_ok=True)

    time_str = time.strftime('%Y-%m-%d-%H-%M')
    log_file = '{}_{}_{}.log'.format(cfg_name, time_str, tag)
    final_log_file = final_log_dir / log_file
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(filename=str(final_log_file), format=head)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    console = logging.StreamHandler()
    logging.getLogger('').addHandler(console)

    return logger, str(final_log_dir)

**Dataset modules**

In [5]:
"""
Path to dataset files
"""
class Path(object):
    ## Dataset files
    @staticmethod
    def annotations_file():
        ## [video name] [start time] [end time]##[sentence]
        return ('/projectnb/cs591-mm-ml/KuJu/dataset_charades_sta/charades_sta_train.txt',
                '/projectnb/cs591-mm-ml/KuJu/dataset_charades_sta/charades_sta_test.txt')
    @staticmethod
    def infos_file():
        ## id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length
        return ('/projectnb/cs591-mm-ml/KuJu/dataset_charades_sta/charades_sta_infos_train.csv',
                '/projectnb/cs591-mm-ml/KuJu/dataset_charades_sta/charades_sta_infos_test.csv')
    @staticmethod
    def video_folder():
        return '/projectnb/cs591-mm-ml/KuJu/dataset_charades_sta/charades_480/'
    @staticmethod
    def c3d_model_file():
        return 'c3d.pickle'
    @staticmethod
    def embedding_file():
        return '/projectnb/cs591-mm-ml/KuJu/word_embedding/glove.6B.300d.txt'
    
    ## CONFIG init
    @staticmethod
    def config_init_1():
        return '2D-TAN-16x16-K5L8-conv.yaml'
    @staticmethod
    def config_init_2():
        return '2D-TAN-16x16-K5L8-pool.yaml'
    
    
    ## Generated
    @staticmethod
    def word2id_file():
        return '/projectnb/cs591-mm-ml/KuJu/dataset_charades_sta/word2id.json'
    @staticmethod
    def video_features_folder():
        return '/projectnb/cs591-mm-ml/KuJu/dataset_charades_sta/video_features/'

In [6]:
"""
Init Dataset modules
"""
import torch
import torch.nn as nn
#from core.config import config

def collate_fn(batch):
    batch_word_vectors = [b['word_vectors'] for b in batch]
    batch_txt_mask = [b['txt_mask'] for b in batch]
    batch_map_gt = [b['map_gt'] for b in batch]
    batch_anno_idxs = [b['anno_idx'] for b in batch]
    batch_vis_feats = [b['visual_input'] for b in batch]
    batch_duration = [b['duration'] for b in batch]

    max_num_clips = max([map_gt.shape[-1] for map_gt in batch_map_gt])
    padded_batch_map_gt = torch.zeros(len(batch_map_gt), 1, max_num_clips, max_num_clips)
    for i, map_gt in enumerate(batch_map_gt):
        num_clips = map_gt.shape[-1]
        padded_batch_map_gt[i][0,:num_clips,:num_clips] = map_gt

    batch_data = {
        'batch_anno_idxs': batch_anno_idxs,
        'batch_word_vectors': nn.utils.rnn.pad_sequence(batch_word_vectors, batch_first=True),
        'batch_txt_mask': nn.utils.rnn.pad_sequence(batch_txt_mask, batch_first=True),
        'batch_map_gt': padded_batch_map_gt,
        'batch_vis_input': nn.utils.rnn.pad_sequence(batch_vis_feats, batch_first=True).float(),
        'batch_duration': batch_duration,
    }

    return batch_data

def average_to_fixed_length(visual_input):
    num_sample_clips = config.DATASET.NUM_SAMPLE_CLIPS
    num_clips = visual_input.shape[0]
    idxs = torch.arange(0, num_sample_clips+1, 1.0)/num_sample_clips*num_clips
    idxs = torch.min(torch.round(idxs).long(),torch.tensor(num_clips-1))
    new_visual_input = []
    for i in range(num_sample_clips):
        s_idx, e_idx = idxs[i].item(), idxs[i+1].item()
        if s_idx < e_idx:
            new_visual_input.append(torch.mean(visual_input[s_idx:e_idx],dim=0))
        else:
            new_visual_input.append(visual_input[s_idx])
    new_visual_input = torch.stack(new_visual_input, dim=0)
    return new_visual_input

In [7]:
"""
The C3D network.
"""
import torch
import torch.nn as nn

class C3D(nn.Module):
    def __init__(self):
        super(C3D, self).__init__()

        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, 487)

        self.dropout = nn.Dropout(p=0.5)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

    def forward(self, x):
        '''
        forward pass without the lasts layers.
        '''
        h = self.relu(self.conv1(x))
        h = self.pool1(h)

        h = self.relu(self.conv2(h))
        h = self.pool2(h)

        h = self.relu(self.conv3a(h))
        h = self.relu(self.conv3b(h))
        h = self.pool3(h)

        h = self.relu(self.conv4a(h))
        h = self.relu(self.conv4b(h))
        h = self.pool4(h)

        h = self.relu(self.conv5a(h))
        h = self.relu(self.conv5b(h))
        h = self.pool5(h)

        h = h.view(-1, 8192)
        h = self.fc6(h)
        h = self.relu(h)

        # h = self.dropout(h)
        # h = self.relu(self.fc7(h))
        # h = self.dropout(h)

        # logits = self.fc8(h)
        # probs = self.softmax(logits)

        # return probs
        return h.detach()

In [8]:
""" 
Dataset loader for the Charades-STA dataset 
"""
import os
import csv
import h5py
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data as data
from torchtext import vocab

import cv2
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
import torchvision
from torchvision import transforms
import torchvision.models as models
from PIL import Image
import matplotlib.pyplot as plt

## Custom module dependencies
'''from . import average_to_fixed_length # "from ." means from init
from core.eval import iou
from core.config import config
from path import Path as PATH
from C3D_model import C3D'''
PATH = Path()

class Charades(data.Dataset):

    ## Embedding from torchtext lib
    vocab = vocab.pretrained_aliases["glove.6B.300d"]()
    vocab.itos.extend(['<unk>'])
    vocab.stoi['<unk>'] = vocab.vectors.shape[0]
    vocab.vectors = torch.cat([vocab.vectors, torch.zeros(1, vocab.dim)], dim=0)
    word_embedding = nn.Embedding.from_pretrained(vocab.vectors)
    ## Embedding from file
    '''with open(PATH.embedding_file(),'r') as f:
        vocab = f.readlines()
    vocab = {line.split()[0]:np.asarray(line.split()[1:], "float32") for line in vocab}
    vocab['<unk>'] = np.zeros([1, 300], dtype = "float32")
    word_embedding = nn.Embedding.from_pretrained(torch.Tensor(list(glove.values())))'''

    def __init__(self, split):
        super(Charades, self).__init__()

        ## Init vars
        #self.vis_input_type = config.DATASET.VIS_INPUT_TYPE
        #self.data_dir = config.DATA_DIR
        self.split = split

        ## Get duration
        self.durations = {}
        with open(PATH.infos_file()[self.split]) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.durations[row['id']] = float(row['length'])

        ## Get annotations and infos
        anno_file = open(PATH.annotations_file()[self.split],'r')
        annotations = []
        for line in anno_file:
            anno, sent = line.split("##")
            sent = sent.split('.\n')[0]
            vid, s_time, e_time = anno.split(" ")
            s_time = float(s_time)
            e_time = min(float(e_time), self.durations[vid])
            if s_time < e_time:
                annotations.append({'video':vid, 'times':[s_time, e_time], 'description': sent, 'duration': self.durations[vid]})
        anno_file.close()
        self.annotations = annotations

    def __getitem__(self, index):
        ## Init Vars
        video_id = self.annotations[index]['video']
        gt_s_time, gt_e_time = self.annotations[index]['times']
        description = self.annotations[index]['description']
        duration = self.annotations[index]['duration'] 
        
        ## Get Word Features
        word_idxs = torch.tensor([self.vocab.stoi.get(w.lower(), 400000) for w in description.split()], dtype=torch.long)
        #word_idxs = torch.tensor([list(self.vocab.keys()).index(w.lower()) if w in list(self.vocab.keys()) else list(self.vocab.keys()).index('<unk>') for w in description.split()], dtype=torch.long)
        word_vectors = self.word_embedding(word_idxs)

        ## Get Video Features
        visual_input, visual_mask = self.get_video_features(video_id)
        
        # Time scaled to same size
        if config.DATASET.NUM_SAMPLE_CLIPS > 0:
            # visual_input = sample_to_fixed_length(visual_input, random_sampling=True)
            visual_input = average_to_fixed_length(visual_input)
            num_clips = config.DATASET.NUM_SAMPLE_CLIPS//config.DATASET.TARGET_STRIDE
            s_times = torch.arange(0,num_clips).float()*duration/num_clips
            e_times = torch.arange(1,num_clips+1).float()*duration/num_clips
            overlaps = iou(torch.stack([s_times[:,None].expand(-1,num_clips),
                                        e_times[None,:].expand(num_clips,-1)],dim=2).view(-1,2).tolist(),
                           torch.tensor([gt_s_time, gt_e_time]).tolist()).reshape(num_clips,num_clips)

        # Time unscaled NEED FIXED WINDOW SIZE
        else:
            num_clips = visual_input.shape[0]//config.DATASET.TARGET_STRIDE
            raise NotImplementedError

        item = {
            'visual_input': visual_input,
            'vis_mask': visual_mask,
            'anno_idx': index,
            'word_vectors': word_vectors,
            'duration': duration,
            'txt_mask': torch.ones(word_vectors.shape[0], 1),
            'map_gt': torch.from_numpy(overlaps),
        }

        return item

    
    def __len__(self):
        return len(self.annotations)

    def get_video_features(self, vid):
        #assert config.DATASET.VIS_INPUT_TYPE == 'c3d'
        #with h5py.File(os.path.join(self.data_dir, 'sub_activitynet_v1-3.c3d.hdf5'), 'r') as f:
        #    features = torch.from_numpy(f[vid]['c3d_features'][:])
        if os.path.exists(PATH.video_features_folder() + vid + '.npy'):
            ## If features file exist
            #print(f'Feature file for {vid} exists')
            features = np.load(PATH.video_features_folder() + vid + '.npy')
            features = torch.from_numpy(features)
        elif os.path.exists(PATH.video_folder() + vid + '.mp4'):
            ## If not compute at the moment from .mp4 video
            #print(f'Extract features from raw video for {vid}')
            features = self.extract_from_raw_video(PATH.video_folder() + vid + '.mp4', vid)
            features = torch.from_numpy(features)
        
        #if config.DATASET.NORMALIZE:
        #    features = F.normalize(features,dim=1)
            
        vis_mask = torch.ones((features.shape[0], 1))
        return features, vis_mask
    

    def extract_from_raw_video(self, input_path, vid):
        '''
        3D CNN expects input like [batch, channels, clip_frames, H, W]
        From cv2 I obtain [frames, channels, H, W]
        Transform it to [clip, clip_frames, channels, H, W]
        By consequence I consider a batch as a number of clips of frames (batch-->clips-->frames)
        '''
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        transform = transforms.Compose([transforms.Resize(224), transforms.CenterCrop(224), transforms.ToTensor(), normalize])
        #transform = transforms.Compose([transforms.Resize(224), transforms.CenterCrop(224), transforms.ToTensor()])
    
        ## Extract frames images from video
        cap= cv2.VideoCapture(input_path)
        frame_list = []
        while(cap.isOpened()):
            ret, frame = cap.read()
            if ret == False:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            #frame.save('test'+str(i)+'.png')
            frame = transform(frame)
            frame = Variable(frame) #frame.unsqueeze(0).cuda())
            frame_list.append(frame)
        cap.release()
        
        ## Clip extracted frames list
        clip_size = 16
        clip_stride = clip_size #int(clip_size/2) #will trow away last clip_stride frames to have every clip of same lenght
        frame_clip_list = [torch.stack(frame_list[x:x+clip_size]) for x in range(0, len(frame_list)-clip_size, clip_stride)]
        frame_clip_list = torch.stack(frame_clip_list)
        #print(f'All Clips shape: {frame_clip_list.shape}') #torch.Size([62, 16, 3, 224, 224]) [Batches_clips, clips, chan, H, W]
        
        ## Create batches: I do not use Dataset custom class because only for feature extraction
        dataloader = DataLoader(frame_clip_list, batch_size = 16, shuffle = False)
        
        ## Visualize batch
        '''def imshow(img):
            #img = img/2 + 0.5 #unnormalize
            print(f'Frames plot shape: {img.shape}')
            npimg = img.cpu().numpy()
            npimg = np.transpose(npimg, (1, 2, 0))
            plt.imshow(npimg)
        dataiter = iter(dataloader)
        clip = dataiter.next() #[clip, depth, in_channels, height, width]
        images = clip[0] #[depth, in_channels, height, width]
        print(f'One Batch shape: {clip.shape}')
        print(f'One Clip shape: {images.shape}')
        imshow(torchvision.utils.make_grid(images))'''
        
        ## Init model
        if config.CUDNN.ENABLED:
            model = C3D().cuda()
            model.load_state_dict(torch.load(PATH.c3d_model_file()))
            torch.backends.cudnn.benchmark = True
            model = torch.nn.DataParallel(model)
        else:
            raise NotImplementedError
            
        ## Extract features for each clip of frames
        vis_embed = []
        for sample in dataloader: # input: [batch, clip_frames, in_channels, height, width]
            with torch.no_grad():
                sample = np.transpose(sample, (0, 2, 1, 3, 4)) # swap clip_frames and channels
                sample = sample.cuda()
                feat = model(sample) #input expected: [batch, in_channels, clip_frames, height, width]
                feat = feat.cpu().numpy()
            vis_embed.append(feat)
        vis_embed = np.vstack(vis_embed)
        
        ## Save features video in file
        output_file_path = PATH.video_features_folder()+vid+'.npy'
        np.save(output_file_path, vis_embed)
        return vis_embed

In [9]:
print(torch.__version__)

1.8.1+cu102


In [10]:
train_dataset = Charades(0)

In [11]:
print(train_dataset)
print(len(train_dataset.annotations))

<__main__.Charades object at 0x2b76c00d7290>
12404


In [12]:
# get an iterator using iter()
my_iter = iter(train_dataset)

# iterate through it using next()
while True:
    try:
        el = next(my_iter)
    except StopIteration:
        break

Extract features from raw video for AO8RW
Extract features from raw video for Y6R7T
Feature file for Y6R7T exists
Feature file for Y6R7T exists
Feature file for Y6R7T exists
Feature file for Y6R7T exists
Extract features from raw video for IGDIE
Extract features from raw video for 7UPGT
Extract features from raw video for KBPDM
Feature file for KBPDM exists
Feature file for KBPDM exists
Extract features from raw video for IBIWF
Extract features from raw video for 372CC
Feature file for 372CC exists
Extract features from raw video for N14BK
Extract features from raw video for NQT1S
Extract features from raw video for F1VEE
Feature file for F1VEE exists
Feature file for F1VEE exists
Feature file for F1VEE exists
Feature file for F1VEE exists
Extract features from raw video for YDWN5
Extract features from raw video for QRWQ3
Feature file for QRWQ3 exists
Extract features from raw video for SFHHR
Extract features from raw video for HWYTN
Feature file for HWYTN exists
Feature file for HWYTN

Feature file for ZHOP3 exists
Extract features from raw video for 1G9R7
Extract features from raw video for HPAYB
Extract features from raw video for SKLEN
Feature file for SKLEN exists
Feature file for SKLEN exists
Extract features from raw video for L347E
Extract features from raw video for B5YYS
Extract features from raw video for 2074D
Feature file for 2074D exists
Extract features from raw video for 9PAQ4
Feature file for 9PAQ4 exists
Feature file for 9PAQ4 exists
Extract features from raw video for 6AJX0
Feature file for 6AJX0 exists
Feature file for 6AJX0 exists
Extract features from raw video for ACMHK
Extract features from raw video for TQGPM
Extract features from raw video for AK9IB
Extract features from raw video for UQW95
Feature file for UQW95 exists
Feature file for UQW95 exists
Feature file for UQW95 exists
Feature file for UQW95 exists
Extract features from raw video for BTW1H
Feature file for BTW1H exists
Extract features from raw video for OE751
Feature file for OE751

Feature file for WF3NY exists
Extract features from raw video for NC1OC
Feature file for NC1OC exists
Feature file for NC1OC exists
Feature file for NC1OC exists
Extract features from raw video for S1N2U
Feature file for S1N2U exists
Feature file for S1N2U exists
Feature file for S1N2U exists
Extract features from raw video for G3CDA
Extract features from raw video for 00ZCA
Feature file for 00ZCA exists
Feature file for 00ZCA exists
Feature file for 00ZCA exists
Extract features from raw video for R6QCQ
Feature file for R6QCQ exists
Extract features from raw video for YV521
Feature file for YV521 exists
Feature file for YV521 exists
Feature file for YV521 exists
Feature file for YV521 exists
Feature file for YV521 exists
Extract features from raw video for 67EEN
Extract features from raw video for ANJNU
Feature file for ANJNU exists
Feature file for ANJNU exists
Feature file for ANJNU exists
Extract features from raw video for J12SC
Feature file for J12SC exists
Feature file for J12SC

Feature file for 41CZS exists
Feature file for 41CZS exists
Extract features from raw video for V34GF
Feature file for V34GF exists
Extract features from raw video for 01KML
Extract features from raw video for FERH5
Feature file for FERH5 exists
Feature file for FERH5 exists
Feature file for FERH5 exists
Extract features from raw video for NOJK1
Feature file for NOJK1 exists
Feature file for NOJK1 exists
Feature file for NOJK1 exists
Extract features from raw video for V3RAX
Feature file for V3RAX exists
Feature file for V3RAX exists
Feature file for V3RAX exists
Extract features from raw video for QNKDI
Feature file for QNKDI exists
Extract features from raw video for GIC6A
Feature file for GIC6A exists
Feature file for GIC6A exists
Extract features from raw video for NCDE3
Feature file for NCDE3 exists
Feature file for NCDE3 exists
Extract features from raw video for UB3KO
Feature file for UB3KO exists
Feature file for UB3KO exists
Extract features from raw video for FIV2P
Extract fe

Extract features from raw video for 6JKD6
Feature file for 6JKD6 exists
Feature file for 6JKD6 exists
Feature file for 6JKD6 exists
Extract features from raw video for Q4TKG
Feature file for Q4TKG exists
Extract features from raw video for DJG7A
Extract features from raw video for Q5ZIL
Feature file for Q5ZIL exists
Feature file for Q5ZIL exists
Extract features from raw video for 8VSV6
Extract features from raw video for EHYXP
Extract features from raw video for C9VPX
Feature file for C9VPX exists
Extract features from raw video for 5AR9B
Feature file for 5AR9B exists
Extract features from raw video for F082Z
Extract features from raw video for 1JGRO
Extract features from raw video for 81YUE
Feature file for 81YUE exists
Feature file for 81YUE exists
Extract features from raw video for WE2PF
Feature file for WE2PF exists
Extract features from raw video for EG1XK
Feature file for EG1XK exists
Feature file for EG1XK exists
Feature file for EG1XK exists
Extract features from raw video fo

Feature file for K21RO exists
Feature file for K21RO exists
Extract features from raw video for GRJG1
Extract features from raw video for CCCUJ
Feature file for CCCUJ exists
Feature file for CCCUJ exists
Feature file for CCCUJ exists
Extract features from raw video for 29C6X
Extract features from raw video for LKSBL
Feature file for LKSBL exists
Feature file for LKSBL exists
Feature file for LKSBL exists
Extract features from raw video for 3GY40
Feature file for 3GY40 exists
Feature file for 3GY40 exists
Feature file for 3GY40 exists
Extract features from raw video for UR7C8
Feature file for UR7C8 exists
Feature file for UR7C8 exists
Extract features from raw video for 8Y4YD
Extract features from raw video for 1TWH6
Feature file for 1TWH6 exists
Feature file for 1TWH6 exists
Feature file for 1TWH6 exists
Feature file for 1TWH6 exists
Extract features from raw video for 28D7L
Feature file for 28D7L exists
Extract features from raw video for 4X2JC
Feature file for 4X2JC exists
Extract fe

Feature file for TVS1P exists
Extract features from raw video for VXRCZ
Feature file for VXRCZ exists
Feature file for VXRCZ exists
Extract features from raw video for JUGS8
Feature file for JUGS8 exists
Feature file for JUGS8 exists
Extract features from raw video for L62J5
Extract features from raw video for PHIIX
Extract features from raw video for ETAFB
Extract features from raw video for 5AHQV
Feature file for 5AHQV exists
Feature file for 5AHQV exists
Feature file for 5AHQV exists
Extract features from raw video for OFNQV
Extract features from raw video for DGSBQ
Extract features from raw video for TYKVA
Feature file for TYKVA exists
Extract features from raw video for PZQIN
Feature file for PZQIN exists
Extract features from raw video for E7DKD
Extract features from raw video for B53VP
Feature file for B53VP exists
Feature file for B53VP exists
Extract features from raw video for H8QM1
Extract features from raw video for HAR5P
Feature file for HAR5P exists
Feature file for HAR5P

KeyboardInterrupt: 

**Model modules**

In [9]:
"""
Module loss
"""
import torch
import torch.nn.functional as F

def bce_rescale_loss(scores, masks, targets, cfg):
    min_iou, max_iou, bias = cfg.MIN_IOU, cfg.MAX_IOU, cfg.BIAS
    joint_prob = torch.sigmoid(scores) * masks
    target_prob = (targets-min_iou)*(1-bias)/(max_iou-min_iou)
    target_prob[target_prob > 0] += bias
    target_prob[target_prob > 1] = 1
    target_prob[target_prob < 0] = 0
    loss = F.binary_cross_entropy(joint_prob, target_prob, reduction='none') * masks
    loss_value = torch.sum(loss) / torch.sum(masks)
    return loss_value, joint_prob

In [10]:
"""
Module dependences for module TAN
"""
from torch import nn
'''from core.config import config
import models.frame_modules as frame_modules
import models.prop_modules as prop_modules
import models.map_modules as map_modules
import models.fusion_modules as fusion_modules
'''
#*********************************************************** FRAME POOL
class FrameAvgPool(nn.Module):

    def __init__(self, cfg):
        super(FrameAvgPool, self).__init__()
        input_size = cfg.INPUT_SIZE
        hidden_size = cfg.HIDDEN_SIZE
        kernel_size = cfg.KERNEL_SIZE
        stride = cfg.STRIDE
        self.vis_conv = nn.Conv1d(input_size, hidden_size, 1, 1)
        self.avg_pool = nn.AvgPool1d(kernel_size, stride)

    def forward(self, visual_input):
        vis_h = torch.relu(self.vis_conv(visual_input))
        vis_h = self.avg_pool(vis_h)
        return vis_h

class FrameMaxPool(nn.Module):

    def __init__(self, input_size, hidden_size, stride):
        super(FrameMaxPool, self).__init__()
        self.vis_conv = nn.Conv1d(input_size, hidden_size, 1, 1)
        self.max_pool = nn.MaxPool1d(stride)

    def forward(self, visual_input):
        vis_h = torch.relu(self.vis_conv(visual_input))
        vis_h = self.max_pool(vis_h)
        return vis_h
#*********************************************************** BASE FUSION
class BaseFusion(nn.Module):

    def __init__(self, cfg):
        super(BaseFusion, self).__init__()
        self.cfg = cfg
        hidden_size = cfg.HIDDEN_SIZE
        txt_input_size = cfg.TXT_INPUT_SIZE
        txt_hidden_size = cfg.TXT_HIDDEN_SIZE
        self.textual_encoder = nn.LSTM(txt_input_size, txt_hidden_size//2 if cfg.LSTM.BIDIRECTIONAL else txt_hidden_size,
                                       num_layers=cfg.LSTM.NUM_LAYERS, bidirectional=cfg.LSTM.BIDIRECTIONAL, batch_first=True)
        self.tex_linear = nn.Linear(txt_hidden_size, hidden_size)
        self.vis_conv = nn.Conv2d(hidden_size, hidden_size, 1, 1)

    def forward(self, textual_input, textual_mask, map_h, map_mask):
        self.textual_encoder.flatten_parameters()
        txt_h = self.textual_encoder(textual_input)[0] * textual_mask
        txt_h = torch.stack([txt_h[i][torch.sum(mask).long() - 1] for i, mask in enumerate(textual_mask)])
        txt_h = self.tex_linear(txt_h)[:,:,None,None]
        map_h = self.vis_conv(map_h)
        fused_h = F.normalize(txt_h * map_h) * map_mask
        return fused_h

#*********************************************************** PROP MOD
class PropMaxPool(nn.Module):
    def __init__(self, cfg):
        super(PropMaxPool, self).__init__()
        num_layers = cfg.NUM_LAYERS
        self.layers = nn.ModuleList(
            [nn.Identity()]
            +[nn.MaxPool1d(2, stride=1) for _ in range(num_layers-1)]
        )
        self.num_layers = num_layers

    def forward(self, x):
        batch_size, hidden_size, num_clips = x.shape
        map_h = x.new_zeros(batch_size, hidden_size, num_clips, num_clips).cuda()
        map_mask = x.new_zeros(batch_size, 1, num_clips, num_clips).cuda()

        for dig_idx, pool in enumerate(self.layers):
            x = pool(x)
            start_idxs = [s_idx for s_idx in range(0, num_clips - dig_idx, 1)]
            end_idxs = [s_idx + dig_idx for s_idx in start_idxs]
            map_h[:, :, start_idxs, end_idxs] = x
            map_mask[:, :, start_idxs, end_idxs] += 1

        return map_h, map_mask
class SparsePropMaxPool(nn.Module):
    def __init__(self, cfg):
        super(SparsePropMaxPool, self).__init__()
        self.num_scale_layers = cfg.NUM_SCALE_LAYERS

        self.layers = nn.ModuleList()

        for scale_idx, num_layer in enumerate(self.num_scale_layers):
            scale_layers = nn.ModuleList()
            first_layer = nn.MaxPool1d(1,1) if scale_idx == 0 else nn.MaxPool1d(3,2)
            rest_layers = [nn.MaxPool1d(2,1) for _ in range(1, num_layer)]
            scale_layers.extend([first_layer]+rest_layers)
            self.layers.append(scale_layers)

    def forward(self, x):
        map_h_list = []
        map_mask_list = []

        for scale_idx, scale_layers in enumerate(self.layers):
            batch_size, hidden_size, num_scale_clips = x.shape
            num_scale_clips = num_scale_clips//scale_layers[0].stride
            map_h = x.new_zeros(batch_size, hidden_size, num_scale_clips, num_scale_clips)
            map_mask = x.new_zeros(batch_size, 1, num_scale_clips, num_scale_clips)
            for i, layer in enumerate(scale_layers):
                try:
                    x = layer(x)
                except:
                    pass
                scale_s_idxs = list(range(0, num_scale_clips - i, 1))
                scale_e_idxs = [s_idx + i for s_idx in scale_s_idxs]
                map_h[:,:,scale_s_idxs, scale_e_idxs] = x
                map_mask[:,:,scale_s_idxs, scale_e_idxs] = 1
            map_h_list.append(map_h)
            map_mask_list.append(map_mask)

        ori_map_h, ori_map_mask = self.recover_to_original_map(map_h_list, map_mask_list)
        return ori_map_h, ori_map_mask

    def recover_to_original_map(self, h_list, mask_list):
        # resize to original scale
        batch_size, hidden_size, ori_num_clips, _ = h_list[0].shape

        ori_map_h = h_list[0].new_zeros(batch_size, hidden_size, ori_num_clips, ori_num_clips)
        ori_map_mask = mask_list[0].new_zeros(batch_size, 1, ori_num_clips, ori_num_clips)
        acum_layers = 0
        stride = 1
        for scale_layers, h, mask in zip(self.layers, h_list, mask_list):
            num_scale_clips = h.shape[-1]
            for i, layer in enumerate(scale_layers):
                stride = stride * layer.stride
                scale_s_idxs = list(range(0,num_scale_clips-i,1))
                scale_e_idxs = [s_idx+i for s_idx in scale_s_idxs]
                ori_s_idxs = list(range(0,ori_num_clips-acum_layers-i*stride,stride))
                ori_e_idxs = [s_idx+acum_layers+i*stride for s_idx in ori_s_idxs]
                ori_map_h[:,:, ori_s_idxs, ori_e_idxs] = h[:,:, scale_s_idxs, scale_e_idxs]
                ori_map_mask[:,:, ori_s_idxs, ori_e_idxs] = 1

            acum_layers += stride * (len(scale_layers)+1)

        return ori_map_h, ori_map_mask

class SparsePropConv(nn.Module):
    def __init__(self, cfg):
        super(SparsePropConv, self).__init__()
        self.num_scale_layers = cfg.NUM_SCALE_LAYERS
        self.hidden_size = cfg.HIDDEN_SIZE

        self.layers = nn.ModuleList()

        for scale_idx, num_layer in enumerate(self.num_scale_layers):
            scale_layers = nn.ModuleList()
            first_layer = nn.Conv1d(self.hidden_size, self.hidden_size, 1,1) if scale_idx == 0 else nn.Conv1d(self.hidden_size, self.hidden_size, 3, 2)
            rest_layers = [nn.Conv1d(self.hidden_size, self.hidden_size, 2,1) for _ in range(1, num_layer)]
            scale_layers.extend([first_layer]+rest_layers)
            self.layers.append(scale_layers)

    def forward(self, x):
        map_h_list = []
        map_mask_list = []

        for scale_idx, scale_layers in enumerate(self.layers):
            batch_size, hidden_size, num_scale_clips = x.shape
            num_scale_clips = num_scale_clips//scale_layers[0].stride[0]
            map_h = x.new_zeros(batch_size, hidden_size, num_scale_clips, num_scale_clips)
            map_mask = x.new_zeros(batch_size, 1, num_scale_clips, num_scale_clips)
            for i, layer in enumerate(scale_layers):
                x = layer(x)
                scale_s_idxs = list(range(0, num_scale_clips - i, 1))
                scale_e_idxs = [s_idx + i for s_idx in scale_s_idxs]
                map_h[:,:,scale_s_idxs, scale_e_idxs] = x
                map_mask[:,:,scale_s_idxs, scale_e_idxs] = 1
            map_h_list.append(map_h)
            map_mask_list.append(map_mask)


        ori_map_h, ori_map_mask = self.recover_to_original_map(map_h_list, map_mask_list)

        return ori_map_h, ori_map_mask

    def recover_to_original_map(self, h_list, mask_list):
        # resize to original scale
        batch_size, hidden_size, ori_num_clips, _ = h_list[0].shape

        ori_map_h = h_list[0].new_zeros(batch_size, hidden_size, ori_num_clips, ori_num_clips)
        ori_map_mask = mask_list[0].new_zeros(batch_size, 1, ori_num_clips, ori_num_clips)
        acum_layers = 0
        stride = 1
        for scale_layers, h, mask in zip(self.layers, h_list, mask_list):
            num_scale_clips = h.shape[-1]
            for i, layer in enumerate(scale_layers):
                stride = stride * layer.stride[0]
                scale_s_idxs = list(range(0,num_scale_clips-i,1))
                scale_e_idxs = [s_idx+i for s_idx in scale_s_idxs]
                ori_s_idxs = list(range(0,ori_num_clips-acum_layers-i*stride,stride))
                ori_e_idxs = [s_idx+acum_layers+i*stride for s_idx in ori_s_idxs]
                ori_map_h[:,:, ori_s_idxs, ori_e_idxs] = h[:,:, scale_s_idxs, scale_e_idxs]
                ori_map_mask[:,:, ori_s_idxs, ori_e_idxs] = 1

            acum_layers += stride * (len(scale_layers)+1)

        return ori_map_h, ori_map_mask
#*********************************************************** MAP CONV
def get_padded_mask_and_weight(*args):
    if len(args) == 2:
        mask, conv = args
        masked_weight = torch.round(F.conv2d(mask.clone().float(), torch.ones(1, 1, *conv.kernel_size).cuda(),
                                             stride=conv.stride, padding=conv.padding, dilation=conv.dilation))
    elif len(args) == 5:
        mask, k, s, p, d = args
        masked_weight = torch.round(F.conv2d(mask.clone().float(), torch.ones(1, 1, k, k).cuda(), stride=s, padding=p, dilation=d))
    else:
        raise NotImplementedError

    masked_weight[masked_weight > 0] = 1 / masked_weight[masked_weight > 0] #conv.kernel_size[0] * conv.kernel_size[1]
    padded_mask = masked_weight > 0

    return padded_mask, masked_weight
class MapConv(nn.Module):

    def __init__(self, cfg):
        super(MapConv, self).__init__()
        input_size = cfg.INPUT_SIZE
        hidden_sizes = cfg.HIDDEN_SIZES
        kernel_sizes = cfg.KERNEL_SIZES
        strides = cfg.STRIDES
        paddings = cfg.PADDINGS
        dilations = cfg.DILATIONS
        self.convs = nn.ModuleList()
        assert len(hidden_sizes) == len(kernel_sizes) \
               and len(hidden_sizes) == len(strides) \
               and len(hidden_sizes) == len(paddings) \
               and len(hidden_sizes) == len(dilations)
        channel_sizes = [input_size]+hidden_sizes
        for i, (k, s, p, d) in enumerate(zip(kernel_sizes, strides, paddings, dilations)):
            self.convs.append(nn.Conv2d(channel_sizes[i], channel_sizes[i+1], k, s, p, d))

    def forward(self, x, mask):
        padded_mask = mask
        for i, pred in enumerate(self.convs):
            x = F.relu(pred(x))
            padded_mask, masked_weight = get_padded_mask_and_weight(padded_mask, pred)
            x = x * masked_weight
        return x

In [11]:
"""
Module main model TAN
"""
class TAN(nn.Module):
    def __init__(self):
        super(TAN, self).__init__()

        '''
        self.frame_layer = getattr(frame_modules, config.TAN.FRAME_MODULE.NAME)(config.TAN.FRAME_MODULE.PARAMS)
        self.prop_layer = getattr(prop_modules, config.TAN.PROP_MODULE.NAME)(config.TAN.PROP_MODULE.PARAMS)
        self.fusion_layer = getattr(fusion_modules, config.TAN.FUSION_MODULE.NAME)(config.TAN.FUSION_MODULE.PARAMS)
        self.map_layer = getattr(map_modules, config.TAN.MAP_MODULE.NAME)(config.TAN.MAP_MODULE.PARAMS)
        '''
        self.frame_layer = FrameAvgPool(config.TAN.FRAME_MODULE.PARAMS) #getattr(frame_modules, config.TAN.FRAME_MODULE.NAME)(config.TAN.FRAME_MODULE.PARAMS)
        self.prop_layer = SparsePropMaxPool(config.TAN.PROP_MODULE.PARAMS) #getattr(prop_modules, config.TAN.PROP_MODULE.NAME)(config.TAN.PROP_MODULE.PARAMS)
        self.fusion_layer = BaseFusion(config.TAN.FUSION_MODULE.PARAMS) #getattr(fusion_modules, config.TAN.FUSION_MODULE.NAME)(config.TAN.FUSION_MODULE.PARAMS)
        self.map_layer = MapConv(config.TAN.MAP_MODULE.PARAMS) #getattr(map_modules, config.TAN.MAP_MODULE.NAME)(config.TAN.MAP_MODULE.PARAMS)
        
        self.pred_layer = nn.Conv2d(config.TAN.PRED_INPUT_SIZE, 1, 1, 1)

    def forward(self, textual_input, textual_mask, visual_input):

        vis_h = self.frame_layer(visual_input.transpose(1, 2))
        map_h, map_mask = self.prop_layer(vis_h)
        fused_h = self.fusion_layer(textual_input, textual_mask, map_h, map_mask)
        fused_h = self.map_layer(fused_h, map_mask)
        prediction = self.pred_layer(fused_h) * map_mask

        return prediction, map_mask

    def extract_features(self, textual_input, textual_mask, visual_input):
        vis_h = self.frame_layer(visual_input.transpose(1, 2))
        map_h, map_mask = self.prop_layer(vis_h)

        fused_h = self.fusion_layer(textual_input, textual_mask, map_h, map_mask)
        fused_h = self.map_layer(fused_h, map_mask)
        prediction = self.pred_layer(fused_h) * map_mask

        return fused_h, prediction, map_mask

**MAIN**

In [12]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

#import _init_paths
import os
import pprint
import argparse
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader
import torch.optim as optim
from tqdm import tqdm
'''import datasets
import models
from core.config import config, update_config
from core.engine import Engine
from core.utils import AverageMeter
from core import eval
from core.utils import create_logger
import models.loss as loss'''
import math
PATH = Path()


'''
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.autograd.set_detect_anomaly(True)
def reset_config(config, args):
    if args.gpus:
        config.GPUS = 0 #args.gpus
    if args.workers:
        config.WORKERS = 1 #args.workers
    if args.dataDir:
        config.DATA_DIR = #args.dataDir
    if args.modelDir:
        config.MODEL_DIR = #args.modelDir
    if args.logDir:
        config.LOG_DIR = #args.logDir
    if args.verbose:
        config.VERBOSE = #args.verbose
    if args.tag:
        config.TAG = #args.tag
reset_config(config, _)
logger, final_output_dir = create_logger(config, args.cfg, config.TAG)
logger.info('\n'+pprint.pformat(args))
logger.info('\n'+pprint.pformat(config))
'''
update_config(PATH.config_init_1())

# cudnn related setting
cudnn.benchmark = config.CUDNN.BENCHMARK
torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
torch.backends.cudnn.enabled = config.CUDNN.ENABLED

dataset_name = config.DATASET.NAME
model_name = config.MODEL.NAME

train_dataset = Charades(0) #getattr(datasets, dataset_name)('train')
if config.TEST.EVAL_TRAIN:
    eval_train_dataset = Charades(0) #getattr(datasets, dataset_name)('train')
if not config.DATASET.NO_VAL:
    val_dataset = Charades(1) #getattr(datasets, dataset_name)('val')
test_dataset = Charades(1) #getattr(datasets, dataset_name)('test')

model = TAN() #getattr(models, model_name)()
if config.MODEL.CHECKPOINT and config.TRAIN.CONTINUE:
    model_checkpoint = torch.load(config.MODEL.CHECKPOINT)
    model.load_state_dict(model_checkpoint)
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs")
    model = torch.nn.DataParallel(model)
device = ("cuda" if torch.cuda.is_available() else "cpu" )
model = model.to(device)

optimizer = optim.Adam(model.parameters(),lr=config.TRAIN.LR, betas=(0.9, 0.999), weight_decay=config.TRAIN.WEIGHT_DECAY)
# optimizer = optim.SGD(model.parameters(), lr=config.TRAIN.LR, momentum=0.9, weight_decay=config.TRAIN.WEIGHT_DECAY)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=config.TRAIN.FACTOR, patience=config.TRAIN.PATIENCE, verbose=config.VERBOSE)

def iterator(split):
    if split == 'train':
        dataloader = DataLoader(train_dataset,
                                batch_size=config.TRAIN.BATCH_SIZE,
                                shuffle=False,#config.TRAIN.SHUFFLE,
                                num_workers=config.WORKERS,
                                pin_memory=False,
                                collate_fn=collate_fn)#datasets.collate_fn)
    elif split == 'val':
        dataloader = DataLoader(val_dataset,
                                batch_size=config.TEST.BATCH_SIZE,
                                shuffle=False,
                                num_workers=config.WORKERS,
                                pin_memory=False,
                                collate_fn=collate_fn)#datasets.collate_fn)
    elif split == 'test':
        dataloader = DataLoader(test_dataset,
                                batch_size=config.TEST.BATCH_SIZE,
                                shuffle=False,
                                num_workers=config.WORKERS,
                                pin_memory=False,
                                collate_fn=collate_fn)#datasets.collate_fn)
    elif split == 'train_no_shuffle':
        dataloader = DataLoader(eval_train_dataset,
                                batch_size=config.TEST.BATCH_SIZE,
                                shuffle=False,
                                num_workers=config.WORKERS,
                                pin_memory=False,
                                collate_fn=collate_fn)#datasets.collate_fn)
    else:
        raise NotImplementedError

    return dataloader

def network(sample):
    print('network')
    anno_idxs = sample['batch_anno_idxs']
    textual_input = sample['batch_word_vectors'].cuda()
    textual_mask = sample['batch_txt_mask'].cuda()
    visual_input = sample['batch_vis_input'].cuda()
    map_gt = sample['batch_map_gt'].cuda()
    duration = sample['batch_duration']

    prediction, map_mask = model(textual_input, textual_mask, visual_input)
    loss_value, joint_prob = bce_rescale_loss(prediction, map_mask, map_gt, config.LOSS.PARAMS) #getattr(loss, config.LOSS.NAME)(prediction, map_mask, map_gt, config.LOSS.PARAMS)

    sorted_times = None if model.training else get_proposal_results(joint_prob, duration)

    return loss_value, sorted_times

def get_proposal_results(scores, durations):
    # assume all valid scores are larger than one
    out_sorted_times = []
    for score, duration in zip(scores, durations):
        T = score.shape[-1]
        sorted_indexs = np.dstack(np.unravel_index(np.argsort(score.cpu().detach().numpy().ravel())[::-1], (T, T))).tolist()
        sorted_indexs = np.array([item for item in sorted_indexs[0] if item[0] <= item[1]]).astype(float)

        sorted_indexs[:,1] = sorted_indexs[:,1] + 1
        sorted_indexs = torch.from_numpy(sorted_indexs).cuda()
        target_size = config.DATASET.NUM_SAMPLE_CLIPS // config.DATASET.TARGET_STRIDE
        out_sorted_times.append((sorted_indexs.float() / target_size * duration).tolist())

    return out_sorted_times

def on_start(state):
    print('on_start')
    state['loss_meter'] = AverageMeter()
    state['test_interval'] = int(len(train_dataset)/config.TRAIN.BATCH_SIZE*config.TEST.INTERVAL)
    state['t'] = 1
    model.train()
    if config.VERBOSE:
        state['progress_bar'] = tqdm(total=state['test_interval'])

def on_forward(state):
    torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
    state['loss_meter'].update(state['loss'].item(), 1)

def on_update(state):# Save All
    if config.VERBOSE:
        state['progress_bar'].update(1)

    if state['t'] % state['test_interval'] == 0:
        model.eval()
        if config.VERBOSE:
            state['progress_bar'].close()

        loss_message = '\niter: {} train loss {:.4f}'.format(state['t'], state['loss_meter'].avg)
        table_message = ''
        if config.TEST.EVAL_TRAIN:
            train_state = engine.test(network, iterator('train_no_shuffle'), 'train')
            train_table = display_results(train_state['Rank@N,mIoU@M'], train_state['miou'],'performance on training set')#eval.
            table_message += '\n'+ train_table
        if not config.DATASET.NO_VAL:
            val_state = engine.test(network, iterator('val'), 'val')
            state['scheduler'].step(-val_state['loss_meter'].avg)
            loss_message += ' val loss {:.4f}'.format(val_state['loss_meter'].avg)
            val_state['loss_meter'].reset()
            val_table = display_results(val_state['Rank@N,mIoU@M'], val_state['miou'],'performance on validation set')#eval.
            table_message += '\n'+ val_table

        test_state = engine.test(network, iterator('test'), 'test')
        loss_message += ' test loss {:.4f}'.format(test_state['loss_meter'].avg)
        test_state['loss_meter'].reset()
        test_table = display_results(test_state['Rank@N,mIoU@M'], test_state['miou'],'performance on testing set')#eval.
        table_message += '\n' + test_table

        message = loss_message+table_message+'\n'
        logger.info(message)

        saved_model_filename = os.path.join(config.MODEL_DIR,'{}/{}/iter{:06d}-{:.4f}-{:.4f}.pkl'.format(
            dataset_name, model_name+'_'+config.DATASET.VIS_INPUT_TYPE,
            state['t'], test_state['Rank@N,mIoU@M'][0,0], test_state['Rank@N,mIoU@M'][0,1]))

        rootfolder1 = os.path.dirname(saved_model_filename)
        rootfolder2 = os.path.dirname(rootfolder1)
        rootfolder3 = os.path.dirname(rootfolder2)
        if not os.path.exists(rootfolder3):
            print('Make directory %s ...' % rootfolder3)
            os.mkdir(rootfolder3)
        if not os.path.exists(rootfolder2):
            print('Make directory %s ...' % rootfolder2)
            os.mkdir(rootfolder2)
        if not os.path.exists(rootfolder1):
            print('Make directory %s ...' % rootfolder1)
            os.mkdir(rootfolder1)

        if torch.cuda.device_count() > 1:
            torch.save(model.module.state_dict(), saved_model_filename)
        else:
            torch.save(model.state_dict(), saved_model_filename)


        if config.VERBOSE:
            state['progress_bar'] = tqdm(total=state['test_interval'])
        model.train()
        state['loss_meter'].reset()

def on_end(state):
    if config.VERBOSE:
        state['progress_bar'].close()


def on_test_start(state):
    state['loss_meter'] = AverageMeter()
    state['sorted_segments_list'] = []
    if config.VERBOSE:
        if state['split'] == 'train':
            state['progress_bar'] = tqdm(total=math.ceil(len(train_dataset)/config.TEST.BATCH_SIZE))
        elif state['split'] == 'val':
            state['progress_bar'] = tqdm(total=math.ceil(len(val_dataset)/config.TEST.BATCH_SIZE))
        elif state['split'] == 'test':
            state['progress_bar'] = tqdm(total=math.ceil(len(test_dataset)/config.TEST.BATCH_SIZE))
        else:
            raise NotImplementedError

def on_test_forward(state):
    if config.VERBOSE:
        state['progress_bar'].update(1)
    state['loss_meter'].update(state['loss'].item(), 1)

    min_idx = min(state['sample']['batch_anno_idxs'])
    batch_indexs = [idx - min_idx for idx in state['sample']['batch_anno_idxs']]
    sorted_segments = [state['output'][i] for i in batch_indexs]
    state['sorted_segments_list'].extend(sorted_segments)

def on_test_end(state):
    annotations = state['iterator'].dataset.annotations
    state['Rank@N,mIoU@M'], state['miou'] = eval_predictions(state['sorted_segments_list'], annotations, verbose=False)#eval.eval_predictions(state['sorted_segments_list'], annotations, verbose=False)
    if config.VERBOSE:
        state['progress_bar'].close()


engine = Engine()
engine.hooks['on_start'] = on_start
engine.hooks['on_forward'] = on_forward
engine.hooks['on_update'] = on_update
engine.hooks['on_end'] = on_end
engine.hooks['on_test_start'] = on_test_start
engine.hooks['on_test_forward'] = on_test_forward
engine.hooks['on_test_end'] = on_test_end
engine.train(network,iterator('train'),maxepoch=config.TRAIN.MAX_EPOCH,optimizer=optimizer,scheduler=scheduler)


on_start
EPOCH 0/100
network
network
network
network
network
[['Rank@1,mIoU@0.5', 'Rank@1,mIoU@0.7', 'Rank@5,mIoU@0.5', 'Rank@5,mIoU@0.7', 'mIoU'], ['1.18', '0.00', '21.18', '8.24', '4.60']]


NameError: name 'table' is not defined