In [1]:
import warnings
warnings.simplefilter("ignore", UserWarning)

import numpy as np
import torch.nn.parallel
import torch.optim

from torch.nn import functional as F
from sklearn.metrics import confusion_matrix
from dataset import TSNDataSet
from models import TSN
from transforms import *
from ops import ConsensusModule
from tqdm.notebook import tqdm
from test_opts import parser

import datasets_video
import pdb
import time
import sys
import os
import easydict

In [2]:
# Options: show the settings the evaluation below will run with.
# parse_known_args() returns a pair; only its first element (the parsed
# namespace) is displayed, wrapped in an EasyDict.
parsed_opts = parser.parse_known_args()[0]
display(easydict.EasyDict(vars(parsed_opts)))

{'dataset': 'jester',
 'modality': 'RGBFlow',
 'weights': 'pretrained_models/MFF_jester_RGBFlow_BNInception_segment4_3f1c_best.pth.tar',
 'arch': 'BNInception',
 'save_scores': None,
 'test_segments': 4,
 'max_num': -1,
 'test_crops': 1,
 'input_size': 224,
 'num_motion': 3,
 'consensus_type': 'MLP',
 'workers': 0,
 'gpus': None,
 'img_feature_dim': 256,
 'num_set_segments': 1,
 'softmax': 0}

In [3]:
class AverageMeter:
    """Keeps the most recent value alongside a running weighted average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, average, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Store ``val`` as the latest value, weight it by ``n``, and refresh the average."""
        self.val = val
        weighted = val * n
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Args:
        output: 2-D tensor of per-class scores, one row per sample.
        target: 1-D tensor of ground-truth class indices, one per row of
            ``output``.
        topk: iterable of ``k`` values to evaluate.

    Returns:
        A list with one 0-dim float tensor per entry of ``topk``: the
        percentage (0-100) of samples whose target is among the ``k``
        highest-scoring classes.
    """
    maxk = max(topk)
    batch_size = target.size(0)
    # Indices of the maxk largest scores per sample, best first -> (batch, maxk).
    _, pred = output.topk(maxk, 1, True, True)
    # Transpose so that row r holds every sample's rank-r prediction.
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        # reshape() instead of view(): the comparison result derives from a
        # transposed tensor, so its first k rows are not guaranteed to be
        # contiguous and view(-1) raises a RuntimeError for k > 1 with
        # batch > 1 on current PyTorch.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

In [4]:
def eval_video(video_data):
    """Run the network on one data-loader item and return its class scores.

    Reads the module-level ``args``, ``net`` and ``num_class`` that
    ``main()`` publishes via ``global``.

    Args:
        video_data: ``(i, data, label)`` tuple: the running index, the input
            tensor produced by the data loader, and the label tensor.

    Returns:
        ``(i, rst, label[0])`` where ``rst`` is a numpy array of scores shaped
        ``(-1, 1, num_class)`` for the MLP consensus and
        ``(test_segments, 1, num_class)`` otherwise.

    Raises:
        ValueError: if ``args.modality`` is not RGB, Flow, RGBDiff or RGBFlow.
    """
    i, data, label = video_data
    num_crop = args.test_crops

    # Number of channels that make up one network input sample.
    if args.modality == 'RGB':
        length = 3
    elif args.modality == 'Flow':
        length = 10
    elif args.modality == 'RGBDiff':
        length = 18
    elif args.modality == 'RGBFlow':
        length = 3 + 2 * args.num_motion  # 3 rgb channels and 3*2=6 flow channels
    else:
        raise ValueError("Unknown modality "+args.modality)

    # Variable(..., volatile=True) no longer disables autograd (it only emits a
    # UserWarning), so inference is wrapped in no_grad() to avoid building a
    # graph for every video.
    with torch.no_grad():
        input_var = data.view(-1, length, data.size(2), data.size(3))
        rst = net(input_var)
        if args.softmax == 1:
            # take the softmax to normalize the output to probability;
            # dim=1 is what the deprecated implicit-dim call selects for the
            # 2-D (samples, num_class) output the reshapes below expect
            rst = F.softmax(rst, dim=1)

    rst = rst.detach().cpu().numpy().copy()

    if args.consensus_type in ['MLP']:
        rst = rst.reshape(-1, 1, num_class)
    else:
        # Average over the crops, keeping one score row per segment.
        rst = rst.reshape((num_crop, args.test_segments, num_class)).mean(axis=0).reshape((args.test_segments, 1, num_class))

    return i, rst, label[0]

In [11]:
def main():
    """Evaluate a pretrained TSN checkpoint on the validation list.

    Parses the test options, builds the network and loads ``args.weights``,
    runs every validation video (at most ``args.max_num`` when that is
    positive) through ``eval_video``, and prints running and final
    Prec@1 / Prec@5 plus the mean per-class accuracy. When
    ``args.save_scores`` is set, the scores, labels, predictions and confusion
    matrix are written with ``np.savez`` and a ``name;category`` CSV is
    written next to it.

    Publishes ``args``, ``net`` and ``num_class`` as module-level globals
    because ``eval_video`` reads them.

    Raises:
        ValueError: for a non-positive ``args.test_crops`` or an unknown
            ``args.modality``.
    """
    global args, net, num_class
    args = easydict.EasyDict(vars(parser.parse_known_args()[0]))
    # One video per batch: eval_video returns label[0] and each loop iteration
    # below is counted as exactly one sample.
    args.batch_size = 1

    categories, args.train_list, args.val_list, args.root_path, prefix = datasets_video.return_dataset(args.dataset, args.modality)
    num_class = len(categories)

    # The MLP consensus is built over all test segments; any other consensus
    # type builds the model for a single segment.
    net = TSN(num_class, args.test_segments if args.consensus_type in ['MLP'] else 1, args.modality,
              base_model=args.arch,
              consensus_type=args.consensus_type,
              img_feature_dim=args.img_feature_dim,
              )

    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source.
    checkpoint = torch.load(args.weights)
    print('model epoch %d best prec@1: %.2f' % (checkpoint['epoch'], checkpoint['best_prec1']))

    # Drop the first dotted component of every parameter name (presumably the
    # wrapper prefix added at training time - verify against the checkpoint).
    base_dict = {'.'.join(k.split('.')[1:]): v for k,v in list(checkpoint['state_dict'].items())}
    net.load_state_dict(base_dict)

    # NOTE(review): torchvision is not imported explicitly in this notebook;
    # presumably it comes into scope through `from transforms import *` - verify.
    print(f'Number of test crops: {args.test_crops}')
    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.input_size),
        ])
    elif args.test_crops > 1:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.input_size, net.scale_size)
        ])
    else:
        raise ValueError("Only 1 and 10 crops are supported while we got {}".format(args.test_crops))

    ############ Data Loading Part #####
    if args.modality == 'RGB':
        data_length = 1
    elif args.modality in ['Flow', 'RGBDiff']:
        data_length = 5
    elif args.modality == 'RGBFlow':
        data_length = args.num_motion
    else:
        # Fail here instead of with a NameError on data_length further down.
        raise ValueError("Unknown modality "+args.modality)

    data_loader = torch.utils.data.DataLoader(
            TSNDataSet(args.root_path, args.val_list, num_segments=args.test_segments,
                       new_length=data_length,
                       modality=args.modality,
                       image_tmpl=prefix,
                       dataset=args.dataset,
                       test_mode=True,
                       dataset_type='val',
                       transform=torchvision.transforms.Compose([
                           cropping,
                           Stack(roll=(args.arch in
                                       ['BNInception','InceptionV3']), isRGBFlow=(args.modality == 'RGBFlow')),
                           ToTorchFormatTensor(div=(args.arch not in ['BNInception','InceptionV3'])),
                           GroupNormalize(net.input_mean, net.input_std),
                       ])),
                batch_size=args.batch_size,
                shuffle=False,
                num_workers=args.workers,
                pin_memory=False
            )

    net = torch.nn.DataParallel(net.cuda())
    net.eval()

    total_num = len(data_loader.dataset)
    output = []

    proc_start_time = time.time()
    max_num = args.max_num if args.max_num > 0 else len(data_loader.dataset)

    top1 = AverageMeter()
    top5 = AverageMeter()

    for i, (data, label) in enumerate(tqdm(data_loader)):
        if i >= max_num:
            break
        rst = eval_video((i, data, label))
        # Keep (scores, label) for the confusion matrix and optional export.
        output.append(rst[1:])
        cnt_time = time.time() - proc_start_time
        # Average the scores over the leading axis before ranking the classes.
        prec1, prec5 = accuracy(torch.from_numpy(np.mean(rst[1], axis=0)), label, topk=(1, 5))
        top1.update(prec1, 1)
        top5.update(prec5, 1)
        if (i + 1) % 1000 == 0:
            print('video {} done, total {}/{}, average {:.3f} sec/video, moving Prec@1 {:.3f} Prec@5 {:.3f}'.format(i+1, i+1,
                                                                        total_num,
                                                                        float(cnt_time) / (i+1), top1.avg, top5.avg))

    video_pred = [np.argmax(np.mean(x[0], axis=0)) for x in output]

    video_labels = [x[1] for x in output]

    cf = confusion_matrix(video_labels, video_pred).astype(float)

    cls_cnt = cf.sum(axis=1)
    cls_hit = np.diag(cf)

    # A class that only ever shows up as a prediction has no ground-truth
    # samples; dividing by its zero count would turn the mean into NaN.
    has_samples = cls_cnt > 0
    cls_acc = cls_hit[has_samples] / cls_cnt[has_samples]

    print('-----Evaluation is finished------')
    print('Class Accuracy {:.02f}%'.format(np.mean(cls_acc) * 100))
    print('Overall Prec@1 {:.02f}% Prec@5 {:.02f}%'.format(top1.avg, top5.avg))

    if args.save_scores is not None:

        # reorder before saving
        with open(args.val_list) as val_file:
            name_list = [x.strip().split()[0] for x in val_file]
        # Only the videos actually evaluated take part in the reordering, so
        # the sorted ranks stay inside the len(output)-sized buffers when
        # max_num stopped the loop early.
        name_list = name_list[:len(output)]
        order_dict = {e:i for i, e in enumerate(sorted(name_list))}
        reorder_output = [None] * len(output)
        reorder_label = [None] * len(output)
        reorder_pred = [None] * len(output)
        output_csv = []
        for i in range(len(output)):
            idx = order_dict[name_list[i]]
            reorder_output[idx] = output[i]
            reorder_label[idx] = video_labels[i]
            reorder_pred[idx] = video_pred[i]
            output_csv.append('%s;%s'%(name_list[i], categories[video_pred[i]]))

        np.savez(args.save_scores, scores=reorder_output, labels=reorder_label, predictions=reorder_pred, cf=cf)

        # Swap only a trailing ".npz" for ".csv"; a plain substring replace
        # would also rewrite "npz" inside directory names.
        score_path = args.save_scores
        csv_base = score_path[:-len('.npz')] if score_path.endswith('.npz') else score_path
        with open(csv_base + '.csv', 'w') as f:
            f.write('\n'.join(output_csv))

In [12]:
# Run the full evaluation; progress and the accuracy summary are printed below.
main()


    Initializing TSN with base model: BNInception.
    TSN Configurations:
        input_modality:     RGBFlow
        num_segments:       4
        new_length:         3
        consensus_module:   MLP
        dropout_ratio:      0.8
        img_feature_dim:    256
            
Converting the ImageNet model to RGB+Flow init model
Done. RGBFlow model ready.
model epoch 38 best prec@1: 92.18
Number of test crops: 1
Found 14786 val videos
Freezing BatchNorm2D except the first one.


  0%|          | 0/14786 [00:00<?, ?it/s]

video 1000 done, total 1000/14786, average 0.142 sec/video, moving Prec@1 89.200 Prec@5 99.000
video 2000 done, total 2000/14786, average 0.142 sec/video, moving Prec@1 89.350 Prec@5 98.950
video 3000 done, total 3000/14786, average 0.144 sec/video, moving Prec@1 89.733 Prec@5 98.900
video 4000 done, total 4000/14786, average 0.145 sec/video, moving Prec@1 90.050 Prec@5 99.075
video 5000 done, total 5000/14786, average 0.142 sec/video, moving Prec@1 90.420 Prec@5 98.980
video 6000 done, total 6000/14786, average 0.140 sec/video, moving Prec@1 90.217 Prec@5 98.900
video 7000 done, total 7000/14786, average 0.138 sec/video, moving Prec@1 90.229 Prec@5 98.843
video 8000 done, total 8000/14786, average 0.136 sec/video, moving Prec@1 90.062 Prec@5 98.838
video 9000 done, total 9000/14786, average 0.134 sec/video, moving Prec@1 90.022 Prec@5 98.833
video 10000 done, total 10000/14786, average 0.133 sec/video, moving Prec@1 89.990 Prec@5 98.850
video 11000 done, total 11000/14786, average 0.1