In [2]:
import os
import sys
import time
import re
import argparse
import numpy as np
from tqdm import tqdm
from tensorboardX import SummaryWriter
import matplotlib.pyplot as plt
plt.switch_backend('agg')

sys.path.append('../utils')
from dataset_3d import *
from model_3d import *
from resnet_2d3d import neq_load_customized
from augmentation import *
from utils import AverageMeter, save_checkpoint, denorm, calc_topk_accuracy

import torch
import torch.optim as optim
from torch.utils import data
from torchvision import datasets, models, transforms
import torchvision.utils as vutils
torch.backends.cudnn.benchmark = True

In [17]:
parser = argparse.ArgumentParser()
parser.add_argument('--net', default='resnet18', type=str)
parser.add_argument('--model', default='dpc-rnn', type=str)
parser.add_argument('--dataset', default='ucf101', type=str)
parser.add_argument('--seq_len', default=5, type=int, help='number of frames in each video block')
parser.add_argument('--num_seq', default=8, type=int, help='number of video blocks')
parser.add_argument('--pred_step', default=2, type=int)
parser.add_argument('--ds', default=4, type=int, help='frame downsampling rate')
parser.add_argument('--batch_size', default=64, type=int)
parser.add_argument('--lr', default=1e-3, type=float, help='learning rate')
parser.add_argument('--wd', default=1e-5, type=float, help='weight decay')
parser.add_argument('--resume', default='', type=str, help='path of model to resume')
parser.add_argument('--pretrain', default='', type=str, help='path of pretrained model')
parser.add_argument('--epochs', default=10, type=int, help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, help='manual epoch number (useful on restarts)')
parser.add_argument('--gpu', default='0', type=str)
parser.add_argument('--print_freq', default=5, type=int, help='frequency of printing output during training')
parser.add_argument('--reset_lr', action='store_true', help='Reset learning rate when resume training?')
parser.add_argument('--prefix', default='tmp', type=str, help='prefix of checkpoint filename')
parser.add_argument('--train_what', default='all', type=str)
parser.add_argument('--img_dim', default=128, type=int)

_StoreAction(option_strings=['--img_dim'], dest='img_dim', nargs=None, const=None, default=128, type=<class 'int'>, choices=None, help=None, metavar=None)

In [19]:
torch.manual_seed(0)
np.random.seed(0)
global args; args = parser.parse_known_args()[0]
os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu)
global cuda; cuda = torch.device('cuda')

best_acc = 0
global iteration; iteration = 0
args.old_lr = None

def get_data(transform, mode='train'):
    print('Loading data for "%s" ...' % mode)
    if args.dataset == 'k400':
        use_big_K400 = args.img_dim > 140
        dataset = Kinetics400_full_3d(mode=mode,
                              transform=transform,
                              seq_len=args.seq_len,
                              num_seq=args.num_seq,
                              downsample=5,
                              big=use_big_K400)
    elif args.dataset == 'ucf101':
        dataset = UCF101_3d(mode=mode,
                         transform=transform,
                         seq_len=args.seq_len,
                         num_seq=args.num_seq,
                         downsample=args.ds)
    else:
        raise ValueError('dataset not supported')

    sampler = data.RandomSampler(dataset)

    if mode == 'train':
        data_loader = data.DataLoader(dataset,
                                      batch_size=args.batch_size,
                                      sampler=sampler,
                                      shuffle=False,
                                      num_workers=4,
                                      pin_memory=True,
                                      drop_last=True)
    elif mode == 'val':
        data_loader = data.DataLoader(dataset,
                                      batch_size=args.batch_size,
                                      sampler=sampler,
                                      shuffle=False,
                                      num_workers=4,
                                      pin_memory=True,
                                      drop_last=True)
    print('"%s" dataset size: %d' % (mode, len(dataset)))
    return data_loader, dataset

def set_path(args):
    if args.resume: exp_path = os.path.dirname(os.path.dirname(args.resume))
    else:
        exp_path = 'log_{args.prefix}/{args.dataset}-{args.img_dim}_{0}_{args.model}_\
bs{args.batch_size}_lr{1}_seq{args.num_seq}_pred{args.pred_step}_len{args.seq_len}_ds{args.ds}_\
train-{args.train_what}{2}'.format(
                    'r%s' % args.net[6::], \
                    args.old_lr if args.old_lr is not None else args.lr, \
                    '_pt=%s' % args.pretrain.replace('/','-') if args.pretrain else '', \
                    args=args)
    img_path = os.path.join(exp_path, 'img')
    model_path = os.path.join(exp_path, 'model')
    if not os.path.exists(img_path): os.makedirs(img_path)
    if not os.path.exists(model_path): os.makedirs(model_path)
    return img_path, model_path

def process_output(mask):
    '''task mask as input, compute the target for contrastive loss'''
    # dot product is computed in parallel gpus, so get less easy neg, bounded by batch size in each gpu'''
    # mask meaning: -2: omit, -1: temporal neg (hard), 0: easy neg, 1: pos, -3: spatial neg
    (B, NP, SQ, B2, NS, _) = mask.size() # [B, P, SQ, B, N, SQ]
    target = mask == 1
    target.requires_grad = False
    return target, (B, B2, NS, NP, SQ)

In [20]:
# transformation for input (for contrastive learning)
# from main import *

transform = transforms.Compose([
            RandomHorizontalFlip(consistent=True),
            RandomCrop(size=224, consistent=True),
            Scale(size=(args.img_dim,args.img_dim)),
            RandomGray(consistent=False, p=0.5),
            ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25, p=1.0),
            ToTensor(),
            Normalize()
        ])

train_loader, dataset = get_data(transform, 'train')

global de_normalize; de_normalize = denorm()
global img_path; img_path, model_path = set_path(args)
global writer_train
try: # old version
    writer_val = SummaryWriter(log_dir=os.path.join(img_path, 'val'))
    writer_train = SummaryWriter(log_dir=os.path.join(img_path, 'train'))
except: # v1.7
    writer_val = SummaryWriter(logdir=os.path.join(img_path, 'val'))
    writer_train = SummaryWriter(logdir=os.path.join(img_path, 'train'))

Loading data for "train" ...
"train" dataset size: 5078


In [21]:
dataset[0].shape

torch.Size([8, 3, 5, 128, 128])

In [22]:
input = None

for idx, input_seq in tqdm(enumerate(train_loader), total=len(train_loader)):
    print(idx)
    print(input_seq.shape)
    input = input_seq
    break

  0%|          | 0/79 [00:12<?, ?it/s]

0
torch.Size([64, 8, 3, 5, 128, 128])





In [58]:
import sys
import time
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
sys.path.append('../backbone')
from select_backbone import select_resnet
from convrnn import ConvGRU


class DPC_RNN(nn.Module):
    '''DPC with RNN'''
    def __init__(self, sample_size, num_seq=8, seq_len=5, pred_step=3, network='resnet50'):
        super(DPC_RNN, self).__init__()
        
        if torch.cuda.is_available():
            device = torch.device('cuda:0')
        else:
            device = torch.device('cpu')
        
        torch.cuda.manual_seed(233)
        print('Using DPC-RNN model')
        self.sample_size = sample_size
        self.num_seq = num_seq
        self.seq_len = seq_len
        self.pred_step = pred_step
        self.last_duration = int(math.ceil(seq_len / 4))
        self.last_size = int(math.ceil(sample_size / 32))
        print('final feature map has size %dx%d' % (self.last_size, self.last_size))

        self.backbone, self.param = select_resnet(network, track_running_stats=False)
        self.param['num_layers'] = 1 # param for GRU
        self.param['hidden_size'] = self.param['feature_size'] # param for GRU

        self.agg = ConvGRU(input_size=self.param['feature_size'],
                               hidden_size=self.param['hidden_size'],
                               kernel_size=1,
                               num_layers=self.param['num_layers'])
        self.network_pred = nn.Sequential(
                                nn.Conv2d(self.param['feature_size'], self.param['feature_size'], kernel_size=1, padding=0),
                                nn.ReLU(inplace=True),
                                nn.Conv2d(self.param['feature_size'], self.param['feature_size'], kernel_size=1, padding=0)
                                )
        self.mask = None
        self.relu = nn.ReLU(inplace=False)
        self._initialize_weights(self.agg)
        self._initialize_weights(self.network_pred)
        self.to(device)

    def forward(self, block):
        # block: [B, N, C, SL, W, H]
        ### extract feature ###
        '''
        B : batch size
        N : number of sequences
        C : channels of each images
        SL : length of sequence
        W, H : size of images
        '''
        (B, N, C, SL, H, W) = block.shape
        print(f'shape of the raw input block : {block.shape}')
        block = block.view(B*N, C, SL, H, W)
        print(f'shape of block after reshaping : {block.shape}')
        feature = self.backbone(block)
        print(f'shape of the latent vector after ResNet processing : {feature.shape}')
        del block
        feature = F.avg_pool3d(feature, (self.last_duration, 1, 1), stride=(1, 1, 1))

        feature_inf_all = feature.view(B, N, self.param['feature_size'], self.last_size, self.last_size) # before ReLU, (-inf, +inf)
        feature = self.relu(feature) # [0, +inf)
        feature = feature.view(B, N, self.param['feature_size'], self.last_size, self.last_size) # [B,N,D,6,6], [0, +inf)
        feature_inf = feature_inf_all[:, N-self.pred_step::, :].contiguous()
        
        del feature_inf_all

        ### aggregate, predict future ###
        # aggregate previous information
        _, hidden = self.agg(feature[:, 0:N-self.pred_step, :].contiguous())
        hidden = hidden[:,-1,:] # after tanh, (-1,1). get the hidden state of last layer, last time step
        
        
        # predict the future
        pred = []
        for i in range(self.pred_step):
            # sequentially pred future
            p_tmp = self.network_pred(hidden)
            pred.append(p_tmp)
            _, hidden = self.agg(self.relu(p_tmp).unsqueeze(1), hidden.unsqueeze(0))
            hidden = hidden[:,-1,:]
        pred = torch.stack(pred, 1) # B, pred_step, xxx
        del hidden


        ### Get similarity score ###
        # pred: [B, pred_step, D, last_size, last_size]
        # GT: [B, N, D, last_size, last_size]
        N = self.pred_step
        # dot product D dimension in pred-GT pair, get a 6d tensor. First 3 dims are from pred, last 3 dims are from GT. 
        pred = pred.permute(0,1,3,4,2).contiguous().view(B*self.pred_step*self.last_size**2, self.param['feature_size'])
        feature_inf = feature_inf.permute(0,1,3,4,2).contiguous().view(B*N*self.last_size**2, self.param['feature_size']).transpose(0,1)
        score = torch.matmul(pred, feature_inf).view(B, self.pred_step, self.last_size**2, B, N, self.last_size**2)
        del feature_inf, pred

        if self.mask is None: # only compute mask once
            # mask meaning: -2: omit, -1: temporal neg (hard), 0: easy neg, 1: pos, -3: spatial neg
            mask = torch.zeros((B, self.pred_step, self.last_size**2, B, N, self.last_size**2), dtype=torch.int8, requires_grad=False).detach().cuda()
            mask[torch.arange(B), :, :, torch.arange(B), :, :] = -3 # spatial neg
            for k in range(B):
                mask[k, :, torch.arange(self.last_size**2), k, :, torch.arange(self.last_size**2)] = -1 # temporal neg
            tmp = mask.permute(0, 2, 1, 3, 5, 4).contiguous().view(B*self.last_size**2, self.pred_step, B*self.last_size**2, N)
            for j in range(B*self.last_size**2):
                tmp[j, torch.arange(self.pred_step), j, torch.arange(N-self.pred_step, N)] = 1 # pos
            mask = tmp.view(B, self.last_size**2, self.pred_step, B, self.last_size**2, N).permute(0,2,1,3,5,4)
            self.mask = mask

        return [score, self.mask]

    def _initialize_weights(self, module):
        for name, param in module.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.orthogonal_(param, 1)
        # other resnet weights have been initialized in resnet itself

    def reset_mask(self):
        self.mask = None

In [59]:
### dpc model ###
if args.model == 'dpc-rnn':
    model = DPC_RNN(sample_size=args.img_dim, 
                    num_seq=args.num_seq, 
                    seq_len=args.seq_len, 
                    network=args.net, 
                    pred_step=args.pred_step)
else: raise ValueError('wrong model!')
    
model = nn.DataParallel(model)
model = model.to(cuda)
global criterion; criterion = nn.CrossEntropyLoss()

### optimizer ###
if args.train_what == 'last':
    for name, param in model.module.resnet.named_parameters():
        param.requires_grad = False
else: pass # train all layers

print('\n===========Check Grad============')
for name, param in model.named_parameters():
    print(name, param.requires_grad)
print('=================================\n')

params = model.parameters()
optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)
args.old_lr = None

Using DPC-RNN model
final feature map has size 4x4

module.backbone.conv1.weight True
module.backbone.bn1.weight True
module.backbone.bn1.bias True
module.backbone.layer1.0.conv1.weight True
module.backbone.layer1.0.bn1.weight True
module.backbone.layer1.0.bn1.bias True
module.backbone.layer1.0.conv2.weight True
module.backbone.layer1.0.bn2.weight True
module.backbone.layer1.0.bn2.bias True
module.backbone.layer1.1.conv1.weight True
module.backbone.layer1.1.bn1.weight True
module.backbone.layer1.1.bn1.bias True
module.backbone.layer1.1.conv2.weight True
module.backbone.layer1.1.bn2.weight True
module.backbone.layer1.1.bn2.bias True
module.backbone.layer2.0.conv1.weight True
module.backbone.layer2.0.bn1.weight True
module.backbone.layer2.0.bn1.bias True
module.backbone.layer2.0.conv2.weight True
module.backbone.layer2.0.bn2.weight True
module.backbone.layer2.0.bn2.bias True
module.backbone.layer2.0.downsample.0.weight True
module.backbone.layer2.0.downsample.1.weight True
module.backbon

In [75]:
a, b = train_loader.dataset.video_info.iloc[0]
print(a)

/home/yiranwang/Workplace/DPC/UCF101/frame/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c03/


In [60]:
score_, mask_ = model(input)
print(score.shape)
print(mask.shape)

target_, (_, B2, NS, NP, SQ) = process_output(mask_)

B = input.size(0)

score_flattened = score_.view(B*NP*SQ, B2*NS*SQ)
# print(f'target shape : {target_.shape}')
# print(B, NP, SQ, B2, NS)
target_flattened = target_.contiguous().view(B*NP*SQ, B2*NS*SQ)
# print(target_flattened)
target_flattened = target_flattened.double().argmax(dim=1)

loss = criterion(score_flattened, target_flattened)

top1, top3, top5 = calc_topk_accuracy(score_flattened, target_flattened, (1,3,5))

# the latent representation is of shape [batch-size, out-channels, D(third dimension), H, W]

shape of the raw input block : torch.Size([4, 8, 3, 5, 128, 128])
shape of block after reshaping : torch.Size([32, 3, 5, 128, 128])
shape of the latent vector after ResNet processing : torch.Size([32, 256, 2, 4, 4])
torch.Size([4, 3, 16, 4, 3, 16])
torch.Size([4, 3, 16, 4, 3, 16])


In [34]:
print(score_flattened.shape)
print(target_flattened.shape)
print(target_flattened)
print((mask_==1).shape)
print(target_.shape)

torch.Size([192, 192])
torch.Size([192])
tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 