In [2]:
import sys
import time
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
sys.path.append('../backbone')
from select_backbone import select_resnet
from convrnn import ConvGRU

In [2]:

class DPC_RNN(nn.Module):
    '''DPC with RNN'''
    def __init__(self, sample_size, num_seq=8, seq_len=5, pred_step=3, network='resnet50'):
        super(DPC_RNN, self).__init__()
        torch.cuda.manual_seed(233)
        print('Using DPC-RNN model')
        self.sample_size = sample_size
        self.num_seq = num_seq
        self.seq_len = seq_len
        self.pred_step = pred_step
        self.last_duration = int(math.ceil(seq_len / 4))
        self.last_size = int(math.ceil(sample_size / 32))
        print('final feature map has size %dx%d' % (self.last_size, self.last_size))

        self.backbone, self.param = select_resnet(network, track_running_stats=False)
        self.param['num_layers'] = 1 # param for GRU
        self.param['hidden_size'] = self.param['feature_size'] # param for GRU

        self.agg = ConvGRU(input_size=self.param['feature_size'],
                               hidden_size=self.param['hidden_size'],
                               kernel_size=1,
                               num_layers=self.param['num_layers'])
        self.network_pred = nn.Sequential(
                                nn.Conv2d(self.param['feature_size'], self.param['feature_size'], kernel_size=1, padding=0),
                                nn.ReLU(inplace=True),
                                nn.Conv2d(self.param['feature_size'], self.param['feature_size'], kernel_size=1, padding=0)
                                )
        self.mask = None
        self.relu = nn.ReLU(inplace=False)
        self._initialize_weights(self.agg)
        self._initialize_weights(self.network_pred)

    def forward(self, block):
        # block: [B, N, C, SL, W, H]
        ### extract feature ###
        '''
        B : batch size
        N : number of sequences
        C : channels of each images
        SL : length of sequence
        W, H : size of images
        '''
        (B, N, C, SL, H, W) = block.shape
        block = block.view(B*N, C, SL, H, W)
        feature = self.backbone(block)
#         print(feature.shape)
        del block
        feature = F.avg_pool3d(feature, (self.last_duration, 1, 1), stride=(1, 1, 1))

        feature_inf_all = feature.view(B, N, self.param['feature_size'], self.last_size, self.last_size) # before ReLU, (-inf, +inf)
        feature = self.relu(feature) # [0, +inf)
        feature = feature.view(B, N, self.param['feature_size'], self.last_size, self.last_size) # [B,N,D,6,6], [0, +inf)
        feature_inf = feature_inf_all[:, N-self.pred_step::, :].contiguous()
        del feature_inf_all

        ### aggregate, predict future ###
        # aggregate previous information
        _, hidden = self.agg(feature[:, 0:N-self.pred_step, :].contiguous())
        hidden = hidden[:,-1,:] # after tanh, (-1,1). get the hidden state of last layer, last time step
        
        # predict the future
        pred = []
        for i in range(self.pred_step):
            # sequentially pred future
            p_tmp = self.network_pred(hidden)
            pred.append(p_tmp)
            _, hidden = self.agg(self.relu(p_tmp).unsqueeze(1), hidden.unsqueeze(0))
            hidden = hidden[:,-1,:]
        pred = torch.stack(pred, 1) # B, pred_step, xxx
        del hidden


        ### Get similarity score ###
        # pred: [B, pred_step, D, last_size, last_size]
        # GT: [B, N, D, last_size, last_size]
        N = self.pred_step
        # dot product D dimension in pred-GT pair, get a 6d tensor. First 3 dims are from pred, last 3 dims are from GT. 
        pred = pred.permute(0,1,3,4,2).contiguous().view(B*self.pred_step*self.last_size**2, self.param['feature_size'])
        feature_inf = feature_inf.permute(0,1,3,4,2).contiguous().view(B*N*self.last_size**2, self.param['feature_size']).transpose(0,1)
        score = torch.matmul(pred, feature_inf).view(B, self.pred_step, self.last_size**2, B, N, self.last_size**2)
        del feature_inf, pred

        if self.mask is None: # only compute mask once
            # mask meaning: -2: omit, -1: temporal neg (hard), 0: easy neg, 1: pos, -3: spatial neg
            mask = torch.zeros((B, self.pred_step, self.last_size**2, B, N, self.last_size**2), dtype=torch.int8, requires_grad=False).detach().cuda()
            mask[torch.arange(B), :, :, torch.arange(B), :, :] = -3 # spatial neg
            for k in range(B):
                mask[k, :, torch.arange(self.last_size**2), k, :, torch.arange(self.last_size**2)] = -1 # temporal neg
            tmp = mask.permute(0, 2, 1, 3, 5, 4).contiguous().view(B*self.last_size**2, self.pred_step, B*self.last_size**2, N)
            for j in range(B*self.last_size**2):
                tmp[j, torch.arange(self.pred_step), j, torch.arange(N-self.pred_step, N)] = 1 # pos
            mask = tmp.view(B, self.last_size**2, self.pred_step, B, self.last_size**2, N).permute(0,2,1,3,5,4)
            self.mask = mask

        return [score, self.mask]

    def _initialize_weights(self, module):
        for name, param in module.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.orthogonal_(param, 1)
        # other resnet weights have been initialized in resnet itself

    def reset_mask(self):
        self.mask = None

In [1]:
from dataset_atari import Atari
import gym
import sys
sys.path.append('../d4rl-atari')
import d4rl_atari

try:
    del env
except:
    pass
env = gym.make('boxing-expert-v0', stack=True)
dataset = env.get_dataset(n_channels=4)

atari = Atari(dataset=dataset)

loading /home/yiranwang/.d4rl/datasets/Boxing/1/50/observation.gz...
loading /home/yiranwang/.d4rl/datasets/Boxing/1/50/action.gz...
loading /home/yiranwang/.d4rl/datasets/Boxing/1/50/reward.gz...
loading /home/yiranwang/.d4rl/datasets/Boxing/1/50/terminal.gz...
total trajactories : 969


In [5]:
import numpy as np

a = np.array([1, 2, 3, 4, 5, 6, 7],dtype=np.int)
i = np.array([2, 4, 5])
print(a)
print(a[i])

[1 2 3 4 5 6 7]
[3 5 6]


In [4]:
atari = Atari(dataset=dataset, return_actions=True)

total trajactories : 969


In [4]:
actions = dataset['actions']
actions.shape

print(type(actions))

observations = dataset['observations']

<class 'numpy.ndarray'>


In [9]:
import numpy as np
seq_start = 0

index = np.array(range(seq_start, seq_start + 6, 4), dtype=np.int)


observations[index]

TypeError: only integer scalar arrays can be converted to a scalar index

In [10]:
print(type(observations))
print(type(observations[0]))

<class 'list'>
<class 'numpy.ndarray'>


In [6]:
atari.__getitem__(index=0)[1].shape

torch.Size([0])

In [2]:
import numpy as np
a = np.array(range(20))
print(a)

print(a[1:20:3])

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[ 1  4  7 10 13 16 19]


In [6]:
import os
import sys
import time
import re
import argparse
import numpy as np
from tqdm import tqdm
from tensorboardX import SummaryWriter
import matplotlib.pyplot as plt
plt.switch_backend('agg')

sys.path.append('../utils')
from dataset_3d import *
from model_3d import *
from resnet_2d3d import neq_load_customized
from augmentation import *
from utils import AverageMeter, save_checkpoint, denorm, calc_topk_accuracy

import torch
import torch.optim as optim
from torch.utils import data
from torchvision import datasets, models, transforms
import torchvision.utils as vutils
torch.backends.cudnn.benchmark = True

In [7]:
parser = argparse.ArgumentParser()
parser.add_argument('--net', default='resnet18', type=str)
parser.add_argument('--model', default='dpc-rnn', type=str)
parser.add_argument('--dataset', default='ucf101', type=str)
parser.add_argument('--seq_len', default=5, type=int, help='number of frames in each video block')
parser.add_argument('--num_seq', default=8, type=int, help='number of video blocks')
parser.add_argument('--pred_step', default=3, type=int)
parser.add_argument('--ds', default=3, type=int, help='frame downsampling rate')
parser.add_argument('--batch_size', default=4, type=int)
parser.add_argument('--lr', default=1e-3, type=float, help='learning rate')
parser.add_argument('--wd', default=1e-5, type=float, help='weight decay')
parser.add_argument('--resume', default='', type=str, help='path of model to resume')
parser.add_argument('--pretrain', default='', type=str, help='path of pretrained model')
parser.add_argument('--epochs', default=10, type=int, help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, help='manual epoch number (useful on restarts)')
parser.add_argument('--gpu', default='0,1', type=str)
parser.add_argument('--print_freq', default=5, type=int, help='frequency of printing output during training')
parser.add_argument('--reset_lr', action='store_true', help='Reset learning rate when resume training?')
parser.add_argument('--prefix', default='tmp', type=str, help='prefix of checkpoint filename')
parser.add_argument('--train_what', default='all', type=str)
parser.add_argument('--img_dim', default=128, type=int)

_StoreAction(option_strings=['--img_dim'], dest='img_dim', nargs=None, const=None, default=128, type=<class 'int'>, choices=None, help=None, metavar=None)

In [8]:
torch.manual_seed(0)
np.random.seed(0)
global args; args = parser.parse_known_args()[0]
os.environ["CUDA_VISIBLE_DEVICES"]=str(args.gpu)
global cuda; cuda = torch.device('cuda')

best_acc = 0
global iteration; iteration = 0

if args.model == 'dpc-rnn':
    model = DPC_RNN(sample_size=args.img_dim, 
                    num_seq=args.num_seq, 
                    seq_len=args.seq_len, 
                    network=args.net, 
                    pred_step=args.pred_step)
else: raise ValueError('wrong model!')

Using DPC-RNN model
final feature map has size 4x4


In [13]:
from torch.utils import data

sampler = data.RandomSampler(dataset)

data_loader = data.DataLoader(atari,
                             batch_size=4,
                             sampler=sampler,
                             shuffle=False,
                             num_workers=4,
                             pin_memory=True,
                             drop_last=True)

In [15]:
for idx, X in enumerate(data_loader):
    data = X
    index = idx
    break
    
print(X.shape)

torch.Size([4, 6, 5, 84, 84])


In [4]:
atari[0].shape

torch.Size([6, 5, 84, 84])