In [1]:
!pwd

/home/kionkim/work/stat-story/Soft_Decision_Tree


In [1]:
import os
import time
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from tqdm import tqdm, tqdm_notebook

In [2]:
args = {}
args['batch_size'] = 64
args['epochs'] = 40
args['input_dim'] = 784
args['lmbda'] = 0.1
args['log_interval'] = 10
args['lr'] = 0.01
args['max_depth'] = 2
args['momentum'] = 0.5
args['no_cuda'] = False
args['cuda'] = True
args['output_dim'] = 10
args['seed'] = 1

In [3]:
class InnerNode():

    def __init__(self, depth, **kwargs):
        self.args = kwargs
        
        self.fc = nn.Linear(self.args['input_dim'], 1)
        beta = torch.randn(1)
        #beta = beta.expand((self.args.batch_size, 1))
        if self.args['cuda']:
            beta = beta.cuda()
        self.beta = nn.Parameter(beta)
        self.leaf = False
        self.prob = None
        self.leaf_accumulator = []
        self.lmbda = self.args['lmbda'] * 2 ** (-depth)
        self.build_child(depth)
        self.penalties = []

    def reset(self):
        self.leaf_accumulator = []
        self.penalties = []
        self.left.reset()
        self.right.reset()

    def build_child(self, depth):
        if depth < self.args['max_depth']:
            self.left = InnerNode(depth+1, **self.args)
            self.right = InnerNode(depth+1, **self.args)
        else :
            self.left = LeafNode(self.args)
            self.right = LeafNode(self.args)

    def forward(self, x):
        return(F.sigmoid(self.beta*self.fc(x)))
    
    def select_next(self, x):
        prob = self.forward(x)
        if prob < 0.5:
            return(self.left, prob)
        else:
            return(self.right, prob)

    def cal_prob(self, x, path_prob):
        self.data = x
        self.prob = self.forward(x) #probability of selecting right node
        self.path_prob = path_prob
        left_leaf_accumulator = self.left.cal_prob(x, path_prob * (1-self.prob))
        right_leaf_accumulator = self.right.cal_prob(x, path_prob * self.prob)
        self.leaf_accumulator.extend(left_leaf_accumulator)
        self.leaf_accumulator.extend(right_leaf_accumulator)
        return(self.leaf_accumulator)

    def get_penalty(self):
        penalty = (torch.sum(self.prob * self.path_prob) / torch.sum(self.path_prob), self.lmbda)
        if not self.left.leaf:
            left_penalty = self.left.get_penalty()
            right_penalty = self.right.get_penalty()
            self.penalties.append(penalty)
            self.penalties.extend(left_penalty)
            self.penalties.extend(right_penalty)
        return(self.penalties)

In [4]:
class LeafNode():
    def __init__(self, args):
        self.args = args
        self.param = torch.randn(self.args['output_dim'])
        if self.args['cuda']:
            self.param = self.param.cuda()
        self.param = nn.Parameter(self.param)
        self.leaf = True
        self.softmax = nn.Softmax()

    def forward(self):
        return(self.softmax(self.param.view(1,-1)))

    def reset(self):
        pass

    def cal_prob(self, x, path_prob):
        self.data = x
        Q = self.forward()
        #Q = Q.expand((self.args.batch_size, self.args.output_dim))
        Q = Q.expand((path_prob.size()[0], self.args['output_dim']))
        return([[path_prob, Q]])

In [5]:
class SoftDecisionTree(nn.Module):

    def __init__(self, **kwargs):
        super(SoftDecisionTree, self).__init__()
        self.args = kwargs
        self.root = InnerNode(1, **self.args)
        self.collect_parameters() ##collect parameters and modules under root node
        self.optimizer = optim.SGD(self.parameters(), lr=self.args['lr'], momentum=self.args['momentum'])
        self.test_acc = []
        self.define_extras(self.args['batch_size'])
        self.best_accuracy = 0.0

    def define_extras(self, batch_size):
        ##define target_onehot and path_prob_init batch size, because these need to be defined according to batch size, which can be differ
        self.target_onehot = torch.FloatTensor(batch_size, self.args['output_dim'])
        self.target_onehot = Variable(self.target_onehot)
        self.path_prob_init = Variable(torch.ones(batch_size, 1))
        if self.args['cuda']:
            self.target_onehot = self.target_onehot.cuda()
            self.path_prob_init = self.path_prob_init.cuda()
    '''
    def forward(self, x):
        node = self.root
        path_prob = Variable(torch.ones(self.args.batch_size, 1))
        while not node.leaf:
            node, prob = node.select_next(x)
            path_prob *= prob
        return node()
    '''        
    def cal_loss(self, x, y):
        batch_size = y.size()[0]
        leaf_accumulator = self.root.cal_prob(x, self.path_prob_init)
        loss = 0.
        max_prob = [-1. for _ in range(batch_size)]
        max_Q = [torch.zeros(self.args['output_dim']) for _ in range(batch_size)]
        for (path_prob, Q) in leaf_accumulator:
            TQ = torch.bmm(y.view(batch_size, 1, self.args['output_dim']), torch.log(Q).view(batch_size, self.args['output_dim'], 1)).view(-1,1)
            loss += path_prob * TQ
            path_prob_numpy = path_prob.cpu().data.numpy().reshape(-1)
            for i in range(batch_size):
                if max_prob[i] < path_prob_numpy[i]:
                    max_prob[i] = path_prob_numpy[i]
                    max_Q[i] = Q[i]
        loss = loss.mean()
        penalties = self.root.get_penalty()
        C = 0.
        for (penalty, lmbda) in penalties:
            C -= lmbda * 0.5 *(torch.log(penalty) + torch.log(1-penalty))
        output = torch.stack(max_Q)
        self.root.reset() ##reset all stacked calculation
        return(-loss + C, output) ## -log(loss) will always output non, because loss is always below zero. I suspect this is the mistake of the paper?

    def collect_parameters(self):
        nodes = [self.root]
        self.module_list = nn.ModuleList()
        self.param_list = nn.ParameterList()
        while nodes:
            node = nodes.pop(0)
            if node.leaf:
                param = node.param
                self.param_list.append(param)
            else:
                fc = node.fc
                beta = node.beta
                nodes.append(node.right)
                nodes.append(node.left)
                self.param_list.append(beta)
                self.module_list.append(fc)

    def train_(self, train_loader, epoch):
        self.train()
        self.define_extras(self.args['batch_size'])
        for batch_idx, (data, target) in enumerate(train_loader):
            correct = 0
            if self.args['cuda']:
                data, target = data.cuda(), target.cuda()
            #data = data.view(self.args.batch_size,-1)
            target = Variable(target)
            target_ = target.view(-1,1)
            batch_size = target_.size()[0]
            data = data.view(batch_size,-1)
            ##convert int target to one-hot vector
            data = Variable(data)
            if not batch_size == self.args['batch_size']: #because we have to initialize parameters for batch_size, tensor not matches with batch size cannot be trained
                self.define_extras(batch_size)
            self.target_onehot.data.zero_()            
            self.target_onehot.scatter_(1, target_, 1.)
            self.optimizer.zero_grad()

            loss, output = self.cal_loss(data, self.target_onehot)
            loss.backward(retain_variables=True)
            self.optimizer.step()
            pred = output.data.max(1)[1] # get the index of the max log-probability
            correct += pred.eq(target.data).cpu().sum()
            accuracy = 100. * correct / len(data)

            if batch_idx % self.args['log_interval'] == 0:
                tqdm.write('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accuracy: {}/{} ({:.4f}%)'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.data[0],
                    correct, len(data),
                    accuracy))

    def test_(self, test_loader, epoch):
        self.eval()
        self.define_extras(self.args['batch_size'])
        test_loss = 0
        correct = 0
        for data, target in test_loader:
            if self.args['cuda']:
                data, target = data.cuda(), target.cuda()
            target = Variable(target)
            target_ = target.view(-1,1)
            batch_size = target_.size()[0]
            data = data.view(batch_size,-1)
            ##convert int target to one-hot vector
            data = Variable(data)
            if not batch_size == self.args['batch_size']: #because we have to initialize parameters for batch_size, tensor not matches with batch size cannot be trained
                self.define_extras(batch_size)
            self.target_onehot.data.zero_()            
            self.target_onehot.scatter_(1, target_, 1.)
            _, output = self.cal_loss(data, self.target_onehot)
            pred = output.data.max(1)[1] # get the index of the max log-probability
            correct += pred.eq(target.data).cpu().sum()
        accuracy = 100. * correct / len(test_loader.dataset)
        tqdm.write('\nTest set: Accuracy: {}/{} ({:.4f}%)\n'.format(
            correct, len(test_loader.dataset),
            accuracy))
        self.test_acc.append(accuracy)

        if accuracy > self.best_accuracy:
            self.save_best('./result')
            self.best_accuracy = accuracy

    def save_best(self, path):
        try:
            os.makedirs('./result')
        except:
            print('directory ./result already exists')

        with open(os.path.join(path, 'best_model.pkl'), 'wb') as output_file:
            pickle.dump(self, output_file)

In [6]:
torch.manual_seed(args['seed'])
if args['cuda']:
    torch.cuda.manual_seed(args['seed'])

try:
    os.makedirs('./data')
except:
    print('directory ./data already exists')

directory ./data already exists


In [7]:
import torch
import torch.utils.data
from torchvision import datasets, transforms

kwargs = {'num_workers': 1, 'pin_memory': True} if args['cuda'] else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args['batch_size'], shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=args['batch_size'], shuffle=True, **kwargs)


In [8]:
model = SoftDecisionTree(**args)
model

SoftDecisionTree (
  (module_list): ModuleList (
    (0): Linear (784 -> 1)
    (1): Linear (784 -> 1)
    (2): Linear (784 -> 1)
  )
  (param_list): ParameterList (
  )
)

In [125]:
output.data.max(1)

(
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
  0.3449
 [torch.cuda.FloatTensor of size 64 (GPU 0)], 
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
  6
 [torch.cuda.LongTensor of size 64 (GPU 0)])

In [10]:
if args['cuda']:
    model.cuda()

for epoch in tqdm_notebook(range(args['epochs']), desc='epochs'):
    model.train_(train_loader, epoch)
    model.test_(test_loader, epoch)
  
save_result()

A Jupyter Widget




Test set: Accuracy: 3796/10000 (37.9600%)

directory ./result already exists



Test set: Accuracy: 3861/10000 (38.6100%)

directory ./result already exists



Test set: Accuracy: 3874/10000 (38.7400%)

directory ./result already exists



Test set: Accuracy: 3890/10000 (38.9000%)

directory ./result already exists



Test set: Accuracy: 3885/10000 (38.8500%)




Test set: Accuracy: 3893/10000 (38.9300%)

directory ./result already exists



Test set: Accuracy: 3888/10000 (38.8800%)




Test set: Accuracy: 3888/10000 (38.8800%)




Test set: Accuracy: 3884/10000 (38.8400%)


Test set: Accuracy: 3891/10000 (38.9100%)




Test set: Accuracy: 3892/10000 (38.9200%)




Test set: Accuracy: 3886/10000 (38.8600%)




Test set: Accuracy: 3886/10000 (38.8600%)




Test set: Accuracy: 3882/10000 (38.8200%)




Test set: Accuracy: 3891/10000 (38.9100%)




Test set: Accuracy: 3901/10000 (39.0100%)

directory ./result already exists



Test set: Accuracy: 3896/10000 (38.9600%)




Test set: Accuracy: 3890/10000 (38.9000%)


Test set: Accuracy: 3894/10000 (38.9400%)




Test set: Accuracy: 3892/10000 (38.9200%)




Test set: Accuracy: 3895/10000 (38.9500%)




Test set: Accuracy: 3894/10000 (38.9400%)




Test set: Accuracy: 3888/10000 (38.8800%)




Test set: Accuracy: 3891/10000 (38.9100%)




Test set: Accuracy: 3886/10000 (38.8600%)




Test set: Accuracy: 3894/10000 (38.9400%)




Test set: Accuracy: 3890/10000 (38.9000%)




Test set: Accuracy: 3893/10000 (38.9300%)




Test set: Accuracy: 3896/10000 (38.9600%)


Test set: Accuracy: 3890/10000 (38.9000%)




Test set: Accuracy: 3894/10000 (38.9400%)




Test set: Accuracy: 3900/10000 (39.0000%)




Test set: Accuracy: 3894/10000 (38.9400%)




Test set: Accuracy: 3895/10000 (38.9500%)




Test set: Accuracy: 3885/10000 (38.8500%)




Test set: Accuracy: 3893/10000 (38.9300%)




Test set: Accuracy: 3896/10000 (38.9600%)




Test set: Accuracy: 3894/10000 (38.9400%)




Test set: Accuracy: 3891/10000 (38.9100%)


Test set: Accuracy: 3896/10000 (38.9600%)




NameError: name 'save_result' is not defined

#### Examine

In [15]:
# data 가지고 오기
for i, d in enumerate(train_loader):
    dt = d
    if i ==1:
        break

In [16]:
data = dt[0]
target = dt[1]
data = data.cuda()

In [17]:
# Check shape
dt[0].shape #torch.Size([64, 1, 28, 28])
type(dt[0]) # torch.FloatTensor

dt[1].shape # torch.Size([64])
type(dt[1]) #torch.LongTensor

torch.LongTensor

In [18]:
target = Variable(target) # transform into Variable
target.size() #target.shape이 먹히지 않음... torch.Size([64])
type(target) #torch.autograd.variable.Variable

torch.autograd.variable.Variable

In [19]:
target_ = target.view(-1,1) # 2차원에 dimension을 하나 더함
target_.size() #torch.Size([64, 1])

torch.Size([64, 1])

In [20]:
type(target_)

torch.autograd.variable.Variable

In [21]:
batch_size = target_.size()[0] # 64
target_onehot = torch.FloatTensor(batch_size, args['output_dim'])
# NOTE: 평균 torch.mean, 분산 torch.var
target_onehot.shape #torch.Size([64, 10])

target_onehot = Variable(target_onehot)
target_onehot.size() #torch.Size([64, 10])

# Manually zero the gradients after running the backward pass
target_onehot.data.zero_()
# Cuda variable과 CPU variable 구분이 중요
target_onehot.scatter_(1, target_, 1.)
target = target.cuda()

In [22]:
data = data.view(batch_size, -1)
data.size() # data.shape과 같은 결과... torch.Size([64, 784])
data = Variable(data)
data.size() #torch.Size([64, 784])

torch.Size([64, 784])

In [23]:
data #[torch.cuda.FloatTensor of size 64x784 (GPU 0)]
target #[torch.cuda.LongTensor of size 64 (GPU 0)]

Variable containing:
 3
 2
 9
 8
 8
 4
 2
 6
 0
 9
 0
 2
 5
 8
 2
 6
 4
 2
 9
 2
 3
 3
 4
 3
 0
 1
 7
 6
 3
 3
 8
 0
 5
 5
 4
 3
 0
 8
 6
 4
 1
 1
 9
 1
 2
 1
 1
 9
 3
 8
 2
 5
 2
 3
 9
 5
 4
 2
 3
 9
 5
 8
 5
 3
[torch.cuda.LongTensor of size 64 (GPU 0)]

In [61]:
x =data
y = target_onehot
y = y.cuda()

In [102]:
batch_size = y.size()[0]
leaf_accumulator = model.root.cal_prob(x, model.path_prob_init)
loss = 0.
max_prob = [-1. for _ in range(batch_size)]
max_Q = [torch.zeros(args['output_dim']) for _ in range(batch_size)]

In [81]:
Q = leaf_accumulator[0][1]

In [117]:
Q

Variable containing:
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.

In [104]:
path_prob_numpy

array([ 0.1324345 ,  0.21761641,  0.196484  ,  0.03086069,  0.1792316 ,
        0.12814821,  0.06910293,  0.06686758,  0.13109507,  0.05495748,
        0.270551  ,  0.15169108,  0.11298319,  0.089144  ,  0.2516799 ,
        0.22766237,  0.19082129,  0.20981511,  0.24587752,  0.33108386,
        0.08872376,  0.03934519,  0.06537192,  0.1690933 ,  0.11628345,
        0.24548066,  0.29202336,  0.06440445,  0.14188954,  0.1003928 ,
        0.1047959 ,  0.07431797,  0.11409755,  0.09502421,  0.10942503,
        0.1919713 ,  0.07240174,  0.11217692,  0.16305202,  0.17078111,
        0.32150209,  0.12721629,  0.09401006,  0.40673721,  0.14356177,
        0.31448159,  0.25075132,  0.15921967,  0.17536296,  0.04047649,
        0.11600545,  0.16481802,  0.23023495,  0.12552118,  0.20719177,
        0.10737529,  0.22184153,  0.11038724,  0.23680636,  0.23403534,
        0.16647011,  0.11629023,  0.36176962,  0.1995648 ], dtype=float32)

In [103]:
for i in range(batch_size):
        print('i = {} max_prob = {}'.format(i, max_prob[i]))
        print('i = {} path_prob_numpy = {}'.format(i, path_prob_numpy[i]))
        print('i = {} Q = {}'.format(i, Q[i]))
        print('i = {} max_Q = {}'.format(i, max_Q[i]))
        if max_prob[i] < path_prob_numpy[i]:
            max_prob[i] = path_prob_numpy[i]
            max_Q[i] = Q[i]
            

i = 0 max_prob = -1.0
i = 0 path_prob_numpy = 0.13243450224399567
i = 0 Q = Variable containing:
 0.0078
 0.1170
 0.0253
 0.1556
 0.3831
 0.0326
 0.0824
 0.0438
 0.0583
 0.0940
[torch.cuda.FloatTensor of size 10 (GPU 0)]

i = 0 max_Q = 
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 10]

i = 1 max_prob = -1.0
i = 1 path_prob_numpy = 0.21761640906333923
i = 1 Q = Variable containing:
 0.0078
 0.1170
 0.0253
 0.1556
 0.3831
 0.0326
 0.0824
 0.0438
 0.0583
 0.0940
[torch.cuda.FloatTensor of size 10 (GPU 0)]

i = 1 max_Q = 
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 10]

i = 2 max_prob = -1.0
i = 2 path_prob_numpy = 0.19648399949073792
i = 2 Q = Variable containing:
 0.0078
 0.1170
 0.0253
 0.1556
 0.3831
 0.0326
 0.0824
 0.0438
 0.0583
 0.0940
[torch.cuda.FloatTensor of size 10 (GPU 0)]

i = 2 max_Q = 
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 10]

i = 3 max_prob = -1.0
i = 3 path_prob_numpy = 0.03086068667471409
i = 3 Q = Variable containing:
 0.007

In [116]:
torch.stack(max_Q)

Variable containing:
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.0253  0.1556  0.3831  0.0326  0.0824  0.0438  0.0583  0.0940
 0.0078  0.1170  0.

In [113]:
model

SoftDecisionTree (
  (module_list): ModuleList (
    (0): Linear (784 -> 1)
    (1): Linear (784 -> 1)
    (2): Linear (784 -> 1)
  )
  (param_list): ParameterList (
  )
)

In [88]:
loss = 0.
for (path_prob, Q) in leaf_accumulator:
    TQ = torch.bmm(y.view(batch_size, 1, args['output_dim']), torch.log(Q).view(batch_size, args['output_dim'], 1)).view(-1,1)
    loss += path_prob * TQ
    path_prob_numpy = path_prob.cpu().data.numpy().reshape(-1)
    for i in range(batch_size):
        if max_prob[i] < path_prob_numpy[i]:
            max_prob[i] = path_prob_numpy[i]
            max_Q[i] = Q[i]
    loss = loss.mean()
    penalties = model.root.get_penalty()
    C = 0.
    for (penalty, lmbda) in penalties:
        C -= lmbda * 0.5 *(torch.log(penalty) + torch.log(1-penalty))
    output = torch.stack(max_Q)

RuntimeError: invalid argument 3: sizes do not match at /opt/conda/conda-bld/pytorch_1503970438496/work/torch/lib/THC/generated/../generic/THCTensorMathPointwise.cu:217

In [24]:
model = SoftDecisionTree(**args)
model.cuda()
model.cal_loss(data, target)

RuntimeError: invalid argument 2: size '[64 x 1 x 10]' is invalid for input of with 64 elements at /opt/conda/conda-bld/pytorch_1503970438496/work/torch/lib/TH/THStorage.c:41

In [57]:
path_prob_init = Variable(torch.ones(64, 1))
path_prob_init = path_prob_init.cuda()
root = InnerNode(1, **args)
root.cuda()
leaf_accumulator = root.cal_prob(data, path_prob_init)

AttributeError: 'InnerNode' object has no attribute 'cuda'