In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
from torch.autograd import Function
from torch.autograd import Variable
import torch.nn as nn
import torch.distributions as distribs

In [2]:
# Inherit from Function
class LinearFunction(Function):
    """Affine map ``input.mm(weight.t()) (+ bias)`` with a hand-written backward.

    Call it through ``LinearFunction.apply(input, weight, bias)``; autograd
    supplies the ``ctx`` argument of ``forward`` and ``backward``.
    """

    # Note that both forward and backward are @staticmethods
    @staticmethod
    def forward(ctx, input, weight, bias=None):
        """Compute the matrix product and add ``bias`` to every output row.

        Args:
            ctx: autograd context; used to stash tensors for ``backward``.
            input: 2-D tensor (``mm`` requires matrices).
            weight: 2-D tensor, transposed before the product.
            bias: optional 1-D tensor broadcast over the rows of the
                output. ``None`` skips the addition.

        Returns:
            The 2-D tensor ``input.mm(weight.t())``, plus ``bias`` if given.
        """
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradients w.r.t. ``input``, ``weight`` and ``bias``.

        Args:
            ctx: autograd context filled in by ``forward``.
            grad_output: gradient of the loss w.r.t. the forward output.

        Returns:
            A 3-tuple ``(grad_input, grad_weight, grad_bias)``. An entry is
            ``None`` when the matching input does not need a gradient (or,
            for the bias, when no bias was passed).
        """
        # `saved_tensors` replaces the deprecated `saved_variables` accessor.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. Returning gradients for inputs that don't
        # require it is not an error.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and ctx.needs_input_grad[2]:
            # Summing over dim 0 already yields a 1-D tensor shaped like the
            # bias. A further squeeze(0) would collapse a single-feature
            # gradient to 0-dim, which no longer matches the bias shape.
            grad_bias = grad_output.sum(0)

        return grad_input, grad_weight, grad_bias

In [10]:
from torch.autograd import gradcheck

# gradcheck receives a tuple of tensors, compares the analytical gradients
# evaluated at those tensors against numerical approximations, and returns
# True when all of them are close enough. Both tensors are double precision.
# raise_exception=False makes a failed check return False instead of raising.
input = Variable(torch.randn(20, 20).double(), requires_grad=True)
weight = Variable(torch.randn(30, 20).double(), requires_grad=True)
check_inputs = (input, weight)
test = gradcheck(LinearFunction.apply, check_inputs, raise_exception=False)
print(test)

True


In [4]:
class Linear(nn.Module):
    """Module wrapper around ``LinearFunction`` holding weight and bias.

    Args:
        input_features: number of columns of the weight matrix.
        output_features: number of rows of the weight matrix, and the
            length of the bias vector.
        bias: when true (the default) a bias parameter is created; when
            false the ``bias`` parameter is registered as ``None``.
    """

    def __init__(self, input_features, output_features, bias=True):
        super(Linear, self).__init__()
        self.input_features = input_features
        self.output_features = output_features

        # nn.Parameter gets registered automatically as a parameter of the
        # module once it is assigned as an attribute, so it shows up in
        # .parameters() and is converted by e.g. .cuda().
        self.weight = nn.Parameter(torch.Tensor(output_features, input_features))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(output_features))
        else:
            # You should always register all possible parameters, but the
            # optional ones can be None if you want.
            self.register_parameter('bias', None)

        # Not a very smart way to initialize weights
        self.weight.data.uniform_(-0.1, 0.1)
        # Test the registered parameter, not the boolean constructor flag:
        # the flag is never None, so checking it would dereference a None
        # bias whenever the layer is built with bias=False.
        if self.bias is not None:
            self.bias.data.uniform_(-0.1, 0.1)

    def forward(self, input):
        """Apply ``LinearFunction`` to ``input`` with this module's parameters."""
        return LinearFunction.apply(input, self.weight, self.bias)

In [13]:
# Build the custom layer (20 input features, 3 output features) and run a
# random batch of 128 rows through it; the last line displays the result.
MyLinear = Linear(input_features=20, output_features=3)
batch_shape = (128, 20)
input = Variable(torch.randn(*batch_shape))
output = MyLinear(input)
output

Variable containing:
-0.2661  0.1846  0.2349
-0.0276 -0.2423  0.1566
 0.1213 -0.2678  0.3188
-0.2172  0.0197 -0.1826
-0.3477 -0.0759  0.0739
-0.0835 -0.4454  0.2019
-0.5255  0.0957  0.3214
 0.3367 -0.5961 -0.0632
-0.0468  0.3601 -0.1715
 0.4400 -0.6126  0.0687
 0.1707  0.1458  0.0840
-0.0321 -0.6874 -0.0445
-0.6224 -0.3587  0.3448
-0.0058  0.0234  0.0307
-0.1892  0.0536  0.2159
 0.1473  0.1312  0.0178
-0.4800  0.0551  0.4445
 0.0812 -0.2431  0.2469
-0.0938  0.0541  0.2484
 0.0072 -0.0870 -0.0755
-0.0036 -0.1731 -0.0166
-0.2493 -0.4714 -0.1641
-0.3201  0.2340  0.3886
-0.2374  0.0968  0.0951
 0.3261  0.3388 -0.0971
-0.1316  0.2211  0.1831
-0.0044 -0.2310 -0.0230
-0.0760 -0.0983 -0.1503
 0.2486 -0.2221  0.0081
-0.3169  0.0994  0.2958
 0.3289 -0.4454  0.1390
-0.1853  0.0086  0.3263
-0.5287  0.0671  0.0712
-0.2075 -0.1085  0.6731
 0.2755 -0.1900 -0.2789
 0.0627  0.1102 -0.0225
-0.4621  0.3226  0.2127
-0.1452  0.0555  0.3818
-0.2450  0.0927  0.3577
 0.2286 -0.3060 -0.1668
 0.1853  0.2147 -0.

In [18]:
# Inherit from Function
class NLLProbTargetFunction(Function):
    """Negative log-likelihood against a per-row target weighting.

    ``forward`` computes ``-(target * logsoftmax).sum(dim=1).mean(dim=0)``.
    Call it through ``NLLProbTargetFunction.apply(target, logsoftmax)``.
    """

    # Note that both forward and backward are @staticmethods
    @staticmethod
    def forward(ctx, target, logsoftmax):
        """Return the mean over rows of ``-sum_j target[i, j] * logsoftmax[i, j]``.

        Args:
            ctx: autograd context; used to stash tensors for ``backward``.
            target: tensor multiplied element-wise with ``logsoftmax``.
            logsoftmax: tensor with at least two dimensions (the product is
                summed over ``dim=1``).

        Returns:
            The loss reduced over ``dim=1`` (sum) and then ``dim=0`` (mean).
        """
        ctx.save_for_backward(target, logsoftmax)
        output = -torch.mul(target, logsoftmax).sum(dim=1)
        output = output.mean(dim=0)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradients w.r.t. ``target`` and ``logsoftmax``.

        Args:
            ctx: autograd context filled in by ``forward``.
            grad_output: gradient of the loss w.r.t. the forward output.

        Returns:
            A 2-tuple ``(grad_target, grad_logsoftmax)``; an entry is
            ``None`` when the matching input does not need a gradient.
        """
        # `saved_tensors` replaces the deprecated `saved_variables` accessor.
        target, logsoftmax = ctx.saved_tensors
        # The forward mean runs over dim 0, so every element is scaled by
        # one over the number of rows.
        length = target.size(0)
        grad_target = grad_logsoftmax = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency.
        if ctx.needs_input_grad[0]:
            grad_target = -(logsoftmax / length) * grad_output
        if ctx.needs_input_grad[1]:
            grad_logsoftmax = -(target / length) * grad_output
        return grad_target, grad_logsoftmax

In [19]:
from torch.autograd import gradcheck

# gradcheck takes a tuple of tensors as input, checks whether the gradients
# evaluated with these tensors are close enough to numerical approximations,
# and returns True if they all satisfy this condition.
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input = (
    Variable(torch.randn(5, 3).double().to(device), requires_grad=True),
    Variable(torch.randn(5, 3).double().to(device), requires_grad=True),
)
test = gradcheck(NLLProbTargetFunction.apply, input, eps=1e-6, atol=1e-4)
print(test)

True


In [20]:
class NLLProbTarget(nn.Module):
    """Module wrapper around ``NLLProbTargetFunction``.

    The class defines no ``__init__`` of its own and registers no
    parameters; it only hands its arguments to the autograd function.
    """

    def forward(self, target, logsoftmax):
        """Return ``NLLProbTargetFunction.apply(target, logsoftmax)``."""
        loss = NLLProbTargetFunction.apply(target, logsoftmax)
        return loss

In [21]:
def one_hot(size, index):
    """Create a float matrix of one-hot rows.

    Args:
        size: iterable with the shape of the result, e.g. ``(rows, classes)``.
        index: integer (long) tensor used as the ``scatter_`` index along
            dimension 1. For one-hot rows pass shape ``(rows, 1)``, each
            entry naming the column to set in that row.

    Returns:
        A ``float32`` tensor of shape ``size``, zero everywhere except for
        ones at the positions selected by ``index``. It is created on the
        same device as ``index``.

    Example::

        index = torch.LongTensor([2, 0, 1]).view(-1, 1)
        one_hot((3, 3), index)
        # [[0, 0, 1], [1, 0, 0], [0, 1, 0]]
    """
    # Allocate next to `index` so CUDA indices work; the removed `volatile`
    # flag is no longer consulted.
    mask = torch.zeros(*size, dtype=torch.float32, device=index.device)
    return mask.scatter_(1, index, 1.0)

In [22]:
# One-hot targets for 5 samples over 10 classes, random scores that require
# gradients, and an instance of the custom loss module.
class_ids = torch.LongTensor([1, 2, 3, 4, 0]).view(-1, 1)
target = Variable(one_hot((5, 10), class_ids), requires_grad=False)
logsoftmax = Variable(torch.randn(5, 10), requires_grad=True)
Mynll = NLLProbTarget()

In [23]:
# Display the one-hot target matrix created in the previous cell.
target

Variable containing:
    0     1     0     0     0     0     0     0     0     0
    0     0     1     0     0     0     0     0     0     0
    0     0     0     1     0     0     0     0     0     0
    0     0     0     0     1     0     0     0     0     0
    1     0     0     0     0     0     0     0     0     0
[torch.FloatTensor of size 5x10]

In [24]:
# Display the random scores; at this point no log-softmax has been applied.
logsoftmax

Variable containing:
-0.1228  0.4064  0.0809  0.3064 -1.5306 -0.6408  0.0616  1.3632 -0.1728  1.5466
 0.5716  0.0561 -1.2361 -0.9009  4.0136 -0.2435 -0.5009  0.3759 -0.6373 -1.4164
-0.8670  1.2151 -0.8476  0.9116  0.2084  1.5593  0.5596 -0.2088 -0.2560 -2.5845
 0.2858  2.4982  0.7395  0.1254 -1.7044  0.3866  0.3665  1.1763  0.1173  0.5844
-0.2326  0.0557  0.6567 -0.7532  1.3934 -1.5826  2.9197 -0.2702 -1.9872 -0.3143
[torch.FloatTensor of size 5x10]

In [25]:
# Apply log-softmax over dim 1 of the random scores.
# Note: this rebinds `logsoftmax`, which held the raw random scores until
# now; from here on the name refers to the log-softmax output.
LogSoftmax = nn.LogSoftmax(dim=1)
logsoftmax = LogSoftmax(logsoftmax)
logsoftmax

Variable containing:
-2.9004 -2.3712 -2.6968 -2.4712 -4.3083 -3.4184 -2.7160 -1.4144 -2.9504 -1.2310
-3.5634 -4.0789 -5.3711 -5.0359 -0.1214 -4.3785 -4.6359 -3.7591 -4.7723 -5.5514
-3.6462 -1.5641 -3.6268 -1.8676 -2.5707 -1.2199 -2.2196 -2.9880 -3.0352 -5.3637
-2.9714 -0.7590 -2.5177 -3.1318 -4.9615 -2.8706 -2.8907 -2.0809 -3.1399 -2.6728
-3.5877 -3.2995 -2.6984 -4.1083 -1.9617 -4.9378 -0.4354 -3.6254 -5.3424 -3.6695
[torch.FloatTensor of size 5x10]

In [26]:
# Evaluate the custom loss module on the one-hot targets and the
# log-softmax output, then display the result.
output = Mynll(target, logsoftmax)
output

Variable containing:
 3.6318
[torch.FloatTensor of size 1]

In [28]:
# Reference value from the built-in NLLLoss, fed the same class indices that
# were used to build the one-hot `target` rows.
nll = torch.nn.NLLLoss()
class_indices = Variable(torch.LongTensor([1, 2, 3, 4, 0]))
nll(logsoftmax, class_indices)

Variable containing:
 3.6318
[torch.FloatTensor of size 1]