In [2]:
import numpy as np
from scipy.linalg import inv, sqrtm 
from scipy.special import expit 
from scipy.misc import logsumexp
from scipy.special import xlogy
# import torch
# from .Module import Module
# from torch.nn.functional import _Reduction
# from .Criterion import Criterion

**Module** is an abstract class which defines the fundamental methods necessary for training a neural network. You do not need to change anything here, just read the comments.

In [3]:
class Module(object):
    """
    Abstract base class for every layer and container.

    Basically, you can think of a module as of a something (black box)
    which can process `input` data and produce `output` data.
    This is like applying a function which is called `forward`:

        output = module.forward(input)

    The module should be able to perform a backward pass: to differentiate the `forward` function.
    More, it should be able to differentiate it if is a part of chain (chain rule).
    The latter implies there is a gradient from previous step of a chain rule.

        gradInput = module.backward(input, gradOutput)
    """

    def __init__(self):
        self.output = None
        self.gradInput = None
        # Mode flag: True while training, False while evaluating.
        # Toggled by `training()` / `evaluate()`.
        self.train = True

    def forward(self, inpt):
        """
        Takes an input object, and computes the corresponding output of the module.
        """
        return self.updateOutput(inpt)

    def backward(self, inpt, gradOutput):
        """
        Performs a backpropagation step through the module, with respect to the given input.

        This includes
         - computing a gradient w.r.t. `input` (is needed for further backprop),
         - computing a gradient w.r.t. parameters (to update parameters while optimizing).

        Returns the content of the `gradInput` field.
        """
        self.updateGradInput(inpt, gradOutput)
        self.accGradParameters(inpt, gradOutput)
        return self.gradInput

    def updateOutput(self, inpt):
        """
        Computes the output using the current parameter set of the class and input.
        This function returns the result which is stored in the `output` field.

        Make sure to both store the data in `output` field and return it.
        The base implementation does nothing and returns None.
        """
        # The easiest case:
        #
        #     self.output = input
        #     return self.output
        pass

    def updateGradInput(self, inpt, gradOutput):
        """
        Computing the gradient of the module with respect to its own input.
        This is returned in `gradInput`. Also, the `gradInput` state variable is updated accordingly.

        The shape of `gradInput` is always the same as the shape of `input`.

        Make sure to both store the gradients in `gradInput` field and return it.
        The base implementation does nothing and returns None.
        """
        # The easiest case:
        #
        #     self.gradInput = gradOutput
        #     return self.gradInput
        pass

    def accGradParameters(self, inpt, gradOutput):
        """
        Computing the gradient of the module with respect to its own parameters.
        No need to override if module has no parameters (e.g. ReLU).
        """
        pass

    def zeroGradParameters(self):
        """
        Zeroes `gradParams` variable if the module has params.
        """
        pass

    def getParameters(self):
        """
        Returns a list with its parameters.
        If the module does not have parameters return empty list.
        """
        return []

    def getGradParameters(self):
        """
        Returns a list with gradients with respect to its parameters.
        If the module does not have parameters return empty list.
        """
        return []

    def training(self):
        """
        Sets training mode for the module.
        Training and testing behaviour differs for Dropout, BatchNorm.
        """
        # Write the `train` flag created in __init__. Assigning to
        # `self.training` would replace this very method with a bool on the
        # instance, so a later `module.training()` call would fail.
        self.train = True

    def evaluate(self):
        """
        Sets evaluation mode for the module.
        Training and testing behaviour differs for Dropout, BatchNorm.
        """
        self.train = False

    def __repr__(self):
        """
        Pretty printing. Should be overrided in every module if you want
        to have readable description.
        """
        return "Module"

# Sequential container

**Define** the forward and backward pass procedures.

In [4]:
class Sequential(Module):
    """
         This class implements a container, which processes `input` data sequentially.

         `input` is processed by each module (layer) in self.modules consecutively.
         The resulting array is called `output`.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []
        self.y = []

    def add(self, module):
        """
        Adds a module to the container.
        """
        self.modules.append(module)

    def updateOutput(self, inpt):
        """
        Basic workflow of FORWARD PASS:

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        The input each module receives is remembered on that module as `y`,
        because `backward` has to hand the very same array back to it.
        """
        self.output = inpt
        for module in self.modules:
            module.y = self.output
            self.output = module.forward(self.output)

        return self.output

    def backward(self, inpt, gradOutput):
        """
        Workflow of BACKWARD PASS:

            g_{n-1} = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2} = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1 = module[1].backward(y_0, g_2)
            gradInput = module[0].backward(input, g_1)

        Every module is given the input it saw during the forward pass (its
        stored `y`), NOT the `inpt` of this container, so `forward` must have
        been called first.

        Returns the gradient w.r.t. the container input, which is also stored
        in `gradInput`. With no modules this is `gradOutput` itself.
        """
        grad = gradOutput
        for module in reversed(self.modules):
            grad = module.backward(module.y, grad)

        # Publish the result; returning `self.gradInput` without assigning it
        # would always yield None.
        self.gradInput = grad
        return self.gradInput

    def training(self):
        """
        Sets training mode for the container and every contained module.
        """
        super(Sequential, self).training()
        for module in self.modules:
            module.training()

    def evaluate(self):
        """
        Sets evaluation mode for the container and every contained module.
        """
        super(Sequential, self).evaluate()
        for module in self.modules:
            module.evaluate()

    def zeroGradParameters(self):
        """
        Zeroes the parameter gradients of every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Should gather all parameters in a list.
        The result has one entry (itself a list) per contained module.
        """
        return [x.getParameters() for x in self.modules]

    def getGradParameters(self):
        """
        Should gather all gradients w.r.t parameters in a list.
        The result has one entry (itself a list) per contained module.
        """
        return [x.getGradParameters() for x in self.modules]

    def __repr__(self):
        string = "".join([str(x) + '\n' for x in self.modules])
        return string

    def __getitem__(self, x):
        return self.modules.__getitem__(x)

# Layers

- input:   **`batch_size x n_features1`**
- output: **`batch_size x n_features2`**

In [5]:
class Linear(Module):
    """
    A module which applies a linear transformation
    A common name is fully-connected layer, dense layer, InnerProductLayer in caffe.

    The module should work with 2D input of shape (n_samples, n_feature).

    Parameters are `W` of shape (n_out, n_in) and `b` of shape (n_out,);
    their gradients are kept in `gradW` and `gradb`.
    """

    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()

        # This is a nice initialization
        stdv = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(-stdv, stdv, size=(n_out, n_in))
        self.b = np.random.uniform(-stdv, stdv, size=n_out)

        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, inpt):
        """Computes `inpt . W^T + b`, stores it in `output` and returns it."""
        self.output = inpt.dot(self.W.T) + self.b

        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """Computes the gradient w.r.t. the input: `gradOutput . W`."""
        self.gradInput = gradOutput.dot(self.W)

        return self.gradInput

    def accGradParameters(self, inpt, gradOutput):
        """
        Accumulates the parameter gradients for this batch into `gradW`/`gradb`.

        The update is added in place, so call `zeroGradParameters()` before
        each optimisation step; otherwise gradients from earlier backward
        passes are still included.
        """
        # In-place `+=` keeps `gradW`/`gradb` the same array objects, so
        # references obtained from getGradParameters() stay valid.
        self.gradW += gradOutput.T.dot(inpt)
        self.gradb += gradOutput.sum(axis=0)

    def zeroGradParameters(self):
        """Resets the accumulated parameter gradients to zero (in place)."""
        self.gradW.fill(0)
        self.gradb.fill(0)

    def getParameters(self):
        return [self.W, self.b]

    def getGradParameters(self):
        return [self.gradW, self.gradb]

    def __repr__(self):
        s = self.W.shape
        q = 'Linear %d -> %d' % (s[1], s[0])
        return q

This one is probably the hardest but, like the others, it takes only about 5 lines of code in total.
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [6]:
class SoftMax(Module):
    """
    Row-wise softmax over a 2D input of shape (batch_size, n_feats).
    """

    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, inpt):
        """Computes the softmax of every row, stores it in `output` and returns it."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large logits.
        exps = np.exp(np.subtract(inpt, inpt.max(axis=1, keepdims=True)))
        self.output = exps / exps.sum(axis=1, keepdims=True)

        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """
        Computes the gradient w.r.t. the input.

        For a row with softmax output s and incoming gradient g:
            gradInput = s * (g - sum_j(s_j * g_j))
        The softmax is recomputed from `inpt`, so the cached `output` is not needed.
        """
        exps = np.exp(np.subtract(inpt, inpt.max(axis=1, keepdims=True)))
        probs = exps / exps.sum(axis=1, keepdims=True)
        # Per-row dot product <s, g>, kept as a column so it broadcasts.
        weighted = (probs * gradOutput).sum(axis=1, keepdims=True)
        self.gradInput = probs * (gradOutput - weighted)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

One of the most significant recent ideas that impacted NNs a lot is [**Batch normalization**](http://arxiv.org/abs/1502.03167). The idea is simple, yet effective: the features should be whitened ($mean = 0$, $std = 1$) all the way through the NN. This improves convergence for deep models, letting them train in days rather than weeks. **You are** to implement a part of the layer: mean subtraction. That is, the module should calculate the mean value for every feature (every column) and subtract it.

Note that you need to estimate the mean over the dataset to be able to predict on test examples. The right way is to create a variable which holds the smoothed mean over batches (exponential smoothing works well) and use it when forwarding test examples.

When training, calculate the mean as follows:
```
    mean_to_subtract = self.old_mean * alpha + batch_mean * (1 - alpha)
```
when evaluating (`self.training == False`) set $alpha = 1$.


- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [7]:
class BatchMeanSubtraction(Module):
    """
    Per-feature normalisation of a 2D input of shape (batch_size, n_feats).

    In training mode every column is centred and scaled with the statistics
    of the current batch, while exponentially smoothed running statistics
    (`old_mean`, `old_variance`) are updated with factor `alpha`.
    In evaluation mode the running statistics are used instead.
    """
    EPS = 1e-3

    def __init__(self, alpha=0.95):
        super(BatchMeanSubtraction, self).__init__()

        self.alpha = alpha
        # Running statistics; None until the first training batch is seen.
        self.old_mean = None
        self.old_variance = None

    def _is_training(self):
        """Returns True unless the module has been switched to evaluation mode."""
        # The mode may be recorded in `train` (created by Module.__init__) or
        # as a bool left in `training`; when nothing assigned `training` it is
        # just the bound method. Either one being False means evaluation.
        return bool(self.train) and self.training is not False

    def updateOutput(self, inpt):
        """
        Normalises `inpt`, stores the result in `output` and returns it.

        Raises RuntimeError in evaluation mode if no training batch has been
        seen yet, because there are no running statistics to use.
        """
        if self._is_training():
            batch_mean = inpt.mean(axis=0)
            batch_variance = ((inpt - batch_mean) ** 2).mean(axis=0)
            if self.old_mean is None:
                # First batch: nothing to smooth with yet.
                self.old_mean = batch_mean
                self.old_variance = batch_variance
            else:
                self.old_mean = self.old_mean * self.alpha + batch_mean * (1 - self.alpha)
                self.old_variance = self.old_variance * self.alpha + batch_variance * (1 - self.alpha)
            self.output = (inpt - batch_mean) / np.sqrt(batch_variance + self.EPS)
        else:
            if self.old_mean is None:
                raise RuntimeError(
                    "BatchMeanSubtraction has no running statistics yet; "
                    "run at least one forward pass in training mode first")
            self.output = (inpt - self.old_mean) / np.sqrt(self.old_variance + self.EPS)

        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """
        Computes the gradient w.r.t. the input.

        Training mode differentiates through the batch mean and variance:
            gradInput = (g - mean(g) - x_hat * mean(g * x_hat)) / std
        where x_hat is the normalised input and the means run over the batch.
        In evaluation mode the statistics are constants, so the gradient is
        just `gradOutput / std`.
        """
        if self._is_training():
            centered = inpt - inpt.mean(axis=0, keepdims=True)
            batch_variance = (centered ** 2).mean(axis=0, keepdims=True)
            inv_std = 1. / np.sqrt(batch_variance + self.EPS)
            x_hat = centered * inv_std
            self.gradInput = inv_std * (
                gradOutput
                - gradOutput.mean(axis=0, keepdims=True)
                - x_hat * (gradOutput * x_hat).mean(axis=0, keepdims=True)
            )
        else:
            self.gradInput = gradOutput / np.sqrt(self.old_variance + self.EPS)

        return self.gradInput

    def __repr__(self):
        return "BatchMeanNormalization"

Implement [**dropout**](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf). The idea and implementation are really simple: just multiply the input by a $Bernoulli(p)$ mask. 

This is a very cool regularizer. In fact, when you see your net is overfitting try to add more dropout. It is hard to test, since every `forward` requires sampling a new mask, that is the only reason we need `fix_mask` parameter in there. 

While training (`self.training == True`) it should sample a mask on each iteration (for every batch). When testing this module should implement identity transform i.e. `self.output = input * p`.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [8]:
from scipy.stats import bernoulli

class Dropout(Module):
    """
    Inverted dropout.

    While training, each element is zeroed with probability `p` and the
    survivors are scaled by `1 / (1 - p)`. In evaluation mode the module is
    the identity.
    """

    def __init__(self, p=0.5):
        super(Dropout, self).__init__()

        # p == 1 would drop everything and divide by zero when rescaling.
        if not 0 <= p < 1:
            raise ValueError("dropout probability p must be in [0, 1), got {}".format(p))

        self.p = p
        self.mask = None

    def _is_training(self):
        """Returns True unless the module has been switched to evaluation mode."""
        # The mode may be recorded in `train` (created by Module.__init__) or
        # as a bool left in `training`; when nothing assigned `training` it is
        # just the bound method. Either one being False means evaluation.
        return bool(self.train) and self.training is not False

    def updateOutput(self, inpt):
        """Samples a fresh mask (training) or passes the input through (evaluation)."""
        if self._is_training():
            keep = np.random.rand(*inpt.shape) > self.p
            # Dividing the boolean array yields a float mask holding 0 or
            # 1/(1-p); `int(array)` cannot convert more than one element.
            self.mask = keep / (1. - self.p)
            self.output = self.mask * inpt
        else:
            self.output = inpt

        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """Applies the mask sampled by the last training-mode forward pass to `gradOutput`."""
        if self._is_training():
            self.gradInput = gradOutput * self.mask
        else:
            self.gradInput = gradOutput

        return self.gradInput

    def __repr__(self):
        return "Dropout"

# Activation functions

Implement well-known **Sigmoid** non-linearity

In [9]:
class Sigmoid(Module):
    """Element-wise logistic sigmoid activation."""

    def __init__(self):
        super(Sigmoid, self).__init__()

    def updateOutput(self, inpt):
        """Applies `expit` element-wise, caches the result in `output` and returns it."""
        activated = expit(inpt)
        self.output = activated
        return activated

    def updateGradInput(self, inpt, gradOutput):
        """
        Computes `gradOutput * s * (1 - s)`, where `s` is the `output` cached
        by the last forward pass (`inpt` itself is not read).
        """
        cached = self.output
        grad = gradOutput * cached * (1. - cached)
        self.gradInput = grad
        return grad

    def __repr__(self):
        return "Sigmoid"

Implement **hyperbolic tangent** non-linearity (aka **Tanh**): 
Note that Tanh is scaled version of the sigmoid function.

In [10]:
class Tanh(Module):
    """Element-wise hyperbolic tangent activation."""

    def __init__(self):
        super(Tanh, self).__init__()

    def updateOutput(self, inpt):
        """Applies `np.tanh` element-wise, caches the result in `output` and returns it."""
        activated = np.tanh(inpt)
        self.output = activated
        return activated

    def updateGradInput(self, inpt, gradOutput):
        """
        Computes `gradOutput * (1 - t * t)`, where `t` is the `output` cached
        by the last forward pass (`inpt` itself is not read).
        """
        cached = self.output
        squared = cached * cached
        self.gradInput = gradOutput * (1. - squared)
        return self.gradInput

    def __repr__(self):
        return "Tanh"

Implement **Rectified Linear Unit** non-linearity (aka **ReLU**): 

In [22]:
class ReLU(Module):
    """Element-wise rectified linear unit: max(x, 0)."""

    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, inpt):
        """Clamps negative entries to zero, caches the result in `output` and returns it."""
        rectified = np.maximum(inpt, 0)
        self.output = rectified
        return rectified

    def updateGradInput(self, inpt, gradOutput):
        """Passes `gradOutput` through where `inpt` is strictly positive, zero elsewhere."""
        is_positive = inpt > 0
        self.gradInput = np.multiply(gradOutput, is_positive)
        return self.gradInput

    def __repr__(self):
        return "ReLU"

Implement [**Leaky Rectified Linear Unit**](http://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29%23Leaky_ReLUs). Experiment with the slope. 

In [13]:
class LeakyReLU(Module):
    """
    Element-wise leaky rectifier: `x` for x > 0, `negval * x` otherwise.

    Parameters
    ----------
    slope : stored as `self.slope`; not used by the computation.
        NOTE(review): the multiplier actually applied to non-positive inputs
        is `negval` -- confirm which of the two callers are expected to set.
    negval : multiplier applied to non-positive inputs. If a bool is passed
        here it is taken as the `inplace` flag and `negval` falls back to 0.01.
    inplace : kept for interface compatibility; this NumPy implementation
        always allocates a new output array and never modifies its input.
    """

    def __init__(self, slope=0.03, negval=0.01, inplace=False):
        super(LeakyReLU, self).__init__()
        if isinstance(negval, bool):
            inplace = negval
            self.negval = 0.01
        else:
            self.negval = negval

        self.inplace = inplace
        if self.negval < 0:
            self.inplace = False

        self.slope = slope

    def updateOutput(self, inpt):
        """Computes the activation, stores it in `output` and returns it."""
        self.output = np.where(inpt > 0, inpt, inpt * self.negval)

        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """Scales `gradOutput` by 1 where `inpt` > 0 and by `negval` elsewhere."""
        self.gradInput = gradOutput * np.where(inpt > 0, 1.0, self.negval)

        return self.gradInput

    def __repr__(self):
        return str(type(self)) + '({:.4f})'.format(self.negval)

Implement [**Exponential Linear Units**](http://arxiv.org/abs/1511.07289) activations.

In [14]:
class ELU(Module):
    """
    Element-wise exponential linear unit:
        f(x) = x                      for x > 0
        f(x) = alpha * (exp(x) - 1)   otherwise

    Parameters
    ----------
    alpha : scale of the negative branch; anything `float()` accepts.
    inplace : kept for interface compatibility; this NumPy implementation
        always allocates a new output array and never modifies its input.
    """

    def __init__(self, alpha=1.0, inplace=False):
        super(ELU, self).__init__()

        # Convert rather than assert on the exact type, so ints and NumPy
        # scalars are accepted as well.
        self.alpha = float(alpha)
        self.inplace = inplace

    def updateOutput(self, inpt):
        """Computes the activation, stores it in `output` and returns it."""
        # np.where evaluates both branches; clamping at 0 keeps the
        # exponential from overflowing on large positive inputs.
        negative_branch = self.alpha * np.expm1(np.minimum(inpt, 0))
        self.output = np.where(inpt > 0, inpt, negative_branch)

        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """Scales `gradOutput` by 1 where `inpt` > 0 and by `alpha * exp(inpt)` elsewhere."""
        negative_slope = self.alpha * np.exp(np.minimum(inpt, 0))
        self.gradInput = gradOutput * np.where(inpt > 0, 1.0, negative_slope)

        return self.gradInput

    def __repr__(self):
        return '{}(alpha={:.3f})'.format(str(type(self)), self.alpha)

Implement [**SoftPlus**](https://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29) activations. Notice how closely they resemble ReLU.

In [15]:
class SoftPlus(Module):
    """
    Element-wise softplus: f(x) = 1/beta * log(1 + exp(beta * x)).

    Where `beta * x` exceeds `threshold` the function is treated as the
    identity (output `x`, slope 1).
    """

    def __init__(self, beta=1, threshold=20):
        super(SoftPlus, self).__init__()
        self.beta = beta  # Beta controls sharpness of transfer function
        self.threshold = threshold  # Avoid floating point issues with exp(x), x>20

    def updateOutput(self, inpt):
        """Computes the activation, stores it in `output` and returns it."""
        scaled = self.beta * inpt
        # logaddexp(0, z) == log(1 + exp(z)), evaluated without overflow.
        soft = np.logaddexp(0, scaled) / self.beta
        self.output = np.where(scaled > self.threshold, inpt, soft)

        return self.output

    def updateGradInput(self, inpt, gradOutput):
        """
        Computes the gradient w.r.t. the input.

        d/dx f(x) = sigmoid(beta * x); above the threshold the slope is 1.
        """
        scaled = self.beta * inpt
        # sigmoid(z) = exp(-log(1 + exp(-z))), stable for large |z|.
        slope = np.exp(-np.logaddexp(0, -scaled))
        self.gradInput = gradOutput * np.where(scaled > self.threshold, 1.0, slope)

        return self.gradInput

    def __repr__(self):
        return "SoftPlus"

# Criterions

Criterions are used to score the model's answers. 

In [17]:
class Criterion(object):
    """
    Base class for loss functions.

    Subclasses put their logic in `updateOutput` (the loss value) and
    `updateGradInput` (its gradient w.r.t. the input); `forward` and
    `backward` only delegate to them.
    """

    def __init__(self):
        self.output = None
        self.gradInput = None
        # Cache slot for a one-hot lookup table (np.eye of N_classes size).
        self.one_ht = None

    def __repr__(self):
        """
        Human-readable name; subclasses are expected to override it.
        """
        return "Criterion"

    def updateOutput(self, inpt, target):
        """
        Hook to override: compute the loss. The base version just returns
        the current `output` field.
        """
        return self.output

    def updateGradInput(self, inpt, target):
        """
        Hook to override: compute the gradient. The base version just returns
        the current `gradInput` field.
        """
        return self.gradInput

    def forward(self, inpt, target):
        """
        Compute the loss for `inpt` against `target` and return it.

        Do not override; implement `updateOutput` instead.
        """
        loss = self.updateOutput(inpt, target)
        return loss

    def backward(self, inpt, target):
        """
        Compute the gradient of the loss w.r.t. `inpt` and return it.

        Do not override; implement `updateGradInput` instead.
        """
        grad = self.updateGradInput(inpt, target)
        return grad

The **MSECriterion** is the basic L2 loss commonly used for regression.

In [20]:
class MSECriterion(Criterion):
    """Sum of squared differences, divided by the size of the first axis of the input."""

    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, inpt, target):
        """Returns `sum((inpt - target)^2) / inpt.shape[0]` and stores it in `output`."""
        residual = inpt - target
        n_rows = inpt.shape[0]
        self.output = np.sum(np.power(residual, 2)) / n_rows
        return self.output

    def updateGradInput(self, inpt, target):
        """Returns `2 * (inpt - target) / inpt.shape[0]` and stores it in `gradInput`."""
        residual = inpt - target
        n_rows = inpt.shape[0]
        self.gradInput = residual * 2 / n_rows
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

Your task is to implement the **CrossEntropyCriterion**. It should implement [multiclass log loss](http://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy). Although there is a sum over `y` (target) in that formula, 
remember that targets are one-hot encoded. This fact simplifies the computations a lot. Note that criterions are the only places where you divide by batch size. 

In [14]:
class CrossEntropyCriterion(Criterion):
    """
    Cross entropy of softmax, computed directly from raw scores (logits).

    Layout used by this class: `inpt` is (n_classes, n_samples) -- one
    column per sample -- and `target` holds integer class indices, one per
    sample. The softmax runs over axis 0 and the one-hot table is sized by
    `inpt.shape[0]`.

    NOTE: the loss is summed over samples, not averaged, and the gradient
    matches that convention.
    """

    def __init__(self):
        super(CrossEntropyCriterion, self).__init__()

    def softmax(self, z):
        """Returns the softmax of `z` taken over axis 0 (max-shifted for stability)."""
        shifted = z - np.max(z, axis=0)
        exps = np.exp(shifted)

        return exps / exps.sum(axis=0)

    def crossEntropyOfSoftMax(self, x, t):
        """
        Returns sum(t * log(t) - t * log(softmax(x))) with the softmax over axis 0.

        For one-hot `t` the `t * log(t)` term is zero, which leaves the usual
        cross entropy.
        """
        # Stable log-sum-exp over axis 0, written with NumPy only.
        peak = np.max(x, axis=0)
        log_norm = peak + np.log(np.sum(np.exp(x - peak), axis=0))

        return np.sum(xlogy(t, t) + t * (-x + log_norm))

    def _one_hot_columns(self, n_classes, target):
        """Returns the one-hot encoding of `target` as an (n_classes, n_samples) array."""
        # Rebuild the cached identity table if it is missing or was built for
        # a different number of classes.
        if self.one_ht is None or self.one_ht.shape[0] != n_classes:
            self.one_ht = np.eye(n_classes)
        return self.one_ht[np.asarray(target).astype(int)].T

    def updateOutput(self, inpt, target):
        """Computes the summed cross entropy, stores it in `output` and returns it."""
        targs = self._one_hot_columns(inpt.shape[0], target)
        self.output = self.crossEntropyOfSoftMax(inpt, targs)

        return self.output

    def updateGradInput(self, inpt, target):
        """Computes `softmax(inpt) - one_hot(target)`, stores it in `gradInput` and returns it."""
        targs = self._one_hot_columns(inpt.shape[0], target)
        self.gradInput = self.softmax(inpt) - targs

        return self.gradInput

    def __repr__(self):
        return "CrossEntropyCriterion"

    #---------------------------------------------------------------------
class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood for probability inputs.

    `inpt` is (batch_size, n_classes) with class probabilities (for example
    the output of a softmax layer); `target` is the one-hot encoding with the
    same shape. The loss is averaged over the batch:

        loss = -sum(target * log(inpt)) / batch_size
    """

    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def updateOutput(self, inpt, target):
        """Computes the averaged negative log-likelihood, stores it in `output` and returns it."""
        # Use this trick to avoid numerical errors (log(0) = -inf)
        input_clamp = np.maximum(1e-15, np.minimum(inpt, 1 - 1e-15))

        self.output = -np.sum(target * np.log(input_clamp)) / inpt.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """Computes `-(target / input) / batch_size`, stores it in `gradInput` and returns it."""
        # Use this trick to avoid numerical errors (division by zero)
        input_clamp = np.maximum(1e-15, np.minimum(input, 1 - 1e-15))

        self.gradInput = -(target / input_clamp) / input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"

**MultiLabelCriterion** for attribute classification.

In [19]:
class MultiLabelCriterion(Criterion):
    """
    Wrapper that forwards to `MultiLabelMarginCriterion_*` functions on `self._backend`.

    NOTE(review): this class cannot run as written in this file. It uses
    `torch`, `_Reduction` and `self._backend`; the corresponding imports at
    the top of the file are commented out and nothing visible here assigns
    `_backend`. Instantiating it fails on `torch.Tensor()` unless those names
    are provided elsewhere.
    """
    def __init__(self, sizeAverage = True):
        super(MultiLabelCriterion, self).__init__()
        # Passed to `_Reduction.legacy_get_enum` on every call below.
        self.sizeAverage = sizeAverage
        # Buffer handed to both backend calls.
        self.isTarget = torch.Tensor()
        # One-element buffer for the loss; created lazily in updateOutput.
        self.output_tensor = None
    
    def updateOutput(self, inpt, target): 
        """
        Asks the backend for the loss, stores its value in `output` and returns it.

        `target` must provide a `.long()` method.
        """
        
        # Use this trick to avoid numerical errors
        # NOTE(review): `input_clamp` is computed but never used below.
        input_clamp = np.maximum(1e-15, np.minimum(inpt, 1 - 1e-15) )
        
        # <Your Code Goes Here>
        if self.output_tensor is None:
            # NOTE(review): `input` is not a parameter of this method (the
            # parameter is `inpt`), so this resolves to the builtin `input`.
            self.output_tensor = input.new(1)
        target = target.long()
        self._backend.MultiLabelMarginCriterion_updateOutput(self._backend.library_state, inpt, target, self.output_tensor, self.isTarget, _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),)
        # Expose the value held in the buffer via `.item()`.
        self.output = self.output_tensor[0].item()
        
        return self.output

    def updateGradInput(self, inpt, target):
        """
        Asks the backend for the gradient and returns `self.gradInput`.

        NOTE(review): `self.gradInput` is None after `Criterion.__init__` and
        is not replaced here before being passed to the backend -- confirm
        the backend can accept that.
        """
        
        # Use this trick to avoid numerical errors
        # NOTE(review): `input_clamp` is computed but never used below.
        input_clamp = np.maximum(1e-15, np.minimum(inpt, 1 - 1e-15) )
                
        # <Your Code Goes Here>
        target = target.long()
        # A tensor of ones, cast to the type of `inpt`, passed as the incoming gradient.
        implicit_gradOutput = torch.ones(1).type_as(inpt)
        self._backend.MultiLabelMarginCriterion_updateGradInput(self._backend.library_state, inpt, target, implicit_gradOutput, self.gradInput, self.isTarget, _Reduction.legacy_get_enum(self.sizeAverage, True, emit_warning=False),)
        
        return self.gradInput
    
    def __repr__(self):
        return "MultiLabelCriterion"

In [None]:
class SoftMax(Module):
    """
    Row-wise softmax over a 2D input of shape (batch_size, n_feats).
    """

    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """Computes the softmax of every row, stores it in `output` and returns it."""
        # start with normalization for numerical stability
        exps = np.exp(np.subtract(input, input.max(axis=1, keepdims=True)))
        self.output = exps / np.sum(exps, axis=1, keepdims=True)

        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes the gradient w.r.t. the input.

        For a row with softmax output s and incoming gradient g:
            gradInput = s * (g - sum_j(s_j * g_j))
        The softmax is recomputed from `input`, so the cached `output` is not needed.
        """
        exps = np.exp(np.subtract(input, input.max(axis=1, keepdims=True)))
        probs = exps / exps.sum(axis=1, keepdims=True)
        # Per-row dot product <s, g>, kept as a column so it broadcasts;
        # this avoids building batch x batch intermediates.
        weighted = (probs * gradOutput).sum(axis=1, keepdims=True)
        self.gradInput = probs * (gradOutput - weighted)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood for probability inputs.

    `input` is (batch_size, n_classes) with class probabilities (for example
    the output of a softmax layer); `target` is the one-hot encoding with the
    same shape. The loss is averaged over the batch:

        loss = -sum(target * log(input)) / batch_size
    """

    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def updateOutput(self, input, target):
        """Computes the averaged negative log-likelihood, stores it in `output` and returns it."""
        # Use this trick to avoid numerical errors (log(0) = -inf)
        eps = 1e-15
        input_clamp = np.clip(input, eps, 1 - eps)

        # The leading minus makes this a *negative* log-likelihood, so that
        # minimising the loss increases the probability of the true class.
        per_sample = np.einsum('ij,ij->i', np.log(input_clamp), target)
        self.output = -np.sum(per_sample) / input.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """Computes `-(target / input) / batch_size`, stores it in `gradInput` and returns it."""
        # Use this trick to avoid numerical errors (division by zero)
        eps = 1e-15
        input_clamp = np.clip(input, eps, 1 - eps)

        self.gradInput = -(target / input_clamp) / input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"


def net_image(net, step=1):
    """
    Render the response of `net` over a square grid as a 2D array.

    Both coordinates run over `np.arange(-6.0, 6.0, step)`. Each grid point
    (i, j) is passed to `net.forward` as a 1x2 array, and element [0][0] of
    the prediction, multiplied by 250, is written to `pic[row of i, column of j]`.

    Returns the filled array, one row/column per grid coordinate.
    """
    first_axis = np.arange(-6.0, 6.0, step)
    second_axis = np.arange(-6.0, 6.0, step)
    side = first_axis.shape[0]
    image = np.zeros([side, side])

    for row, first in enumerate(first_axis):
        for col, second in enumerate(second_axis):
            point = np.array([[first, second]])
            prediction = net.forward(point)
            image[row, col] = prediction[0][0] * 250

    return image