In [55]:
import numpy as np
from scipy.stats import norm

class Activation:
    """Common parent of every activation layer in this notebook.

    Concrete layers override `__call__` (forward pass) and `grad`
    (backward pass). The base class only provides the `_input` slot
    together with a read-only `input` accessor for it.
    """

    def __init__(self):
        # Nothing has been passed through the layer yet.
        self._input = None

    @property
    def input(self):
        """Value currently stored in `_input` (`None` right after construction)."""
        return self._input

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Forward pass; must be provided by a subclass.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array with the same shape as `x`

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Backward pass; must be provided by a subclass.

        Arguments:
            gradOutput: Gradient of the loss function with respect to the
                activation output. Same shape as the array given to `__call__`.

        Returns:
            Gradient of the loss with respect to the activation input,
            with the same shape as `gradOutput`

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
    

class ReLU(Activation):
    """Implements ReLU activation layer: `max(0, x)` element-wise."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes ReLU output and remembers the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x` with negatives replaced by 0
        """
        self._x = x
        self._input = x
        return np.maximum(0, x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the ReLU input.

        Arguments:
            gradOutput: Gradient of the loss with respect to the ReLU output.

        Returns:
            `gradOutput` where the stored input is strictly positive, 0 elsewhere
        """
        # Strict comparison: the output is flat at x == 0, so no gradient is
        # passed there. This matches the `x > 0` test used by LeakyReLU and ELU.
        return gradOutput * (self._x > 0)


class LeakyReLU(Activation):
    """Implements LeakyReLU activation layer: `x` for `x > 0`, `slope * x` otherwise."""

    def __init__(self, slope: float = 0.03):
        """Initializes LeakyReLU layer.

        Arguments:
            slope: the slope coefficient applied to non-positive inputs."""
        # Run the base initializer so `input` is `None` (not an AttributeError)
        # before the first forward pass.
        super().__init__()
        self._slope = slope

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes LeakyReLU output and remembers the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`
        """
        self._x = x
        self._input = x
        return np.where(x > 0, x, self._slope * x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the LeakyReLU input.

        Arguments:
            gradOutput: Gradient of the loss with respect to the layer output.

        Returns:
            `gradOutput` where the stored input is positive,
            `slope * gradOutput` elsewhere
        """
        return np.where(self._x > 0, gradOutput, self._slope * gradOutput)

class GeLU(Activation):
    """GeLU activation layer: `x * Phi(x)`, with `Phi` the standard normal CDF."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Forward pass; keeps `x` for the later `grad` call.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`
        """
        self._input = self._x = x
        cdf_values = norm.cdf(x)
        return x * cdf_values

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Backward pass: d/dx [x * Phi(x)] = x * phi(x) + Phi(x).

        Arguments:
            gradOutput: Gradient of the loss with respect to the layer output.

        Returns:
            Gradient of the loss with respect to the stored input
        """
        z = self._x
        density_term = z * norm.pdf(z)
        cdf_term = norm.cdf(z)
        local_derivative = density_term + cdf_term
        return local_derivative * gradOutput


class SiLU(Activation):
    """Implements SiLU (swish) activation layer: `x * sigmoid(x)`."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes SiLU output and remembers the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`
        """
        self._x = x
        self._input = x
        return x * self.sigmoid(x)

    def sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Numerically stable logistic function `1 / (1 + exp(-x))`.

        Arguments:
            x: Input array of any shape.

        Returns:
            An array of the same shape as `x` with values in [0, 1]
        """
        x = np.asarray(x)
        # exp(-|x|) lies in (0, 1], so neither branch can overflow; the naive
        # 1 / (1 + exp(-x)) overflows for very negative x.
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the SiLU input.

        Uses d/dx [x * s(x)] = x * s * (1 - s) + s, where s = sigmoid(x).

        Arguments:
            gradOutput: Gradient of the loss with respect to the layer output.

        Returns:
            Gradient of the loss with respect to the stored input
        """
        # Evaluate the sigmoid once instead of three times.
        s = self.sigmoid(self._x)
        return (self._x * s * (1 - s) + s) * gradOutput


class Softplus(Activation):
    """Implements Softplus (SmoothReLU) activation layer: `log(1 + exp(x))`."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes Softplus output and remembers the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`
        """
        self._x = x
        self._input = x
        # logaddexp(0, x) == log(exp(0) + exp(x)) == log(1 + exp(x)), but it
        # stays finite for large x (the naive form returns inf) and keeps
        # precision for very negative x (the naive form rounds to exactly 0).
        return np.logaddexp(0, x)

    def sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Numerically stable logistic function `1 / (1 + exp(-x))`.

        Arguments:
            x: Input array of any shape.

        Returns:
            An array of the same shape as `x` with values in [0, 1]
        """
        x = np.asarray(x)
        # exp(-|x|) lies in (0, 1], so neither branch can overflow.
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the Softplus input.

        The derivative of softplus is the sigmoid.

        Arguments:
            gradOutput: Gradient of the loss with respect to the layer output.

        Returns:
            Gradient of the loss with respect to the stored input
        """
        return self.sigmoid(self._x) * gradOutput


class ELU(Activation):
    """Implements ELU activation layer: `x` for `x > 0`, `alpha * (exp(x) - 1)` otherwise."""

    def __init__(self, alpha: float = 1):
        """Initializes ELU layer.

        Arguments:
            alpha: the alpha coefficient scaling the negative branch."""
        # Run the base initializer so `input` is `None` (not an AttributeError)
        # before the first forward pass.
        super().__init__()
        self.alpha = alpha

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes ELU output and remembers the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`
        """
        self._x = x
        self._input = x
        # np.where evaluates both branches for every element, so clamp the
        # exponent at 0 to avoid overflowing exp() on large positive inputs
        # whose result is discarded anyway. expm1 is accurate for x close to 0.
        non_positive = np.minimum(x, 0)
        return np.where(x > 0, x, self.alpha * np.expm1(non_positive))

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the ELU input.

        Arguments:
            gradOutput: Gradient of the loss with respect to the layer output.

        Returns:
            `gradOutput` where the stored input is positive,
            `alpha * exp(x) * gradOutput` elsewhere
        """
        # Same clamping as in the forward pass keeps exp() finite.
        non_positive = np.minimum(self._x, 0)
        return np.where(self._x > 0, gradOutput, self.alpha * np.exp(non_positive) * gradOutput)

class Sigmoid(Activation):
    """Implements Sigmoid activation layer: `1 / (1 + exp(-x))`."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes Sigmoid output and remembers the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x` with values in [0, 1]
        """
        self._x = x
        self._input = x
        return self.sigmoid(x)

    def sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Numerically stable logistic function.

        Arguments:
            x: Input array of any shape.

        Returns:
            An array of the same shape as `x` with values in [0, 1]
        """
        x = np.asarray(x)
        # exp(-|x|) lies in (0, 1], so neither branch can overflow; the naive
        # 1 / (1 + exp(-x)) overflows for very negative x.
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the Sigmoid input.

        Uses s'(x) = s(x) * (1 - s(x)).

        Arguments:
            gradOutput: Gradient of the loss with respect to the layer output.

        Returns:
            Gradient of the loss with respect to the stored input
        """
        # Evaluate the sigmoid once and reuse it.
        s = self.sigmoid(self._x)
        return s * (1 - s) * gradOutput


class Tanh(Activation):
    """Implements Tanh activation layer"""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes Tanh output and remembers the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x` with values in [-1, 1]
        """
        self._x = x
        self._input = x
        return np.tanh(x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the Tanh input.

        The derivative of tanh is 1 / cosh(x)^2. It is evaluated as
        4t / (1 + t)^2 with t = exp(-2|x|), which is algebraically identical
        but cannot overflow, because t always lies in (0, 1]. Squaring
        cosh(x) directly overflows for large |x|.

        Arguments:
            gradOutput: Gradient of the loss with respect to the layer output.

        Returns:
            Gradient of the loss with respect to the stored input
        """
        t = np.exp(-2.0 * np.abs(self._x))
        return 4.0 * t / np.square(1.0 + t) * gradOutput


class Softmax(Activation):
    """Implements Softmax activation layer (normalised over axis 1)."""

    @staticmethod
    def _softmax(x: np.ndarray) -> np.ndarray:
        """Row-wise softmax with the usual max-shift for numerical stability."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # every exponent <= 0, so exp() cannot overflow into inf / NaN.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=1, keepdims=True)

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes Softmax activation output

        Arguments:
            x: Input array of shape (`batch_size`, `n_features`)

        Returns:
            An array of the same shape as `x`; each row sums to 1"""
        self._x = x
        self._input = x
        return self._softmax(x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the Softmax input.

        With s = softmax(x), the Jacobian is J[i, j] = s_i * (delta_ij - s_j),
        so the contraction sum_i g_i * J[i, j] equals
        s_j * (g_j - sum_i g_i * s_i). That closed form is used here, so no
        (`batch_size`, `n_features`, `n_features`) Jacobian is materialised.

        Arguments:
            gradOutput: Gradient of the loss with respect to the Softmax output,
                of shape (`batch_size`, `n_features`).

        Returns:
            An array of the same shape as `gradOutput`
        """
        sm = self._softmax(self._x)
        # Per-row inner product <g, s>, kept as a column for broadcasting.
        weighted = np.sum(gradOutput * sm, axis=1, keepdims=True)
        return sm * (gradOutput - weighted)
        
        

In [56]:
# Toy batch: 3 rows x 5 columns holding the integers 1..15 in row-major order.
x = np.arange(1, 16).reshape(3, 5)
# Synthetic upstream gradient with the same shape as `x`.
grad_x = 0.8 * x

In [57]:
# Forward pass through Softmax on `x`, then backward pass with `grad_x`.
a = Softmax()
a(x)
# Last expression of the cell: gradient with respect to the Softmax input.
a.grad(grad_x)

array([[-0.0321893 , -0.06215166, -0.10004289, -0.08464745,  0.2790313 ],
       [-0.0321893 , -0.06215166, -0.10004289, -0.08464745,  0.2790313 ],
       [-0.0321893 , -0.06215166, -0.10004289, -0.08464745,  0.2790313 ]])

In [60]:
# Forward and backward pass through ReLU, then display the stored input.
a = ReLU()
a(x)
a.grad(grad_x)
# NOTE(review): `a.input` returns the array given to the forward pass, but the
# recorded output is a 4-element array rather than the 3x5 `x` defined in this
# notebook. The output looks stale; re-run from a fresh kernel.
a.input

array([-2, -1,  0,  1])

In [4]:
# Forward then backward pass through LeakyReLU (default slope).
# NOTE(review): the recorded output is a 20-element 1-D array, so it was produced
# with a different `x` than the 3x5 one defined in this notebook; re-run to refresh.
a = LeakyReLU()
a(x)
a.grad(grad_x)

array([-0.24 , -0.216, -0.192, -0.168, -0.144, -0.12 , -0.096, -0.072,
       -0.048, -0.024,  0.   ,  0.8  ,  1.6  ,  2.4  ,  3.2  ,  4.   ,
        4.8  ,  5.6  ,  6.4  ,  7.2  ])

In [5]:
# Forward then backward pass through GeLU.
# NOTE(review): the recorded output is a 20-element 1-D array, so it was produced
# with a different `x` than the 3x5 one defined in this notebook; re-run to refresh.
a = GeLU()
a(x)
a.grad(grad_x)

array([6.09472008e-21, 6.58003491e-17, 2.54694865e-13, 3.50914090e-10,
       1.70249805e-07, 2.85877840e-05, 1.61167892e-03, 2.86695533e-02,
       1.36370882e-01, 6.66523765e-02, 0.00000000e+00, 8.66652376e-01,
       1.73637088e+00, 2.42866955e+00, 3.20161168e+00, 4.00002859e+00,
       4.80000017e+00, 5.60000000e+00, 6.40000000e+00, 7.20000000e+00])

In [6]:
# Forward then backward pass through SiLU.
# NOTE(review): the recorded output is a 20-element 1-D array, so it was produced
# with a different `x` than the 3x5 one defined in this notebook; re-run to refresh.
a = SiLU()
a(x)
a.grad(grad_x)

array([ 3.26848167e-03,  7.10654092e-03,  1.50179279e-02,  3.05787836e-02,
        5.91668764e-02,  1.06189730e-01,  1.68526768e-01,  2.11449854e-01,
        1.45254798e-01, -5.78635905e-02,  0.00000000e+00,  7.42136409e-01,
        1.74525480e+00,  2.61144985e+00,  3.36852677e+00,  4.10618973e+00,
        4.85916688e+00,  5.63057878e+00,  6.41501793e+00,  7.20710654e+00])

In [7]:
# Forward then backward pass through ELU (default alpha).
# NOTE(review): the recorded output is a 20-element 1-D array, so it was produced
# with a different `x` than the 3x5 one defined in this notebook; re-run to refresh.
a = ELU()
a(x)
a.grad(grad_x)

array([-3.63199438e-04, -8.88550589e-04, -2.14696082e-03, -5.10653901e-03,
       -1.18980104e-02, -2.69517880e-02, -5.86100444e-02, -1.19488964e-01,
       -2.16536453e-01, -2.94303553e-01,  0.00000000e+00,  8.00000000e-01,
        1.60000000e+00,  2.40000000e+00,  3.20000000e+00,  4.00000000e+00,
        4.80000000e+00,  5.60000000e+00,  6.40000000e+00,  7.20000000e+00])

In [8]:
# Forward then backward pass through Softplus.
# NOTE(review): the recorded output is a 20-element 1-D array, so it was produced
# with a different `x` than the 3x5 one defined in this notebook; re-run to refresh.
a = Softplus()
a(x)
a.grad(grad_x)

array([-3.63182950e-04, -8.88440947e-04, -2.14624083e-03, -5.10188669e-03,
       -1.18685912e-02, -2.67714037e-02, -5.75558719e-02, -1.13822096e-01,
       -1.90724675e-01, -2.15153137e-01,  0.00000000e+00,  5.84846863e-01,
        1.40927532e+00,  2.28617790e+00,  3.14244413e+00,  3.97322860e+00,
        4.78813141e+00,  5.59489811e+00,  6.39785376e+00,  7.19911156e+00])

In [10]:
# Forward then backward pass through Sigmoid.
# NOTE(review): the recorded output is a 20-element 1-D array, so it was produced
# with a different `x` than the 3x5 one defined in this notebook; re-run to refresh.
a = Sigmoid()
a(x)
a.grad(grad_x)

array([-0.00036317, -0.00088833, -0.00214552, -0.00509724, -0.01183924,
       -0.02659223, -0.05652066, -0.10842398, -0.16798974, -0.15728955,
        0.        ,  0.15728955,  0.16798974,  0.10842398,  0.05652066,
        0.02659223,  0.01183924,  0.00509724,  0.00214552,  0.00088833])

In [11]:
# Forward then backward pass through Tanh.
# NOTE(review): the recorded output is a 20-element 1-D array, so it was produced
# with a different `x` than the 3x5 one defined in this notebook; re-run to refresh.
a = Tanh()
a(x)
a.grad(grad_x)

array([-6.59569156e-08, -4.38623403e-07, -2.88089982e-06, -1.86262123e-05,
       -1.17967428e-04, -7.26332924e-04, -4.29104219e-03, -2.36784892e-02,
       -1.13041320e-01, -3.35979473e-01,  0.00000000e+00,  3.35979473e-01,
        1.13041320e-01,  2.36784892e-02,  4.29104219e-03,  7.26332924e-04,
        1.17967428e-04,  1.86262123e-05,  2.88089982e-06,  4.38623403e-07])

In [27]:
# Small 2x2 integer matrix used by the einsum experiments in the following cells.
# NOTE(review): this rebinds `a`, which the cells above use for activation instances.
a = np.array([[1, 0],
               [2, 3]])

In [28]:
# Negated row-wise outer product: result[b, i, j] = -a[b, i] * a[b, j].
-np.einsum('...i,...j->...ij', a, a)

array([[[-1,  0],
        [ 0,  0]],

       [[-4, -6],
        [-6, -9]]])

In [29]:
# Display the test matrix again for reference.
a

array([[1, 0],
       [2, 3]])

In [34]:
# Identity matrix whose size equals the number of columns of `a`.
b = np.eye(a.shape[1])

In [35]:
# result[i, j, k] = a[i, j] * b[j, k]. With `b` the identity, this places each
# row of `a` on the diagonal of its own matrix.
np.einsum('ij,jk->ijk', a, b)

array([[[1., 0.],
        [0., 0.]],

       [[2., 0.],
        [0., 3.]]])

In [39]:
# Per row i: diag(a[i]) - outer(a[i], a[i]), stacked into a (rows, cols, cols) array.
res = np.einsum('ij,jk->ijk', a, b) - np.einsum('...i,...j->...ij', a, a)
res

array([[[ 0.,  0.],
        [ 0.,  0.]],

       [[-2., -6.],
        [-6., -6.]]])

In [40]:
# Contract each row of `a` with its own matrix in `res`:
# out[b, j] = sum_i a[b, i] * res[b, i, j].
np.einsum('...i,...ij->...j', a, res)

array([[  0.,   0.],
       [-22., -30.]])