In [55]:
import numpy as np
from scipy.special import expit
from scipy.stats import norm

class Activation:
    """Abstract parent of the activation layers.

    Subclasses override `__call__` (forward pass) and `grad` (backward pass)
    and are expected to store the forward input in `_input`, which is exposed
    read-only through the `input` property."""

    def __init__(self):
        # No forward pass has happened yet.
        self._input = None

    @property
    def input(self):
        """The value currently stored in `_input` (`None` right after construction)."""
        return self._input

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Forward pass of the activation; must be overridden.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`

        Raises:
            NotImplementedError: always, in this base class."""
        raise NotImplementedError

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Backward pass: gradient of the loss with respect to the activation input; must be overridden.

        Arguments:
            gradOutput: Gradient of the loss function with respect to the activation output.
                An array of the same shape as the array received in the `__call__` method.

        Returns:
            An array of the same shape as `gradOutput`

        Raises:
            NotImplementedError: always, in this base class."""
        raise NotImplementedError

    

class ReLU(Activation):
    """Implements ReLU activation layer: `f(x) = max(0, x)` element-wise."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes ReLU activation output and caches the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x` with negative entries replaced by 0."""
        self._x = x
        self._input = x
        return np.maximum(0, x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation output.
                An array of the same shape as the array received in `__call__`.

        Returns:
            An array of the same shape as `gradOutput`: `gradOutput` where the
            cached input is strictly positive, 0 elsewhere."""
        # Strict inequality: at x == 0 the output is clamped, so no gradient is
        # passed through. This matches the `> 0` test used by LeakyReLU and ELU.
        return gradOutput * (self._x > 0)


class LeakyReLU(Activation):
    """Implements LeakyReLU activation layer: `x` for `x > 0`, `slope * x` otherwise."""

    def __init__(self, slope: float = 0.03):
        """Initializes LeakyReLU layer.

        Arguments:
            slope: the slope coefficient applied to non-positive inputs."""
        # Run the base initializer so `input` is usable (returns None) before
        # the first forward pass instead of raising AttributeError.
        super().__init__()
        self._slope = slope

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes LeakyReLU activation output and caches the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`."""
        self._x = x
        self._input = x
        return np.where(x > 0, x, self._slope * x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation output.
                An array of the same shape as the array received in `__call__`.

        Returns:
            An array of the same shape as `gradOutput`: `gradOutput` where the
            cached input is positive, `slope * gradOutput` elsewhere."""
        return np.where(self._x > 0, gradOutput, self._slope * gradOutput)

class GeLU(Activation):
    """GeLU activation layer: `x * Phi(x)`, with `Phi` the standard normal CDF."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Forward pass; stores `x` for the backward pass.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`."""
        self._input = x
        self._x = x
        cdf_values = norm.cdf(x)
        return x * cdf_values

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Backward pass: multiplies `gradOutput` by d/dx [x * Phi(x)] = x * phi(x) + Phi(x).

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation output.

        Returns:
            An array of the same shape as `gradOutput`."""
        z = self._x
        local_derivative = z * norm.pdf(z) + norm.cdf(z)
        return local_derivative * gradOutput


class SiLU(Activation):
    """Implements SiLU (swish) activation layer: `x * sigmoid(x)`."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes SiLU activation output and caches the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`."""
        self._x = x
        self._input = x
        return x * self.sigmoid(x)

    def sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Element-wise logistic sigmoid `1 / (1 + exp(-x))`.

        Delegates to `scipy.special.expit`, which evaluates the function
        without the overflow warnings that `np.exp(-x)` produces for large
        negative `x`."""
        return expit(x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        Uses d/dx [x * s(x)] = x * s(x) * (1 - s(x)) + s(x).

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation output.

        Returns:
            An array of the same shape as `gradOutput`."""
        # Evaluate the sigmoid once and reuse it in every term.
        s = self.sigmoid(self._x)
        return (self._x * s * (1 - s) + s) * gradOutput


class Softplus(Activation):
    """Implements Softplus (SmoothReLU) activation layer: `log(1 + exp(x))`."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes Softplus activation output and caches the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`."""
        self._x = x
        self._input = x
        # log(exp(0) + exp(x)) == log(1 + exp(x)), but logaddexp does not
        # overflow to inf for large x the way np.log(1 + np.exp(x)) does.
        return np.logaddexp(0, x)

    def sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Element-wise logistic sigmoid `1 / (1 + exp(-x))`.

        Delegates to `scipy.special.expit`, which evaluates the function
        without the overflow warnings that `np.exp(-x)` produces for large
        negative `x`."""
        return expit(x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        The derivative of softplus is the logistic sigmoid.

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation output.

        Returns:
            An array of the same shape as `gradOutput`."""
        return self.sigmoid(self._x) * gradOutput


class ELU(Activation):
    """Implements ELU activation layer: `x` for `x > 0`, `alpha * (exp(x) - 1)` otherwise."""

    def __init__(self, alpha: float = 1):
        """Initializes ELU layer.

        Arguments:
            alpha: the alpha coefficient scaling the negative branch."""
        # Run the base initializer so `input` is usable (returns None) before
        # the first forward pass instead of raising AttributeError.
        super().__init__()
        self.alpha = alpha

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes ELU activation output and caches the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`."""
        self._x = x
        self._input = x
        # np.where evaluates both branch arrays in full. Clamping the exponent
        # at 0 leaves the selected (x <= 0) values untouched while keeping
        # np.exp from overflowing on large positive entries that are discarded.
        return np.where(x > 0, x, self.alpha * (np.exp(np.minimum(x, 0)) - 1))

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation output.
                An array of the same shape as the array received in `__call__`.

        Returns:
            An array of the same shape as `gradOutput`: `gradOutput` where the
            cached input is positive, `alpha * exp(x) * gradOutput` elsewhere."""
        # Same clamp as in the forward pass: avoids inf (and inf * 0 = nan)
        # in the branch that np.where throws away.
        negative_branch = self.alpha * np.exp(np.minimum(self._x, 0)) * gradOutput
        return np.where(self._x > 0, gradOutput, negative_branch)

class Sigmoid(Activation):
    """Implements Sigmoid activation layer: `1 / (1 + exp(-x))`."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes Sigmoid activation output and caches the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`."""
        self._x = x
        self._input = x
        return self.sigmoid(x)

    def sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Element-wise logistic sigmoid `1 / (1 + exp(-x))`.

        Delegates to `scipy.special.expit`, which evaluates the function
        without the overflow warnings that `np.exp(-x)` produces for large
        negative `x`."""
        return expit(x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        Uses s'(x) = s(x) * (1 - s(x)).

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation output.

        Returns:
            An array of the same shape as `gradOutput`."""
        # Evaluate the sigmoid once and reuse it for both factors.
        s = self.sigmoid(self._x)
        return s * (1 - s) * gradOutput


class Tanh(Activation):
    """Tanh activation layer (element-wise hyperbolic tangent)."""

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Forward pass; stores `x` for the backward pass.

        Arguments:
            x: Input array of shape (`batch_size`, ...)

        Returns:
            An array of the same shape as `x`."""
        self._input = x
        self._x = x
        return np.tanh(x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Backward pass: multiplies `gradOutput` by d/dx tanh(x) = 1 / cosh(x)^2.

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation output.

        Returns:
            An array of the same shape as `gradOutput`."""
        cosh_squared = np.square(np.cosh(self._x))
        local_derivative = 1 / cosh_squared
        return local_derivative * gradOutput


class Softmax(Activation):
    """Implements Softmax activation layer (normalizes each row over axis 1)."""

    @staticmethod
    def _softmax(x: np.ndarray) -> np.ndarray:
        """Row-wise softmax of a (`batch_size`, `n_features`) array, computed stably."""
        x = np.asarray(x)
        if x.size == 0:
            # Nothing to normalize; also avoids taking the max of an empty axis.
            return np.exp(x)
        # Softmax is invariant to adding a per-row constant. Subtracting the
        # row maximum keeps every exponent <= 0, so np.exp cannot overflow
        # and produce inf / inf = nan.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=1, keepdims=True)

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Computes Softmax activation output and caches the input for `grad`.

        Arguments:
            x: Input array of shape (`batch_size`, `n_features`)

        Returns:
            An array of the same shape as `x`"""
        self._x = x
        self._input = x
        return self._softmax(x)

    def grad(self, gradOutput: np.ndarray) -> np.ndarray:
        """Computes loss gradient with respect to the activation input.

        Arguments:
            gradOutput: Gradient of the loss with respect to the activation output,
                of shape (`batch_size`, `n_features`).

        Returns:
            An array of the same shape as `gradOutput`"""
        sm = self._softmax(self._x)
        # The per-sample Jacobian is diag(s) - s s^T, so
        #   (g @ J)_j = s_j * g_j - s_j * sum_i(g_i * s_i) = s_j * (g_j - <g, s>).
        # Using the closed form avoids materializing a (batch, n, n) tensor.
        weighted_sum = np.sum(gradOutput * sm, axis=1, keepdims=True)
        return sm * (gradOutput - weighted_sum)
        
        

In [58]:
# Toy batch for the checks below: 3 samples x 5 features holding the integers 1..15.
x = np.arange(1, 16).reshape(3, 5)
# Stand-in upstream gradient, same shape as `x`.
grad_x = 0.8 * x

In [59]:
# Softmax check: run the forward pass on `x` (result discarded), then display
# the gradient with respect to the input for the upstream gradient `grad_x`.
a = Softmax()
a(x)
a.grad(grad_x)

array([[-0.0321893 , -0.06215166, -0.10004289, -0.08464745,  0.2790313 ],
       [-0.0321893 , -0.06215166, -0.10004289, -0.08464745,  0.2790313 ],
       [-0.0321893 , -0.06215166, -0.10004289, -0.08464745,  0.2790313 ]])

In [60]:
# ReLU check: forward and backward results are discarded; only `a.input`
# (the array stored by the forward pass) is displayed.
a = ReLU()
a(x)
a.grad(grad_x)
a.input

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15]])

In [61]:
# LeakyReLU check with the default slope: forward pass on `x` (result
# discarded), then display the input gradient for `grad_x`.
a = LeakyReLU()
a(x)
a.grad(grad_x)

array([[ 0.8,  1.6,  2.4,  3.2,  4. ],
       [ 4.8,  5.6,  6.4,  7.2,  8. ],
       [ 8.8,  9.6, 10.4, 11.2, 12. ]])

In [62]:
# GeLU check: forward pass on `x` (result discarded), then display the input
# gradient for `grad_x`.
a = GeLU()
a(x)
a.grad(grad_x)

array([[ 0.86665238,  1.73637088,  2.42866955,  3.20161168,  4.00002859],
       [ 4.80000017,  5.6       ,  6.4       ,  7.2       ,  8.        ],
       [ 8.8       ,  9.6       , 10.4       , 11.2       , 12.        ]])

In [63]:
# SiLU check: forward pass on `x` (result discarded), then display the input
# gradient for `grad_x`.
a = SiLU()
a(x)
a.grad(grad_x)

array([[ 0.74213641,  1.7452548 ,  2.61144985,  3.36852677,  4.10618973],
       [ 4.85916688,  5.63057878,  6.41501793,  7.20710654,  8.00326848],
       [ 8.8014697 ,  9.60064882, 10.40028209, 11.20012107, 12.00005139]])

In [64]:
# ELU check with the default alpha: forward pass on `x` (result discarded),
# then display the input gradient for `grad_x`.
a = ELU()
a(x)
a.grad(grad_x)

array([[ 0.8,  1.6,  2.4,  3.2,  4. ],
       [ 4.8,  5.6,  6.4,  7.2,  8. ],
       [ 8.8,  9.6, 10.4, 11.2, 12. ]])

In [65]:
# Softplus check: forward pass on `x` (result discarded), then display the
# input gradient for `grad_x`.
a = Softplus()
a(x)
a.grad(grad_x)

array([[ 0.58484686,  1.40927532,  2.2861779 ,  3.14244413,  3.9732286 ],
       [ 4.78813141,  5.59489811,  6.39785376,  7.19911156,  7.99963682],
       [ 8.79985303,  9.59994102, 10.39997649, 11.19999069, 11.99999633]])

In [66]:
# Sigmoid check: forward pass on `x` (result discarded), then display the
# input gradient for `grad_x`.
a = Sigmoid()
a(x)
a.grad(grad_x)

array([[1.57289547e-01, 1.67989737e-01, 1.08423983e-01, 5.65206599e-02,
        2.65922267e-02],
       [1.18392446e-02, 5.09723861e-03, 2.14552109e-03, 8.88331318e-04,
        3.63166462e-04],
       [1.46970058e-04, 5.89837138e-05, 2.35073196e-05, 9.31310617e-06,
        3.67082560e-06]])

In [67]:
# Tanh check: forward pass on `x` (result discarded), then display the input
# gradient for `grad_x`.
a = Tanh()
a(x)
a.grad(grad_x)

array([[3.35979473e-01, 1.13041320e-01, 2.36784892e-02, 4.29104219e-03,
        7.26332924e-04],
       [1.17967428e-04, 1.86262123e-05, 2.88089982e-06, 4.38623403e-07,
        6.59569156e-08],
       [9.81892768e-09, 1.44965166e-09, 2.12538104e-10, 3.09765125e-11,
        4.49165903e-12]])

In [68]:
# Small integer matrix for checking the einsum index patterns below by hand.
# NOTE: this rebinds `a`, which the earlier cells used for activation objects.
a = np.array([[1, 0],
               [2, 3]])

In [69]:
# Negated per-row outer product: result[n, i, j] = -a[n, i] * a[n, j].
-np.einsum('...i,...j->...ij', a, a)

array([[[-1,  0],
        [ 0,  0]],

       [[-4, -6],
        [-6, -9]]])

In [70]:
# Display `a` again for reference.
a

array([[1, 0],
       [2, 3]])

In [71]:
# Identity matrix whose size equals the number of columns of `a`.
b = np.eye(a.shape[1])

In [72]:
# Places each row of `a` on the diagonal of its own matrix:
# result[i, j, k] = a[i, j] * b[j, k], and `b` is the identity.
np.einsum('ij,jk->ijk', a, b)

array([[[1., 0.],
        [0., 0.]],

       [[2., 0.],
        [0., 3.]]])

In [73]:
# Per-row diag(a_n) - outer(a_n, a_n). This has the algebraic form of a softmax
# Jacobian (diag(s) - s s^T), evaluated here on plain integers rather than on
# softmax outputs — presumably to verify the index patterns by hand.
res = np.einsum('ij,jk->ijk', a, b) - np.einsum('...i,...j->...ij', a, a)
res

array([[[ 0.,  0.],
        [ 0.,  0.]],

       [[-2., -6.],
        [-6., -6.]]])

In [74]:
# Per-row vector-matrix product: result[n, j] = sum_i a[n, i] * res[n, i, j].
np.einsum('...i,...ij->...j', a, res)

array([[  0.,   0.],
       [-22., -30.]])