## It's time for the `classification` problem

We have spent our time on solving regression problems so far!<br>
For classification problems we need to build 2 things!

- Softmax
- Cross Entropy Loss

**But first, let's make a `slice` function** to manipulate tensors more easily!

## `Slice`

In [None]:
class GetItem(Function):
    """Differentiable indexing: the forward pass evaluates ``x[slices]``."""

    def __init__(self, slices):
        # Index expression applied verbatim inside ``x[...]`` in forward().
        self.slices = slices

    def forward(self, x):
        return x[self.slices]

    def backward(self, gy):
        (x,) = self.inputs
        # GetItemGrad is given the same index expression and the shape of
        # the original input so that it can place gy back where it came from.
        scatter = GetItemGrad(self.slices, x.shape)
        return scatter(gy)
    
def get_item(x, slices):
    """Index ``x`` with ``slices`` through the ``GetItem`` function."""
    indexer = GetItem(slices)
    return indexer(x)

`GetItem.backward` relies on a `GetItemGrad` function that does not exist yet, so let's make it.

In [None]:
class GetItemGrad(Function):
    """Backward counterpart of ``GetItem``.

    The forward pass scatters the incoming gradient ``gy`` into a zero array
    of the original input shape; the backward pass indexes again with the
    same ``slices``.
    """

    def __init__(self, slices, in_shape):
        self.slices = slices      # index expression used by GetItem.forward
        self.in_shape = in_shape  # shape of the array that was indexed

    def forward(self, gy):
        gx = np.zeros(self.in_shape)
        # np.add.at accumulates: an index that was selected several times in
        # the forward pass receives the sum of its gradients, whereas
        # ``gx[slices] = gy`` would keep only one of them.
        np.add.at(gx, self.slices, gy)
        return gx

    def backward(self, ggx):
        return get_item(ggx, self.slices)

- How does `np.add.at` work?

In [12]:
import numpy as np

a = np.zeros((2, 3))
a

array([[0., 0., 0.],
       [0., 0., 0.]])

In [13]:
b = np.ones((3,))
b

array([1., 1., 1.])

In [7]:
slices = 1
np.add.at(a, slices, b)
a

array([[0., 0., 0.],
       [1., 1., 1.]])

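Plain assignment gives the same result here :)! (Only because `a` started as zeros and the index appears once: `np.add.at` *adds* to the existing values and accumulates repeated indices, while `a[slices] = b` overwrites them.)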

In [14]:
a[slices] = b
a

array([[0., 0., 0.],
       [1., 1., 1.]])

### Test!

In [18]:
import numpy as np
from dezero import Variable
import dezero.functions as F

x = Variable(np.array([[1, 2, 3],
                       [4, 5, 6]]))
y = F.get_item(x, 1)
print(y)

Variable([4 5 6])


In [19]:
y.backward()
print(x.grad)

Variable([[0. 0. 0.]
          [1. 1. 1.]])


In [16]:
print(y.grad)

None


In [20]:
x = Variable(np.array([[1, 2, 3],
                       [4, 5, 6]]))
indices = np.array([0, 0, 1])
y = F.get_item(x, indices)
print(y)

Variable([[1 2 3]
          [1 2 3]
          [4 5 6]])


In [21]:
y.backward()
print(x.grad)

Variable([[2. 2. 2.]
          [1. 1. 1.]])


### How does `slices` handle `1:3`?

In [23]:
class IndiceCheck:
    """Probe object that shows what Python passes to ``__getitem__``."""

    def __getitem__(self, slices):
        # Echo the raw index object (an int, a ``slice``, ...); nothing is
        # returned, so the expression ``ic[...]`` evaluates to None.
        print(slices)
        return None

In [24]:
ic = IndiceCheck()

In [25]:
ic[1]

1


In [26]:
ic[1:3]

slice(1, 3, None)


In [25]:
ic[1]

1


In [29]:
F.get_item(x, slice(1, 3, None))

Variable([[4 5 6]])

In [27]:
slice(1, 3)

slice(1, 3, None)

## `Slice` for Variable `__getitem__`

```python
Variable.__getitem__ = F.get_item
```

In [21]:
y.backward()
print(x.grad)

Variable([[2. 2. 2.]
          [1. 1. 1.]])


In [1]:
import numpy as np
from dezero import Variable

x = Variable(np.array([[1, 2, 3],
                       [4, 5, 6]]))
y = x[1]
print(y)

Variable([4 5 6])


In [2]:
y.backward()
print(x.grad)

Variable([[0. 0. 0.]
          [1. 1. 1.]])


In [3]:
x = Variable(np.array([[1, 2, 3],
                       [4, 5, 6]]))
y = x[:,2]
print(y)

Variable([3 6])


In [4]:
y.backward()
print(x.grad)

Variable([[0. 0. 1.]
          [0. 0. 1.]])


## `Softmax` Function

## $p_{k} = \frac{exp(y_{k})}{\sum^{n}_{i=1} exp(y_{i})}$

In [5]:
from dezero.models import MLP

model = MLP((10, 3))

In [6]:
x = np.array([[0.2, -0.4]])
y = model(x)
print(y)

Variable([[0.44170413 0.46544621 0.30717265]])


`yield from` used to make generator inside generator

In [7]:
from dezero import Variable, as_variable
import dezero.functions as F

def softmax1d(x):
    """Softmax normalised over *all* elements of ``x`` (no axis argument)."""
    exps = F.exp(as_variable(x))
    # F.sum is called without an axis, so the total spans every element.
    total = F.sum(exps)
    return exps / total

In [11]:
x = Variable(np.array([[0.2, -0.4]]))
y = model(x)
p = softmax1d(y)
print(p)

Variable([[0.34504752 0.35333769 0.30161479]])


- But what happens with a batch?

In [52]:
x = Variable(np.array([[0.2, -0.4],
                       [0.3, -0.5]]))
y = model(x)
p = softmax1d(y)
print(p)

Variable([[0.17299718 0.17715364 0.15122123]
          [0.17739957 0.16770525 0.15352312]])


We can see it breaks: `F.sum(y)` adds up the whole batch, so each row no longer sums to 1!

In [53]:
def softmax_simple(x, axis=1):
    """Softmax along ``axis``, composed from ``F.exp``, ``F.sum`` and division."""
    exps = F.exp(as_variable(x))
    # keepdims=True keeps the reduced axis so the division broadcasts
    # against ``exps`` slice by slice.
    normalizer = F.sum(exps, axis=axis, keepdims=True)
    return exps / normalizer

In [54]:
x = Variable(np.array([[0.2, -0.4],
                       [0.3, -0.5]]))
y = model(x)
p = softmax_simple(y)
print(p)

Variable([[0.34504752 0.35333769 0.30161479]
          [0.35577543 0.33633344 0.30789113]])


It works again :)!

### Sidetalk! - `Max`, `Min`, `Clip`

In [62]:
x = np.array([[1, 2, 3],
              [3, 5, 4]])
y = x.max(axis=1, keepdims=True)
print(y)

[[3]
 [5]]


In [63]:
cond = (x == y)
cond

array([[False, False,  True],
       [False,  True, False]])

In [None]:
def max_backward_shape(x, axis):
    """Return ``x.shape`` with every reduced axis replaced by 1.

    This is the shape a ``max``/``min`` result would have had with
    ``keepdims=True``, which lets the backward pass reshape the output and its
    gradient so they broadcast against ``x``.

    Args:
        x: Object exposing ``ndim`` and ``shape``.
        axis: ``None`` (all axes), a single int, or an iterable of ints.
            Negative values count from the last axis, as in NumPy.

    Returns:
        list[int]: The keep-dims shape.
    """
    if axis is None:
        axes = range(x.ndim)
    elif isinstance(axis, int):
        axes = (axis,)
    else:
        axes = axis

    # Map negative axes onto their non-negative equivalents; otherwise they
    # would never match the indices produced by enumerate() below.
    reduced = {ax + x.ndim if ax < 0 else ax for ax in axes}

    return [1 if ax in reduced else size for ax, size in enumerate(x.shape)]

In [None]:
class Max(Function):
    """Maximum along ``axis``; the gradient goes to positions equal to the result."""

    def __init__(self, axis=None, keepdims=False):
        self.axis = axis
        self.keepdims = keepdims

    def forward(self, x):
        return x.max(axis=self.axis, keepdims=self.keepdims)

    def backward(self, gy):
        x = self.inputs[0]
        # The stored output is called to obtain the output object
        # (presumably a weak reference - confirm against Function).
        y = self.outputs[0]()

        # Put the reduced axes back as size-1 dims so that both the output
        # and its gradient can be compared / broadcast against x.
        keep_shape = utils.max_backward_shape(x, self.axis)
        gy = gy.reshape(keep_shape)
        y = y.reshape(keep_shape)

        # True wherever the input equals the reduced value; tied elements
        # are all marked.
        selected = (x.data == y.data)
        spread = broadcast_to(gy, selected.shape)

        return spread * selected
    

class Min(Max):
    """Minimum along ``axis``; only ``forward`` differs from ``Max``."""

    def forward(self, x):
        # backward() is inherited: it masks on equality with the output, so
        # it applies to the minimum as well.
        return x.min(axis=self.axis, keepdims=self.keepdims)
    
def max(x, axis=None, keepdims=False):
    """Apply ``Max(axis, keepdims)`` to ``x``.

    Note: this name shadows the builtin ``max`` in the defining namespace.
    """
    reducer = Max(axis, keepdims)
    return reducer(x)

def min(x, axis=None, keepdims=False):
    """Apply ``Min(axis, keepdims)`` to ``x``.

    Note: this name shadows the builtin ``min`` in the defining namespace.
    """
    reducer = Min(axis, keepdims)
    return reducer(x)

In [66]:
print(x)
np.clip(x, 2, 3)

[[1 2 3]
 [3 5 4]]


array([[2, 2, 3],
       [3, 3, 3]])

In [70]:
mask = (x >= 2) * (x <=3)
mask

array([[False,  True,  True],
       [ True, False, False]])

In [71]:
y = [[1, 1, 1],
     [2, 2, 2]]
y * mask

array([[0, 1, 1],
       [2, 0, 0]])

In [None]:
class Clip(Function):
    """Clamp values into ``[x_min, x_max]``; gradient passes only inside that range."""

    def __init__(self, x_min, x_max):
        self.x_min, self.x_max = x_min, x_max

    def forward(self, x):
        return np.clip(x, self.x_min, self.x_max)

    def backward(self, gy):
        (x,) = self.inputs
        data = x.data
        # Boolean mask, True where the input lies within the bounds
        # (both bounds inclusive); everywhere else the gradient is zeroed.
        inside = (data >= self.x_min) * (data <= self.x_max)
        return gy * inside
    
def clip(x, x_min, x_max):
    """Apply ``Clip(x_min, x_max)`` to ``x``."""
    clipper = Clip(x_min, x_max)
    return clipper(x)

### Again! general `Softmax`

## $p_{k} = \frac{exp(y_{k})}{\sum^{n}_{i=1} exp(y_{i})}$

In [84]:
x = np.array([[0.2, -0.4, 0.4],
              [0.3, -0.5, 0.7]])

# Subtract each row's maximum, so the largest entry of every row becomes 0
# (and its exponential becomes 1).
x_max = x.max(axis=1, keepdims=True)
x_after_max = x - x_max
print(x)
print(x_max)
print(x_after_max)

# Softmax computed from the shifted values ...
y = np.exp(x_after_max)
y_sum = y.sum(axis=1, keepdims=True)
print(y / y_sum)

# ... and from the raw values, printed for comparison with the result above.
y = np.exp(x)
y_sum = y.sum(axis=1, keepdims=True)
print(y / y_sum)

[[ 0.2 -0.4  0.4]
 [ 0.3 -0.5  0.7]]
[[0.4]
 [0.7]]
[[-0.2 -0.8  0. ]
 [-0.4 -1.2  0. ]]
[[0.36098289 0.19811161 0.4409055 ]
 [0.34000264 0.15277303 0.50722433]]
[[0.36098289 0.19811161 0.4409055 ]
 [0.34000264 0.15277303 0.50722433]]


### `i`th input $y_{i}$ `j`'th output $p_{j}$

### When `i` = `j`

## $\frac{ \partial{p_{i}} }{ \partial{y_{i}} } =
\frac{ \partial{ \frac{exp(y_{i})}{\sum^{n}_{k=1} exp(y_{k})} } }
     { \partial{y_{i}} }$
     
---
$y = \frac{f(x)}{g(x)}
\;\Rightarrow\; \frac{\partial{y}}{\partial{x}}
= \frac{f'(x)g(x) - f(x)g'(x)}{g(x)^{2}}$
---
---

## $\frac{ \partial{p_{i}} }{ \partial{y_{i}} } =
\frac{ exp(y_{i})\sum^{n}_{k=1} exp(y_{k})\ -\ exp(y_{i})exp(y_{i}) }
     { (\sum^{n}_{k=1} exp(y_{k}))^{2} }$
## $\quad\; =
\frac{ exp(y_{i}) \big[ \sum^{n}_{k=1} exp(y_{k})\ -\ exp(y_{i}) \big] }
     { (\sum^{n}_{k=1} exp(y_{k}))^{2} }$
## $\quad\; =
\frac{ exp(y_{i}) }
     { \sum^{n}_{k=1} exp(y_{k}) }
\frac{ \sum^{n}_{k=1} exp(y_{k})\ -\ exp(y_{i}) }
     { \sum^{n}_{k=1} exp(y_{k}) }
$
## $\quad\; =
\frac{ exp(y_{i}) }
     { \sum^{n}_{k=1} exp(y_{k}) }
\bigg(
1 - 
\frac{ exp(y_{i}) }
     { \sum^{n}_{k=1} exp(y_{k}) }
\bigg)
$
---

## $\therefore \frac{ \partial{p_{i}} }{ \partial{y_{i}} } = p_{i}(1 - p_{i})$

### `i`th input $y_{i}$ `j`'th output $p_{j}$

### When `i` $\neq$ `j`

## $\frac{ \partial{p_{j}} }{ \partial{y_{i}} } =
\frac{ \partial{ \frac{exp(y_{j})}{\sum^{n}_{k=1} exp(y_{k})} } }
     { \partial{y_{i}} }$
     
---
$y = \frac{f(x)}{g(x)}
\;\Rightarrow\; \frac{\partial{y}}{\partial{x}}
= \frac{f'(x)g(x) - f(x)g'(x)}{g(x)^{2}}$
---
---

## $\frac{ \partial{p_{j}} }{ \partial{y_{i}} } =
\frac{ 0\sum^{n}_{k=1} exp(y_{k})\ -\ exp(y_{j})exp(y_{i}) }
     { (\sum^{n}_{k=1} exp(y_{k}))^{2} }$
## $\quad\; =
\frac{ - exp(y_{j})exp(y_{i}) }
     { (\sum^{n}_{k=1} exp(y_{k}))^{2} }$
## $\quad\; =
-
\frac{ exp(y_{j}) }
     { \sum^{n}_{k=1} exp(y_{k}) }
\frac{ exp(y_{i}) }
     { \sum^{n}_{k=1} exp(y_{k}) }
$

---

## $\therefore \frac{ \partial{p_{j}} }{ \partial{y_{i}} } = -p_{j}p_{i}$

## $\frac{ \partial{p} }{ \partial{y} }
=
\begin{bmatrix}
p_{1}(1 - p_{1}) & -p_{2}p_{1}      & -p_{3}p_{1} \\
-p_{1}p_{2}      & p_{2}(1 - p_{2}) & -p_{3}p_{2} \\
-p_{1}p_{3}      & -p_{2}p_{3}      & p_{3}(1 - p_{3}) 
\end{bmatrix}
$

### When we look at the gradient for $y_{1}$ (the first row summed, i.e. the upstream gradient of every $p_{j}$ is 1)...

### $p_{1}(1 - p_{1}) - p_{2}p_{1} - p_{3}p_{1}
= p_{1}(1 - p_{1} - p_{2} - p_{3}) = p_{1}(1 - 1) = 0$

### Every gradient turns to zero :0 (the outputs always sum to 1, so pushing all of them up equally changes nothing)

In [None]:
class Softmax(Function):
    """Softmax along ``axis`` with a hand-written backward pass."""

    def __init__(self, axis=1):
        self.axis = axis

    def forward(self, x):
        # Shift by the maximum along the axis before exponentiating; the
        # result is the same softmax but the largest exponent is exp(0).
        shifted = x - x.max(axis=self.axis, keepdims=True)
        exps = np.exp(shifted)
        exps /= exps.sum(axis=self.axis, keepdims=True)
        return exps

    def backward(self, gy):
        y = self.outputs[0]()
        # gx = y * gy - y * sum(y * gy): the Jacobian-vector product of the
        # softmax, using dp_j/dy_i = p_i (delta_ij - p_j).
        weighted = y * gy
        total = weighted.sum(axis=self.axis, keepdims=True)
        return weighted - (y * total)
    
def softmax(x, axis=1):
    """Apply ``Softmax(axis)`` to ``x``."""
    fn = Softmax(axis)
    return fn(x)

In [2]:
from dezero import Variable
import dezero.functions as F

x = Variable(np.array([[0.2, -0.4],
                       [0.3, -0.5]]))
p = F.softmax(x)
print(p)

Variable([[0.64565631 0.35434369]
          [0.68997448 0.31002552]])


In [3]:
p.backward()

In [4]:
x.grad

Variable([[1.11022302e-16 5.55111512e-17]
          [0.00000000e+00 0.00000000e+00]])

We can see that the grad is **0** :0

## `Cross Entropy Loss` Function

### $L = - {\sum\limits^{n}_{k=1} t_{k}log p_{k}}$

when $t = (0, 0, 1)$ and $p = (p_{0}, p_{1}, p_{2})$

### $L = -log p_{2}$

---

So we can say...

### $L = -log p[t]$
- Slicing!


In [10]:
from dezero import Variable, as_variable
import dezero.functions as F

def softmax_cross_entropy_simple(x, t):
    """Mean cross-entropy between ``softmax(x)`` and integer labels ``t``.

    Built step by step from differentiable functions (softmax, clip, log,
    indexing, sum).
    """
    x = as_variable(x)
    t = as_variable(t)
    batch = x.shape[0]

    # Clip the probabilities away from 0 so that log() never sees 0.
    probs = F.clip(F.softmax(x), 1e-15, 1.0)
    log_probs = F.log(probs)
    # For every row pick the log-probability at that row's label index.
    picked = log_probs[np.arange(batch), t.data]
    return -1 * F.sum(picked) / batch

In [11]:
x = np.array([[0.2, 0.3, 0.5],
              [0.6, 0.1, 0.3]])
t = np.array([2, 0])
softmax_cross_entropy_simple(x, t)

Variable(0.8965588080868547)

In [17]:
x = Variable(np.array([[0.2, 0.3, 5],
                       [12, 0.1, 0.3]]))
t = Variable(np.array([2, 0]))
loss = softmax_cross_entropy_simple(x, t)
loss

Variable(0.008595870610124774)

In [18]:
loss.backward()
x.grad

Variable([[ 4.04479731e-03  4.47019236e-03 -8.51498967e-03]
          [-7.54199822e-06  3.39515119e-06  4.14684703e-06]])

We can check that a non-zero gradient flows back once `Cross Entropy` is attached!

## `Softmax Cross Entropy Loss`!

---

### At `Softmax`

### `i` = `j`

## $\frac{ \partial{p_{i}} }{ \partial{y_{i}} } = p_{i}(1 - p_{i})$

### `i` $\neq$ `j`

## $\frac{ \partial{p_{j}} }{ \partial{y_{i}} } = -p_{j}p_{i}$

---

### At `Cross Entropy`

### $L = - {\sum\limits^{n}_{k=1} t_{k}log p_{k}}$

## $\frac{ \partial{L} }{ \partial{y_{i}} } =
\frac{ \partial{ \big( - \sum^{n}_{k=1} t_{k}log p_{k} } \big) }
     { \partial{y_{i}} }$
     
## $\quad\; =
-
\sum^{n}_{k=1} t_{k} \frac{\partial{log p_{k}}}{\partial{y_{i}}}
$

## $\quad\; =
-
\sum^{n}_{k=1} t_{k} \frac{1}{p_{k}} \frac{\partial{p_{k}}}{\partial{y_{i}}}
$

---

### The softmax gradient $\frac{\partial{p_{k}}}{\partial{y_{i}}}$ works differently when $k = i$ and when $k \neq i$

### $- \sum^{n}_{k=1} t_{k} \frac{1}{p_{k}} \frac{\partial{p_{k}}}{\partial{y_{i}}}
= - \frac{t_{i}}{p_{i}} p_{i}(1 - p_{i})
- \sum^{n}_{k \neq i} \frac{t_{k}}{p_{k}} (-p_{k}p_{i})$

### $\qquad\qquad\qquad = 
- t_{i} + t_{i}p_{i} + \sum^{n}_{k \neq i} t_{k}p_{i}
$

### $\qquad\qquad\qquad = 
- t_{i} + t_{i}p_{i} - t_{i}p_{i} + t_{i}p_{i} + \sum^{n}_{k \neq i} t_{k}p_{i}
$

### $\qquad\qquad\qquad = - t_{i} + \sum^{n}_{k=1} t_{k}p_{i}$
### $\qquad\qquad\qquad = - t_{i} + p_{i}\sum^{n}_{k=1} t_{k}$
### $\qquad\qquad\qquad = - t_{i} + p_{i}$
### $\qquad\qquad\qquad = p_{i} - t_{i}$

---

We can see the Gradient is pretty simple :)

### It's just $p - t$ = softmax(x) $ -\ t$

---

- `softmax`

### $p_{k} = \frac{exp(y_{k})}{\sum^{n}_{i=1} exp(y_{i})}$

- `log softmax`

### $log(p_{k}) = log(exp(y_{k})) - log(\sum^{n}_{i=1} exp(y_{i}))$
### $ \qquad\;\ = y_{k} - log(\sum^{n}_{i=1} exp(y_{i}))$

In [None]:
def log_softmax(x, axis=1):
    """Return ``log(softmax(x))`` along ``axis``.

    Uses the identity ``log p_k = x_k - log(sum_i exp(x_i))`` and evaluates
    the log-sum-exp term after shifting by the maximum along ``axis``.
    """
    peak = x.max(axis=axis, keepdims=True)

    # log(sum(exp(x))) == peak + log(sum(exp(x - peak)))
    shifted_sum = np.exp(x - peak).sum(axis=axis, keepdims=True)
    log_z = peak + np.log(shifted_sum)

    return x - log_z

`ravel` flattens an array, so all of its elements end up in a single `1d array`!

In [22]:
t = np.array([[2], [0], [1], [0]])
t.ravel()

array([2, 0, 1, 0])

### $L = - {\sum\limits^{n}_{k=1} t_{k}log p_{k}}$

## $\frac{ \partial{L} }{ \partial{y_{i}} } = p_{i} - t_{i}$

In [26]:
np.eye(3)[t.ravel()]

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [None]:
class SoftmaxCrossEntropy(Function):
    """Fused softmax + cross-entropy loss, averaged over the first axis of ``x``.

    ``x`` holds one row of scores per sample and ``t`` holds integer class
    indices; ``t`` is flattened with ``ravel`` in both passes, so labels of
    shape ``(N,)`` and ``(N, 1)`` are treated alike.
    """

    def forward(self, x, t):
        N = x.shape[0]
        log_p = log_softmax(x)
        # Pick, for every row, the log-probability of its labelled class.
        log_p = log_p[np.arange(N), t.ravel()]
        y = -log_p.sum() / np.float32(N)
        return y

    def backward(self, gy):
        x, t = self.inputs
        N, CLS_NUM = x.shape

        # The forward pass averaged over N, so the upstream gradient is
        # scaled by 1/N as well.
        gy = gy / N

        # p_i
        y = softmax(x)

        # t_i as one-hot rows. The labels are flattened exactly as in
        # forward(); without it, labels of shape (N, 1) would produce a
        # (N, 1, CLS_NUM) array that broadcasts to a wrong gradient shape.
        t_onehot = np.eye(CLS_NUM, dtype=t.dtype)[t.data.ravel()]

        # p_i - t_i
        gx = (y - t_onehot) * gy

        return gx
    
def softmax_cross_entropy(x, t):
    """Apply ``SoftmaxCrossEntropy`` to scores ``x`` and labels ``t``."""
    loss_fn = SoftmaxCrossEntropy()
    return loss_fn(x, t)

## Test!

In [21]:
import numpy as np

from dezero import Variable
from dezero import optimizers
import dezero.functions as F
from dezero.models import MLP

# Value passed to the Adam optimizer below.
lr = 0.01

# MLP constructed with sizes (10, 3); the labels below use three classes (0, 1, 2).
model = MLP((10, 3))
optimizer = optimizers.Adam(lr).setup(model)

# Four samples with two features each, and one integer class label per sample.
x = Variable(np.array([[0.2, -0.4],
                       [0.3, 0.5],
                       [1.3, -3.2],
                       [2.1, 0.3]]))
y = Variable(np.array([2, 0, 1, 0]))

iters = 1000

for i in range(iters):
    # Forward pass and loss on the whole (tiny) data set.
    y_pred = model(x)
    loss = F.softmax_cross_entropy(y_pred, y)
    
    # cleargrads() runs before every backward() - presumably so gradients
    # from the previous iteration do not accumulate (confirm in dezero).
    model.cleargrads()
    loss.backward()
    optimizer.update()
    
    # Print the loss every 100 iterations.
    if i % 100 == 0:
        print(loss)
    

Variable(1.2923647287521973)
Variable(0.8111358663141301)
Variable(0.40483469446083364)
Variable(0.2193841379002533)
Variable(0.12876382875701547)
Variable(0.08055995411287575)
Variable(0.05366404403450009)
Variable(0.03777797489187806)
Variable(0.027826497488328217)
Variable(0.021253890618229243)


In [22]:
y_pred

Variable([[ 0.0859358  -0.16529972  4.09196037]
          [ 3.98952045 -2.77720881  0.13626685]
          [-3.38982648  4.90290326  0.46156244]
          [ 4.97053061 -2.04978722 -3.63391973]])