# Fake Dataset

- generate 128 dots around a straight line.
- plot the fake dataset
- we know $w$ and $b$, but we pretend we know nothing

In [None]:
import numpy as np
import matplotlib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import util


%matplotlib inline

In [None]:
def secret_dots(w=0.9, b=0.2, num_dots=128, noise=0.05):
    """
    generate dots scattered around the straight line y = w * x + b

    Calling it without arguments gives 128 dots around y = 0.9 * x + 0.2.

    Args:
        w: slope of the hidden line.
        b: intercept of the hidden line.
        num_dots: number of dots to generate.
        noise: half-width of the uniform noise that is added to every y.

    Returns:
        (xs, ys): two float arrays of shape (num_dots, 1). xs is drawn
        uniformly from [0.0, 5.0), ys is xs * w + b plus uniform noise drawn
        from [-noise, noise).
    """
    xs = np.random.uniform(low=0.0, high=5.0, size=(num_dots, 1))

    # NOTE: the noise has the same shape as xs, one offset per dot
    ys = xs * w + b + np.random.uniform(low=-noise, high=noise, size=xs.shape)

    return xs, ys

# NOTE: seed the global generator so that a fresh "Restart & Run All" draws
#       the same dots (and the same random picks in the cells below).
np.random.seed(0)

# NOTE: eigens are the x values (features), labels are the y values
eigens, labels = secret_dots()

# NOTE: no plt.clf() here. plt.subplots() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig, ax = plt.subplots(figsize=(10, 10), dpi=50)
ax.set_title('y = wx + b')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.plot(eigens, labels, '.')
plt.show()

# The Loss Function

![](./assets/001_loss.png)

We now have some dots (x = eigens, y = labels).

1. We **guess** that all the dots fall on a straight line: $y = wx + b$. But we do not know $w, b$ yet.
So we assign some random values to $w, b$. Our goal is to find good $w, b$ so that all the data falls on that line.

2. We select a dot $(x_i, y_i)$ from the secret dataset.

3. Applying $(x_i, y_i)$ to $y = wx + b$ gives us $z_i = wx_i + b$.

4. Apparently, $z_i$ is far away from $y_i$ because $w, b$ are not **the chosen ones**. We call the distance between $z_i$ & $y_i$ the **loss**. In this notebook the loss is the squared difference $(z_i - y_i)^2$.

What we want is to minimize the distance between $z_i$ and $y_i$.

**We wish** : change $w$ and $b$ slightly so that $z_i$ is also changed slightly to **decrease the loss**.

In [None]:
# NOTE: the line that the secret dots were generated around
secret_w = 0.9
secret_b = 0.2

# a closer point introduces lower loss
near_x, near_y = 3.0, 3.0
near_z = near_x * secret_w + secret_b
near_loss = (near_z - near_y) ** 2

# a further point introduces higher loss
far_x, far_y = 1.0, 4.0
far_z = far_x * secret_w + secret_b
far_loss = (far_z - far_y) ** 2

# NOTE: no plt.clf() here. plt.subplots() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig, ax = plt.subplots(figsize=(10, 10), dpi=50)
ax.set_xlabel('x')
ax.set_ylabel('y')

# NOTE: the secret dots, then the two hand-made dots with their losses
ax.plot(eigens, labels, '.')
ax.plot(near_x, near_y, 'ro', label='loss[near]: {}'.format(near_loss))
ax.plot(far_x, far_y, 'go', label='loss[far]: {}'.format(far_loss))

ax.legend(loc='lower right', prop={'size': 16})
plt.show()

In [None]:
# NOTE: we do not know w & b, assign some arbitrary numbers to them
w = 0.6
b = 0.5

# NOTE: choose a dot, the one with the largest x
max_x = np.argmax(eigens)

x1, y1 = eigens[max_x, 0], labels[max_x, 0]

# NOTE: evaluate w and b to get z1
z1 = w * x1 + b

# NOTE: if z1 equals y1, loss is 0 and we have nothing to improve.
#       but z1 does not equal y1 because w & b are arbitrary numbers.
loss = (z1 - y1) ** 2

# NOTE: generate some data to plot the loss function over w (b is fixed).
#       np.float64 is used because the np.float alias was deprecated and
#       later removed from NumPy.
loss_ws = (np.arange(201, dtype=np.float64) - 100.0) / 200.0 + 0.85
loss_wl = (loss_ws * x1 + b - y1) ** 2

# NOTE: generate some data to plot the loss function over b (w is fixed)
loss_bs = (np.arange(201, dtype=np.float64) - 100.0) / 50.0 + 1.8
loss_bl = (w * x1 + loss_bs - y1) ** 2

# NOTE: plot the loss function and the loss of z1.
#       no plt.clf() here. plt.subplots() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig, (ax_w, ax_b) = plt.subplots(1, 2, figsize=(20, 10), dpi=50)

# NOTE: plot loss v.s. w
ax_w.set_title('w')
ax_w.set_xlabel('w')
ax_w.set_ylabel('loss')
ax_w.plot(loss_ws, loss_wl, '-')
ax_w.plot(w, loss, 'ro')

# NOTE: plot loss v.s. b
ax_b.set_title('b')
ax_b.set_xlabel('b')
ax_b.set_ylabel('loss')
ax_b.plot(loss_bs, loss_bl, '-')
ax_b.plot(b, loss, 'go')

plt.show()

# Differential

$
\begin{align}
\mathcal L = loss = (z_1 - y_1)^2
\end{align}
$

if we increase $z_1$ a little bit, what will happen to $\mathcal L$ ?

$
\begin{align}
slope(\mathcal L, z_1) = \frac{\partial \mathcal L}{\partial z_1} = 2.0 \times (z_1 - y_1)
\end{align}
$

In [None]:
loss = (z1 - y1) ** 2

# the slope of the loss at z1
dl_dz = 2.0 * (z1 - y1)

# NOTE: how fast does the loss change when w changes?
#       chain rule: z1 = w * x1 + b, so dz1/dw = x1
dl_dw = dl_dz * x1

# NOTE: how fast does the loss change when b changes?
#       chain rule: dz1/db = 1
dl_db = dl_dz

# NOTE: generate some data to plot the loss function over w (b is fixed).
#       np.float64 is used because the np.float alias was deprecated and
#       later removed from NumPy.
loss_ws = (np.arange(201, dtype=np.float64) - 100.0) / 200.0 + 0.85
loss_wl = (loss_ws * x1 + b - y1) ** 2

# NOTE: generate some data to plot the loss function over b (w is fixed)
loss_bs = (np.arange(201, dtype=np.float64) - 100.0) / 50.0 + 1.8
loss_bl = (w * x1 + loss_bs - y1) ** 2

# NOTE: plot the loss function, the loss of z1 and the slope at that point.
#       no plt.clf() here. plt.subplots() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig, (ax_w, ax_b) = plt.subplots(1, 2, figsize=(20, 10), dpi=50)

# NOTE: plot loss v.s. w, the arrow follows the slope dl_dw
ax_w.set_title('loss / w')
ax_w.set_xlabel('w')
ax_w.set_ylabel('loss')
ax_w.plot(loss_ws, loss_wl, '-')
ax_w.plot(w, loss, 'ro')
ax_w.arrow(w, loss, 0.1, 0.1 * dl_dw, head_width=0.05, head_length=0.1, fc='k', ec='k')

# NOTE: plot loss v.s. b, the arrow follows the slope dl_db
ax_b.set_title('loss / b')
ax_b.set_xlabel('b')
ax_b.set_ylabel('loss')
ax_b.plot(loss_bs, loss_bl, '-')
ax_b.plot(b, loss, 'go')
ax_b.arrow(b, loss, 0.1, 0.1 * dl_db, head_width=0.05, head_length=0.1, fc='k', ec='k')

plt.show()

# Learn W & b through Chain Rule

$
\begin{align}
z_1 &= w \times x_1 + b \\
\mathcal L &= (z_1 - y_1)^2 \\
\frac{\partial z_1}{\partial w} &= x_1 \\
\frac{\partial z_1}{\partial b} &= 1 \\
\frac{\partial \mathcal L}{\partial z_1} &= \frac{\partial {(z_1 - y_1)^2}}{\partial z_1} = 2.0 \times (z_1 - y_1) \\
\frac{\partial \mathcal L}{\partial w} &= \frac{\partial \mathcal L}{\partial z_1} \frac{\partial z_1}{\partial w} = 2.0 \times (z_1 - y_1) \times x_1 \\
\frac{\partial \mathcal L}{\partial b} &= \frac{\partial \mathcal L}{\partial z_1} \frac{\partial z_1}{\partial b} = 2.0 \times (z_1 - y_1) \\
\end{align}
$

The slope tells us in which direction the loss goes up, so let's move $w, b$ in the opposite direction, toward the valley of the loss.

In [None]:
old_w, old_b = w, b

# NOTE: what is the guess if x is x1 wrt old_w & old_b?
old_z1 = old_w * x1 + old_b

# NOTE: loss
old_loss = (old_z1 - y1) ** 2

# NOTE: how fast does the loss change when z changes?
dl_dz = 2.0 * (old_z1 - y1)

# NOTE: how fast does the loss change when w changes?
dl_dw = dl_dz * x1

# NOTE: how fast does the loss change when b changes?
dl_db = dl_dz

# NOTE: update w & b
#       why subtract?
#       change learning rate to see the difference
learning_rate = 0.005

new_w = old_w - learning_rate * dl_dw
new_b = old_b - learning_rate * dl_db

# NOTE: update the guess and the loss based on new weight and new bias
new_z1 = new_w * x1 + new_b
new_loss = (new_z1 - y1) ** 2

# NOTE: if w & b are improved, the loss should have been lowered.
print('loss: {} -> {}'.format(old_loss, new_loss))

# NOTE: the losses when only w (or only b) is updated, so that the new points
#       stay on the curves plotted below.
new_w_loss = (new_w * x1 + old_b - y1) ** 2
new_b_loss = (old_w * x1 + new_b - y1) ** 2

# NOTE: plot the loss function, the old point and the new point.
#       loss_ws, loss_wl, loss_bs & loss_bl come from the previous cell.
#       no plt.clf() here. plt.subplots() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig, (ax_w, ax_b) = plt.subplots(1, 2, figsize=(20, 10), dpi=50)

# NOTE: plot loss v.s. w
ax_w.set_title('loss / w')
ax_w.set_xlabel('w')
ax_w.set_ylabel('loss')
ax_w.plot(loss_ws, loss_wl, '-')
ax_w.plot(old_w, old_loss, 'ro', label='old w')
ax_w.plot(new_w, new_w_loss, 'go', label='new w')
ax_w.arrow(old_w, old_loss, 0.1, 0.1 * dl_dw, head_width=0.05, head_length=0.1, fc='k', ec='k')
ax_w.legend(loc='upper right', prop={'size': 16})

# NOTE: plot loss v.s. b
ax_b.set_title('loss / b')
ax_b.set_xlabel('b')
ax_b.set_ylabel('loss')
ax_b.plot(loss_bs, loss_bl, '-')
ax_b.plot(old_b, old_loss, 'go', label='old b')
ax_b.plot(new_b, new_b_loss, 'ro', label='new b')
ax_b.arrow(old_b, old_loss, 0.1, 0.1 * dl_db, head_width=0.05, head_length=0.1, fc='k', ec='k')
ax_b.legend(loc='upper right', prop={'size': 16})

plt.show()


In [None]:
# NOTE: no plt.clf() here. plt.subplots() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig, ax = plt.subplots(figsize=(10, 10), dpi=50)
ax.set_xlabel('x')
ax.set_ylabel('y')

# NOTE: plot all the secret dots
ax.plot(eigens, labels, '.')

# NOTE: both lines are drawn between x = 0.0 and x = 5.0
line_xs = [0.0, 5.0]

# NOTE: plot the straight line of old_w & old_b
ax.plot(line_xs, [x * old_w + old_b for x in line_xs], 'ro-', label='old w & b')

# NOTE: plot the straight line of new_w & new_b
ax.plot(line_xs, [x * new_w + new_b for x in line_xs], 'go-', label='new w & b')

ax.legend(loc='lower right', prop={'size': 16})
plt.show()

# Put Everything Together

- change the learning rate to see the difference. why is it important?
- change the step to see the difference. why is it important?

In [None]:
# TODO: change learning_rate to: -0.0001, 0.0, 0.1, 0.01, 0.001
#       which one is better?
learning_rate = 0.01

# TODO: change num_steps to: 1, 10, 100, 1000,
#       which one is better?
num_steps = 10

# TODO: change learnt_w or learnt_b to large value and see the results
learnt_w, learnt_b = 0.6, 0.5

# NOTE: remember where we started, so that the starting line can be plotted
#       next to the learnt line no matter what other cells did to w & b.
init_w, init_b = learnt_w, learnt_b

losses = []

# NOTE: range works on both python 2 & 3 (xrange is python 2 only)
for _ in range(num_steps):
    # NOTE: pick a dot (as scalars, eigens & labels have one column)
    random_i = np.random.randint(eigens.shape[0])

    temp_x = eigens[random_i, 0]
    temp_y = labels[random_i, 0]

    # NOTE: evaluate learnt_w & learnt_b on the picked dot
    temp_z = learnt_w * temp_x + learnt_b

    # NOTE: get the loss
    temp_loss = (temp_z - temp_y) ** 2

    # NOTE: record the loss
    losses.append(temp_loss)

    # NOTE: if temp_z is changed, what will happen to loss?
    dl_dz = 2.0 * (temp_z - temp_y)

    # NOTE: if w is changed, what will happen to loss?
    dl_dw = dl_dz * temp_x

    # NOTE: if b is changed, what will happen to loss?
    dl_db = dl_dz

    # NOTE: change learnt_w & learnt_b a little bit, against the slope
    learnt_w = learnt_w - learning_rate * dl_dw
    learnt_b = learnt_b - learning_rate * dl_db

# NOTE: no plt.clf() here. plt.subplots() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig, (ax_line, ax_loss) = plt.subplots(1, 2, figsize=(20, 10), dpi=50)

# NOTE: plot the starting line (red) and the learnt straight line (blue)
line_xs = [0.0, 5.0]

ax_line.plot(eigens, labels, '.')
ax_line.plot(line_xs, [x * init_w + init_b for x in line_xs], 'r-')
ax_line.plot(line_xs, [x * learnt_w + learnt_b for x in line_xs], 'b-')

# NOTE: plot the loss of each iteration
ax_loss.plot(losses)

plt.show()

# Train a Linear Regression Model on KKTV Data Game 17.11 Dataset

* train_eigens: (45728 users, 32 weeks * 28 slots)
* train_labels: (45728 users, 28 slots of 33-th week)
* valid_eigens: (11431 users, 32 weeks * 28 slots)
* valid_labels: (11431 users, 28 slots of 33-th week)

We want to train a model with (train_eigens, train_labels), then validate the performance of the model on (valid_eigens, valid_labels)

In [None]:
dataset = np.load('./datasets/v0_eigens.npz')

# NOTE: read the array once and reuse it, instead of indexing the loaded
#       archive again for every use.
all_data = dataset['train_eigens']

# NOTE: keep the last 1/5 of the rows for validation.
#       // keeps the sizes integers on both python 2 & 3, a float size cannot
#       be used to slice.
train_data_size = all_data.shape[0]
valid_data_size = train_data_size // 5
train_data_size = train_data_size - valid_data_size

indices = np.arange(train_data_size + valid_data_size)

train_data = all_data[indices[:train_data_size]]
valid_data = all_data[indices[train_data_size:]]

# NOTE: the last 28 columns of each row are the labels, the rest are features
train_eigens = train_data[:, :-28]
train_labels = train_data[:, -28:]
valid_eigens = valid_data[:, :-28]
valid_labels = valid_data[:, -28:]

print('train_eigens.shape = {}'.format(train_eigens.shape))
print('train_labels.shape = {}'.format(train_labels.shape))
print('valid_eigens.shape = {}'.format(valid_eigens.shape))
print('valid_labels.shape = {}'.format(valid_labels.shape))

In [None]:
# NOTE: plot a cherry picked user (row 3 of the training data),
#       one image row per 28 slots
train_user_0_eigens = train_eigens[3].reshape(-1, 28)
train_user_0_labels = train_labels[3].reshape(-1, 28)

# NOTE: a tall panel for the features and a thin one for the labels
gs = matplotlib.gridspec.GridSpec(2, 1, height_ratios=[32, 1])

# NOTE: no plt.clf() here. plt.figure() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig = plt.figure(figsize=(20, 10), dpi=50)

ax_eigens = fig.add_subplot(gs[0])
ax_eigens.imshow(train_user_0_eigens, cmap='gray')

ax_labels = fig.add_subplot(gs[1])
ax_labels.imshow(train_user_0_labels, cmap='gray')

# NOTE: show the figure explicitly so that the cell does not echo the repr
#       of the last imshow() call.
plt.show()

# x W + b = Y

In [None]:
# NOTE: flatten the user picked in the previous cell into one row
train_user_0_eigens = train_user_0_eigens.reshape(1, -1)
train_user_0_labels = train_user_0_labels.reshape(1, -1)

# NOTE: size w & b from the data instead of hard-coding the numbers
num_eigens = train_user_0_eigens.shape[1]
num_labels = train_user_0_labels.shape[1]

random_weights = np.random.normal(size=(num_eigens, num_labels))
random_biases = np.random.normal(size=(1, num_labels))

# NOTE: one panel each for x, W, b & Y
gs = matplotlib.gridspec.GridSpec(1, 4, width_ratios=[16, 8, 4, 4])

# NOTE: no plt.clf() here. plt.figure() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig = plt.figure(figsize=(20, 10), dpi=50)

ax_x = fig.add_subplot(gs[0])
ax_x.imshow(train_user_0_eigens, aspect=100.0, cmap='gray')

ax_w = fig.add_subplot(gs[1])
ax_w.imshow(random_weights, aspect='auto', cmap='gray')

ax_b = fig.add_subplot(gs[2])
ax_b.imshow(random_biases, aspect=20.0, cmap='gray')

ax_y = fig.add_subplot(gs[3])
ax_y.imshow(train_user_0_labels, aspect=20.0, cmap='gray')

# NOTE: show the figure explicitly so that the cell does not echo the repr
#       of the last imshow() call.
plt.show()

# Linear Regression

![](./assets/002_nn.png)

$
\begin{align}
x_i = \begin{bmatrix} x_{i, 0}, \cdots , x_{i, 895} \end{bmatrix}
\end{align}
$

$
\begin{align}
y_i = \begin{bmatrix} y_{i, 0}, \cdots , y_{i, 27} \end{bmatrix}
\end{align}
$

$
\begin{align}
w = \begin{pmatrix}w_{0,0} && \cdots && w_{0,27} \\ \vdots && \ddots && \vdots \\ w_{895,0} && \cdots && w_{895,27} \end{pmatrix}
\end{align}
$

$
\begin{align}
b = \begin{bmatrix} b_{0}, \cdots , b_{27} \end{bmatrix}
\end{align}
$

* $x_i$ : $user_i$ 's features (32 weeks playback log).
* $y_i$ : $user_i$ 's label (33-th week playback log).
* we want to find (train) good $w$ and $b$ so that for all users, $x_i \cdot w + b = y_i $

Expected result: AUC ~ 0.773074102425

In [None]:
batch_size = 256
learning_rate = 0.00001

# NOTE: initialize w with small random numbers in [-0.01, 0.01) & b with zeros
w = (np.random.rand(train_eigens.shape[1], 28) * 2.0 - 1.0) * 0.01
b = np.zeros((1, 28))

# NOTE: get the AUC before training
valid_guesss = np.dot(valid_eigens, w) + b

auc = util.auc(valid_guesss.flatten(), valid_labels.flatten())

print('auc[before]: {}'.format(auc))

# NOTE: the batches get their own names so that the loop does not overwrite
#       the eigens & labels of the fake dataset used by the earlier cells.
# NOTE(review): the meaning of the last argument (False) of util.mini_batches
#               is not visible from this notebook - confirm in util.
for step, batch_eigens, batch_labels in util.mini_batches(train_eigens, train_labels, batch_size, False):
    # NOTE: tensor version of y = xw + b
    y = np.dot(batch_eigens, w) + b

    # NOTE: tensor version of loss, one number per row of the batch
    loss = np.sum(np.square(y - batch_labels), axis=1)

    if step % 5000 == 0:
        print('loss[{:>8}]: {}'.format(step, np.mean(loss)))

    # NOTE: tensor version of chain rule
    dl_dy = 2.0 * (y - batch_labels)

    # NOTE: np.dot(x.T, dl_dy) has the same shape as w and already sums over
    #       the rows of the batch, so divide by the number of rows to get the
    #       mean. taking np.mean(..., axis=0) of it would average over the
    #       features instead and give every row of w the same update.
    # NOTE(review): the expected AUC quoted in the notebook may have been
    #               measured with the old update - re-check it.
    dl_dw = np.dot(batch_eigens.T, dl_dy) / batch_eigens.shape[0]
    dl_db = np.mean(dl_dy, axis=0)

    # NOTE: update w & b
    w = w - learning_rate * dl_dw
    b = b - learning_rate * dl_db

    if step == 30000:
        break

# NOTE: get the AUC after training
valid_guesss = np.dot(valid_eigens, w) + b

auc = util.auc(valid_guesss.flatten(), valid_labels.flatten())

print('auc[after]: {}'.format(auc))

In [None]:
image_inception = mpimg.imread('./datasets/inception.png')

# NOTE: no plt.clf() here. plt.subplots() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig, ax = plt.subplots(figsize=(20, 10), dpi=50)
ax.imshow(image_inception)

# NOTE: show the figure explicitly so that the cell does not echo the repr
#       of the imshow() call.
plt.show()

# One Hidden Layer

![](./assets/003_nn.png)

$
\begin{align}
h_1 &= \mathrm{ReLU}(x \cdot W_1 + b_1) \\
y &= h_1 \cdot W_2 + b_2
\end{align}
$

In [None]:
# NOTE: this cell is an exercise. until the TODOs in the loop are filled in,
#       w1 & b1 become None after the first step and the cell stops with an
#       error.
batch_size = 256
learning_rate = 0.00001

# NOTE: initialize the weights with small random numbers in [-0.01, 0.01)
#       & the biases with zeros. the hidden layer has 128 units.
w1 = (np.random.rand(train_eigens.shape[1], 128) * 2.0 - 1.0) * 0.01
b1 = np.zeros((1, 128))
w2 = (np.random.rand(128, 28) * 2.0 - 1.0) * 0.01
b2 = np.zeros((1, 28))

# NOTE: get the AUC before training
y1 = np.dot(valid_eigens, w1) + b1
y1[y1 < 0.0] = 0.0

# NOTE: the sigmoid squashes the guesses into (0, 1) without changing their
#       order. it is only applied here, not while training.
# NOTE(review): presumably util.auc only depends on the order - confirm.
y2 = np.dot(y1, w2) + b2
y2 = 1.0 / (1.0 + np.exp(-y2))

valid_guesss = y2

auc = util.auc(valid_guesss.flatten(), valid_labels.flatten())

print('auc[before]: {}'.format(auc))

# NOTE: the batches get their own names so that the loop does not overwrite
#       the eigens & labels of the fake dataset used by the earlier cells.
for step, batch_eigens, batch_labels in util.mini_batches(train_eigens, train_labels, batch_size, False):
    # NOTE: 1st hidden layer
    y1 = np.dot(batch_eigens, w1) + b1

    # NOTE: relu
    y1[y1 < 0.0] = 0.0

    # NOTE: output layer
    y2 = np.dot(y1, w2) + b2

    loss = np.sum(np.square(y2 - batch_labels), axis=1)

    if step % 5000 == 0:
        print('loss[{:>8}]: {}'.format(step, np.mean(loss)))

    dl_dy2 = 2.0 * (y2 - batch_labels)

    # NOTE: np.dot(y1.T, dl_dy2) has the same shape as w2 and already sums
    #       over the rows of the batch, so divide by the number of rows to
    #       get the mean. taking np.mean(..., axis=0) of it would average
    #       over the hidden units instead and give every row of w2 the same
    #       update.
    dl_dw2 = np.dot(y1.T, dl_dy2) / y1.shape[0]
    dl_db2 = np.mean(dl_dy2, axis=0)

    # NOTE: relu, the slope only flows through the units that were not
    #       clipped to 0
    dl_dy1 = np.dot(dl_dy2, w2.T) * (y1 > 0.0)

    # TODO: calculate dl_dw1
    #       hint: same pattern as dl_dw2, the input of this layer is
    #       batch_eigens
    dl_dw1 = None

    # TODO: calculate dl_db1
    #       hint: same pattern as dl_db2
    dl_db1 = None

    w2 = w2 - learning_rate * dl_dw2
    b2 = b2 - learning_rate * dl_db2

    # TODO: update w1 & b1
    w1 = None
    b1 = None

    if step == 30000:
        break

# NOTE: get the AUC after training
y1 = np.dot(valid_eigens, w1) + b1
y1[y1 < 0.0] = 0.0

y2 = np.dot(y1, w2) + b2
y2 = 1.0 / (1.0 + np.exp(-y2))

valid_guesss = y2

auc = util.auc(valid_guesss.flatten(), valid_labels.flatten())

# NOTE: ~0.81523530014
print('auc[after]: {}'.format(auc))

# Activation Function: ReLU (Rectified Linear Unit)

![](./assets/004_relu.png)

# Puzzle Time

Consider the linear function $x \cdot W + b = y$. If the best solution is to copy the activities of the 32nd week, what should $W$ & $b$ look like?

In [None]:
train_user_0_eigens = train_eigens[3].reshape(1, -1)
train_user_0_labels = train_labels[3].reshape(1, -1)

# NOTE: size w from the data, one row per feature & one column per slot
num_eigens = train_user_0_eigens.shape[1]

w_get_32 = np.zeros((num_eigens, 28))
b_get_32 = np.zeros((1, 28))

# TODO: overwrite w_get_32 & b_get_32 so that the results always equal to 32-th week.

z = np.matmul(train_user_0_eigens, w_get_32) + b_get_32

# NOTE: prints True only if all 28 slots of z equal the last 28 features
print(np.sum(z == train_user_0_eigens[0, -28:]) == 28)

# NOTE: a tall panel for w and a thin one for b
gs = matplotlib.gridspec.GridSpec(2, 1, height_ratios=[num_eigens, 1])

# NOTE: no plt.clf() here. plt.figure() already starts a fresh figure, and
#       plt.clf() with no open figure would first create a second, empty one.
fig = plt.figure(figsize=(20, 20), dpi=50)

ax_w = fig.add_subplot(gs[0])
ax_w.imshow(w_get_32, aspect='auto', cmap='gray')

ax_b = fig.add_subplot(gs[1])
ax_b.imshow(b_get_32, aspect='auto', cmap='gray')

# NOTE: show the figure explicitly so that the cell does not echo the repr
#       of the last imshow() call.
plt.show()