# 03 - Multi-variable Linear Regression

<img width="200" src="https://i.imgur.com/hbPVe1T.png">


In [2]:
import tensorflow.compat.v1 as tf
import numpy as np

tf.enable_eager_execution()
tf.__version__

'2.6.0'

# Multi-variable linear regression
Predicting exam score - regression using three inputs (x1, x2, x3)

x1 (quiz 1) | x2 (quiz 2) | x3 (mid 1) | Y (final)
---- | ---- | ----| ----
73 | 80 | 75 | 152
93 | 88 | 93 | 185
89 | 91 | 90 | 180
96 | 98 | 100 | 196
73 | 66 | 70 | 142


Test Scores for General Psychology ( https://goo.gl/g2T8Kp )

# Matrix multiplication

## dot product(=scalar product, 내적)
<img src="https://www.mathsisfun.com/algebra/images/matrix-multiply-a.svg" >


https://www.mathsisfun.com/algebra/matrix-multiplying.html

# Multi-feature regression

### Hypothesis

$$ H(x) = w x + b $$

$$ H(x_1, x_2, x_3) = w_1 x_1 + w_2 x_2 + w_3 x_3 + b $$

# Hypothesis using matrix

$$ H(x_1, x_2, x_3) = \underline{w_1 x_1 + w_2 x_2 + w_3 x_3} + b $$

$$ w_1 x_1 + w_2 x_2 + w_3 x_3 $$ 

$$ \begin{pmatrix} w_{ 1 } & w_{ 2 } & w_{ 3 } \end{pmatrix}\cdot \begin{pmatrix} x_{ 1 } \\ x_{ 2 } \\ x_{ 3 } \end{pmatrix} $$

$$ WX $$ (W, X 는 matrix)

# Hypothesis without b

$$ H(x_1, x_2, x_3) = w_1 x_1 + w_2 x_2 + w_3 x_3 + b$$

$$ = b + w_1 x_1 + w_2 x_2 + w_3 x_3 $$

$$ = \begin{pmatrix} b & x_{ 1 } & x_{ 2 } & x_{ 3 } \end{pmatrix}\cdot \begin{pmatrix} 1 \\ w_{ 1 } \\ w_{ 2 } \\ w_{ 3 } \end{pmatrix} $$

$$ = XW $$



# Hypothesis using matrix 

### Many x instances

$$ \begin{pmatrix} x_{ 11 } & x_{ 12 } & x_{ 13 } \\ x_{ 21 } & x_{ 22 } & x_{ 23 } \\ x_{ 31 } & x_{ 32 } & x_{ 33 }\\ x_{ 41 } & x_{ 42 } & x_{ 43 }\\ x_{ 51 } & x_{ 52 } & x_{ 53 }\end{pmatrix} \cdot \begin{pmatrix} w_{ 1 } \\ w_{ 2 } \\ w_{ 3 } \end{pmatrix}=\begin{pmatrix} x_{ 11 }w_{ 1 }+x_{ 12 }w_{ 2 }+x_{ 13 }w_{ 3 } \\ x_{ 21 }w_{ 1 }+x_{ 22 }w_{ 2 }+x_{ 23 }w_{ 3 }\\ x_{ 31 }w_{ 1 }+x_{ 32 }w_{ 2 }+x_{ 33 }w_{ 3 } \\ x_{ 41 }w_{ 1 }+x_{ 42 }w_{ 2 }+x_{ 43 }w_{ 3 } \\ x_{ 51 }w_{ 1 }+x_{ 52 }w_{ 2 }+x_{ 53 }w_{ 3 } \end{pmatrix} $$

$$ [5, 3] \cdot [3, 1] = [5, 1] $$

$$ H(X) = XW $$

5는 데이터(instance)의 수, 3은 변수(feature)의 수, 1은 결과

# Hypothesis using matrix (n output)

$$ [n, 3] \cdot [?, ?] = [n, 2] $$

$$ H(X) = XW $$

* n은 데이터(instance)의 개수, 2는 결과 값의 개수로 주어진다.
* 이때, W [?, ?] ⇒ [3, 2]

# WX vs XW

### Theory (Lecture) :
 $$ H(x) = Wx + b  $$

### TensorFlow (Implementation) :

$$ H(X) = XW $$

# Simple Example (2 variables)

x1 | x2 | y
---- | ---- | ----
1  |  0  |  1
0  |  2  |  2
3  |  0  |  3
0  |  4  |  4
5  |  0  |  5

In [2]:
tf.set_random_seed(0)  # for reproducibility

In [5]:
x1_data = [1, 0, 3, 0, 5]
x2_data = [0, 2, 0, 4, 0]
y_data  = [1, 2, 3, 4, 5]

W1 = tf.Variable(tf.random_uniform([1], -10.0, 10.0))
W2 = tf.Variable(tf.random_uniform([1], -10.0, 10.0))
b  = tf.Variable(tf.random_uniform([1], -10.0, 10.0))

learning_rate = tf.Variable(0.001)

for i in range(1000+1):
    with tf.GradientTape() as tape:
        hypothesis = W1 * x1_data + W2 * x2_data + b
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))
    W1_grad, W2_grad, b_grad = tape.gradient(cost, [W1, W2, b])
    W1.assign_sub(learning_rate * W1_grad)
    W2.assign_sub(learning_rate * W2_grad)
    b.assign_sub(learning_rate * b_grad)

    if i % 50 == 0:
        print("{:5} | {:10.6f} | {:10.4f} | {:10.4f} | {:10.6f}".format(
          i, cost.numpy(), W1.numpy()[0], W2.numpy()[0], b.numpy()[0]))


    0 |  68.246712 |     1.3820 |    -1.0251 |  -5.506783
   50 |  27.503042 |     1.8613 |     0.1592 |  -4.936086
  100 |  12.634786 |     2.0407 |     0.9074 |  -4.581374
  150 |   6.841758 |     2.0925 |     1.3797 |  -4.347348
  200 |   4.468609 |     2.0931 |     1.6765 |  -4.182670
  250 |   3.447158 |     2.0750 |     1.8611 |  -4.058953
  300 |   2.976022 |     2.0519 |     1.9738 |  -3.960082
  350 |   2.733317 |     2.0288 |     2.0402 |  -3.876680
  400 |   2.587035 |     2.0073 |     2.0769 |  -3.803166
  450 |   2.482397 |     1.9876 |     2.0946 |  -3.736156
  500 |   2.396507 |     1.9695 |     2.1000 |  -3.673557
  550 |   2.319743 |     1.9527 |     2.0976 |  -3.614066
  600 |   2.248035 |     1.9368 |     2.0902 |  -3.556855
  650 |   2.179636 |     1.9216 |     2.0796 |  -3.501393
  700 |   2.113776 |     1.9070 |     2.0671 |  -3.447340
  750 |   2.050099 |     1.8928 |     2.0535 |  -3.394471
  800 |   1.988422 |     1.8790 |     2.0392 |  -3.342639
  850 |   1.92

# Simple Example (2 variables with Matrix)

In [4]:
x_data = [
    [1., 0., 3., 0., 5.],
    [0., 2., 0., 4., 0.]
]
y_data  = [1, 2, 3, 4, 5]

W = tf.Variable(tf.random_uniform([1, 2], -1.0, 1.0))
b = tf.Variable(tf.random_uniform([1], -1.0, 1.0))

learning_rate = tf.Variable(0.001)

for i in range(1000+1):
    with tf.GradientTape() as tape:
        hypothesis = tf.matmul(W, x_data) + b # (1, 2) * (2, 5) = (1, 5)
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))

        W_grad, b_grad = tape.gradient(cost, [W, b])
        W.assign_sub(learning_rate * W_grad)
        b.assign_sub(learning_rate * b_grad)
    
    if i % 50 == 0:
        print("{:5} | {:10.6f} | {:10.4f} | {:10.4f} | {:10.6f}".format(
            i, cost.numpy(), W.numpy()[0][0], W.numpy()[0][1], b.numpy()[0]))

    0 |  36.403778 |    -0.6231 |    -0.3508 |  -0.961774
   50 |   9.372900 |     0.2914 |     0.1682 |  -0.557764
  100 |   2.639858 |     0.7060 |     0.4867 |  -0.347756
  150 |   0.825069 |     0.8912 |     0.6846 |  -0.235665
  200 |   0.284990 |     0.9721 |     0.8088 |  -0.174012
  250 |   0.106844 |     1.0062 |     0.8873 |  -0.138953
  300 |   0.042677 |     1.0195 |     0.9372 |  -0.118279
  350 |   0.018044 |     1.0241 |     0.9690 |  -0.105598
  400 |   0.008188 |     1.0250 |     0.9893 |  -0.097477
  450 |   0.004138 |     1.0246 |     1.0022 |  -0.092026
  500 |   0.002439 |     1.0239 |     1.0104 |  -0.088173
  550 |   0.001710 |     1.0230 |     1.0156 |  -0.085299
  600 |   0.001384 |     1.0223 |     1.0188 |  -0.083036
  650 |   0.001227 |     1.0217 |     1.0207 |  -0.081161
  700 |   0.001142 |     1.0212 |     1.0218 |  -0.079538
  750 |   0.001088 |     1.0207 |     1.0224 |  -0.078080
  800 |   0.001046 |     1.0203 |     1.0227 |  -0.076735
  850 |   0.00

# Hypothesis without b

In [4]:
# 앞의 코드에서 bias(b)를 행렬에 추가
x_data = [
    [1., 1., 1., 1., 1.], # bias(b)
    [1., 0., 3., 0., 5.], 
    [0., 2., 0., 4., 0.]
]
y_data  = [1, 2, 3, 4, 5]

W = tf.Variable(tf.random_uniform([1, 3], -1.0, 1.0)) # [1, 3]으로 변경하고, b 삭제

learning_rate = 0.001
optimizer = tf.train.GradientDescentOptimizer(learning_rate)

for i in range(1000+1):
    with tf.GradientTape() as tape:
        hypothesis = tf.matmul(W, x_data) # b가 없다
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))

    grads = tape.gradient(cost, [W])
    optimizer.apply_gradients(grads_and_vars=zip(grads,[W]))
    if i % 50 == 0:
        print("{:5} | {:10.6f} | {:10.4f} | {:10.4f} | {:10.4f}".format(
            i, cost.numpy(), W.numpy()[0][0], W.numpy()[0][1], W.numpy()[0][2]))

    0 |  15.840134 |     0.9544 |    -0.7136 |     0.5361
   50 |   3.716712 |     1.1373 |     0.0144 |     0.5841
  100 |   1.046846 |     1.2111 |     0.3590 |     0.6044
  150 |   0.452741 |     1.2345 |     0.5237 |     0.6136
  200 |   0.315059 |     1.2348 |     0.6039 |     0.6188
  250 |   0.278092 |     1.2246 |     0.6443 |     0.6228
  300 |   0.263582 |     1.2098 |     0.6660 |     0.6268
  350 |   0.254260 |     1.1931 |     0.6788 |     0.6311
  400 |   0.246290 |     1.1756 |     0.6873 |     0.6356
  450 |   0.238807 |     1.1581 |     0.6938 |     0.6404
  500 |   0.231608 |     1.1406 |     0.6993 |     0.6453
  550 |   0.224640 |     1.1233 |     0.7043 |     0.6504
  600 |   0.217887 |     1.1063 |     0.7090 |     0.6554
  650 |   0.211338 |     1.0895 |     0.7135 |     0.6605
  700 |   0.204985 |     1.0730 |     0.7179 |     0.6655
  750 |   0.198824 |     1.0568 |     0.7222 |     0.6705
  800 |   0.192849 |     1.0408 |     0.7264 |     0.6755
  850 |   0.18

# Custom Gradient
* tf.train.GradientDescentOptimizer(): optimizer
* optimizer.apply_gradients(): update

In [6]:
# Multi-variable linear regression (1)

X = tf.constant([[1., 2.], 
                 [3., 4.]])
y = tf.constant([[1.5], [3.5]])

W = tf.Variable(tf.random_normal([2, 1]))
b = tf.Variable(tf.random_normal([1]))

# Create an optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)

n_epoch = 1000+1
print("epoch | cost")
for i in range(n_epoch):
    # Use tf.GradientTape() to record the gradient of the cost function
    with tf.GradientTape() as tape:
        y_pred = tf.matmul(X, W) + b
        cost = tf.reduce_mean(tf.square(y_pred - y))

    # calculates the gradients of the loss
    grads = tape.gradient(cost, [W, b])
    
    # updates parameters (W and b)
    optimizer.apply_gradients(grads_and_vars=zip(grads, [W, b]))
    if i % 50 == 0:
        print("{:5} | {:10.6f}".format(i, cost.numpy()))


epoch | cost
    0 |  21.726822
   50 |   0.258689
  100 |   0.176868
  150 |   0.120926
  200 |   0.082678
  250 |   0.056528
  300 |   0.038649
  350 |   0.026424
  400 |   0.018067
  450 |   0.012352
  500 |   0.008445
  550 |   0.005774
  600 |   0.003948
  650 |   0.002699
  700 |   0.001845
  750 |   0.001262
  800 |   0.000863
  850 |   0.000590
  900 |   0.000403
  950 |   0.000276
 1000 |   0.000189


# Predicting exam score
regression using three inputs (x1, x2, x3)

x1 (quiz 1) | x2 (quiz 2) | x3 (mid 1) | Y (final)
---- | ---- | ----| ----
73 | 80 | 75 | 152
93 | 88 | 93 | 185
89 | 91 | 90 | 180
96 | 98 | 100 | 196
73 | 66 | 70 | 142

In [7]:
tf.set_random_seed(0)  # for reproducibility


```python
x1 = [ 73.,  93.,  89.,  96.,  73.]
x2 = [ 80.,  88.,  91.,  98.,  66.]
x3 = [ 75.,  93.,  90., 100.,  70.]
Y  = [152., 185., 180., 196., 142.]

# weights
w1 = tf.Variable(10.)
w2 = tf.Variable(10.)
w3 = tf.Variable(10.)
b  = tf.Variable(10.)

hypothesis = w1 * x1 +  w2 * x2 + w3 * x3 + b        
```

In [8]:
# data and label
x1 = [ 73.,  93.,  89.,  96.,  73.]
x2 = [ 80.,  88.,  91.,  98.,  66.]
x3 = [ 75.,  93.,  90., 100.,  70.]
Y  = [152., 185., 180., 196., 142.]

# weights
w1 = tf.Variable(10.)
w2 = tf.Variable(10.)
w3 = tf.Variable(10.)
b  = tf.Variable(10.)

learning_rate = 0.000001

for i in range(1000+1):
    # tf.GradientTape() to record the gradient of the cost function
    with tf.GradientTape() as tape:
        hypothesis = w1 * x1 +  w2 * x2 + w3 * x3 + b
        cost = tf.reduce_mean(tf.square(hypothesis - Y))
    # calculates the gradients of the cost
    w1_grad, w2_grad, w3_grad, b_grad = tape.gradient(cost, [w1, w2, w3, b])
    
    # update w1,w2,w3 and b
    w1.assign_sub(learning_rate * w1_grad)
    w2.assign_sub(learning_rate * w2_grad)
    w3.assign_sub(learning_rate * w3_grad)
    b.assign_sub(learning_rate * b_grad)

    if i % 50 == 0:
      print("{:5} | {:12.4f}".format(i, cost.numpy()))

    0 | 5793889.5000
   50 |   64291.1484
  100 |     715.2902
  150 |       9.8462
  200 |       2.0152
  250 |       1.9252
  300 |       1.9210
  350 |       1.9177
  400 |       1.9145
  450 |       1.9114
  500 |       1.9081
  550 |       1.9050
  600 |       1.9018
  650 |       1.8986
  700 |       1.8955
  750 |       1.8923
  800 |       1.8892
  850 |       1.8861
  900 |       1.8829
  950 |       1.8798
 1000 |       1.8767


## Multi-variable linear regression (1)
*  random  초기화: tf.random_normal()


In [3]:
import tensorflow as tf
import numpy as np
# data and label
x1 = [ 73.,  93.,  89.,  96.,  73.]
x2 = [ 80.,  88.,  91.,  98.,  66.]
x3 = [ 75.,  93.,  90., 100.,  70.]
Y  = [152., 185., 180., 196., 142.]

# random weights
w1 = tf.Variable(tf.random_normal([1]))
w2 = tf.Variable(tf.random_normal([1]))
w3 = tf.Variable(tf.random_normal([1]))
b  = tf.Variable(tf.random_normal([1]))

learning_rate = 0.000001

for i in range(1000+1):
    # tf.GradientTape() to record the gradient of the cost function
    with tf.GradientTape() as tape:
        hypothesis = w1 * x1 +  w2 * x2 + w3 * x3 + b
        cost = tf.reduce_mean(tf.square(hypothesis - Y))
    # calculates the gradients of the cost
    w1_grad, w2_grad, w3_grad, b_grad = tape.gradient(cost, [w1, w2, w3, b])
    
    # update w1,w2,w3 and b
    w1.assign_sub(learning_rate * w1_grad)
    w2.assign_sub(learning_rate * w2_grad)
    w3.assign_sub(learning_rate * w3_grad)
    b.assign_sub(learning_rate * b_grad)

    if i % 50 == 0:
      print("{:5} | {:12.4f}".format(i, cost.numpy()))


ModuleNotFoundError: No module named 'tensorflow'

## Multi-variable linear regression (2)

* Matrix 사용

In [1]:
data = np.array([
    # X1,   X2,    X3,   y
    [ 73.,  80.,  75., 152. ],
    [ 93.,  88.,  93., 185. ],
    [ 89.,  91.,  90., 180. ],
    [ 96.,  98., 100., 196. ],
    [ 73.,  66.,  70., 142. ]
], dtype=np.float32)

# slice data
X = data[:, :-1]
y = data[:, [-1]]

W = tf.Variable(tf.random_normal([3, 1]))
b = tf.Variable(tf.random_normal([1]))

learning_rate = 0.000001

# hypothesis, prediction function
def predict(X):
    return tf.matmul(X, W) + b

print("epoch | cost")

n_epochs = 2000
for i in range(n_epochs+1):
    # tf.GradientTape() to record the gradient of the cost function
    with tf.GradientTape() as tape:
        cost = tf.reduce_mean((tf.square(predict(X) - y)))

    # calculates the gradients of the loss
    W_grad, b_grad = tape.gradient(cost, [W, b])

    # updates parameters (W and b)
    W.assign_sub(learning_rate * W_grad)
    b.assign_sub(learning_rate * b_grad)
    
    if i % 100 == 0:
        print("{:5} | {:10.4f}".format(i, cost.numpy()))

NameError: name 'np' is not defined

In [11]:
W.numpy()

array([[ 1.368576 ],
       [ 2.1047728],
       [-1.4229954]], dtype=float32)

In [12]:
b.numpy()

array([-1.1783521], dtype=float32)

In [13]:
tf.matmul(X, W) + b

<tf.Tensor: id=383121, shape=(5, 1), dtype=float32, numpy=
array([[160.38487],
       [178.98064],
       [184.08965],
       [194.17314],
       [138.03304]], dtype=float32)>

## predict

In [14]:
Y # labels, 실제값

[152.0, 185.0, 180.0, 196.0, 142.0]

In [15]:
predict(X).numpy() # prediction, 예측값

array([[160.38487],
       [178.98064],
       [184.08965],
       [194.17314],
       [138.03304]], dtype=float32)

In [16]:
# 새로운 데이터에 대한 예측

predict([[ 89.,  95.,  92.],[ 84.,  92.,  85.]]).numpy() 

array([[189.66275],
       [186.46652]], dtype=float32)