# 04 - Multi-variable Linear Regression

In [1]:
import tensorflow as tf
import numpy as np

# Show the installed TensorFlow version; the outputs recorded in this
# notebook were produced with 2.3.0.
print(tf.__version__)

2.3.0


## Multi-variable linear regression

Predicting exam score - regression using three inputs (x1, x2, x3)
![image](https://user-images.githubusercontent.com/37262132/109262594-2abc6b80-7845-11eb-9fd1-9edc297ee644.png)
Test Scores for General Psychology ( https://goo.gl/g2T8Kp )

## Matrix multiplication

### dot product(=scalar product, 내적)

![image](https://user-images.githubusercontent.com/37262132/109263699-12e5e700-7847-11eb-9ea7-1a4826261842.png)
https://www.mathsisfun.com/algebra/matrix-multiplying.html

## Multi-feature regression

### Hypothesis

![image](https://user-images.githubusercontent.com/37262132/109263180-29d80980-7846-11eb-9287-17be790b7bd3.png)


## Hypothesis using matrix

![image](https://user-images.githubusercontent.com/37262132/109263228-407e6080-7846-11eb-8961-5aef8bc14174.png)
![image](https://user-images.githubusercontent.com/37262132/109263241-48d69b80-7846-11eb-8ad8-26ddc9d0cbb3.png)
(W, X 는 matrix)

## Hypothesis without b

![image](https://user-images.githubusercontent.com/37262132/109263294-5ee45c00-7846-11eb-80bc-87b13fdce65b.png)

### Many x instances

![image](https://user-images.githubusercontent.com/37262132/109263348-70c5ff00-7846-11eb-8ffe-57de2a491cb5.png)

5는 데이터(instance)의 수, 3은 변수(feature)의 수, 1은 결과

## Hypothesis using matrix (n output)

![image](https://user-images.githubusercontent.com/37262132/109263501-b4b90400-7846-11eb-9610-b26ff78ada37.png)
- n은 데이터(instance)의 개수, 2는 결과 값의 개수로 주어진다.
- 이때, W [?, ?] ⇒ [3, 2]

## WX vs XW

### Theory (Lecture) :

![image](https://user-images.githubusercontent.com/37262132/109263557-cf8b7880-7846-11eb-913e-0655d925249d.png)

### TensorFlow (Implementation) :

![image](https://user-images.githubusercontent.com/37262132/109263572-d4502c80-7846-11eb-9218-d36f2df63825.png)

## Simple Example (2 variables)

![image](https://user-images.githubusercontent.com/37262132/109263646-fa75cc80-7846-11eb-888a-b20ae40d3e77.png)

In [15]:
# Fix TensorFlow's global random seed so that the tf.random.* weight
# initialisations in the following cells are repeatable.
tf.random.set_seed(0)

In [7]:
# Toy data set: five samples, two input features (x1, x2) and one target.
x1_data = [1, 0, 3, 0, 5]
x2_data = [0, 2, 0, 4, 0]
y_data = [1, 2, 3, 4, 5]

# Trainable parameters, each a length-1 vector drawn uniformly from [-10, 10).
W1 = tf.Variable(tf.random.uniform((1,), -10.0, 10.0))
W2 = tf.Variable(tf.random.uniform((1,), -10.0, 10.0))
b = tf.Variable(tf.random.uniform((1,), -10.0, 10.0))

learning_rate = tf.Variable(0.001)

n_steps = 1000
log_row = "{:5} | {:10.6f} | {:10.4f} | {:10.4f} | {:10.6f}"

for step in range(n_steps + 1):
    # Record the forward pass so the tape can differentiate the cost.
    with tf.GradientTape() as tape:
        hypothesis = W1 * x1_data + W2 * x2_data + b
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))

    # Plain gradient descent: all gradients are computed first, then every
    # parameter is moved against its own gradient.
    params = [W1, W2, b]
    for param, grad in zip(params, tape.gradient(cost, params)):
        param.assign_sub(learning_rate * grad)

    # Log step, cost (measured before this step's update) and the parameters
    # every 50 steps.
    if step % 50 != 0:
        continue
    print(log_row.format(
        step, cost.numpy(), W1.numpy()[0], W2.numpy()[0], b.numpy()[0]))

    0 | 335.280823 |    -4.0663 |     1.1220 |  -6.065215
   50 |  76.037262 |    -0.8001 |     1.6209 |  -4.978779
  100 |  18.959265 |     0.7151 |     1.8781 |  -4.429109
  150 |   6.310240 |     1.4125 |     2.0104 |  -4.134423
  200 |   3.445082 |     1.7284 |     2.0768 |  -3.961648
  250 |   2.743659 |     1.8667 |     2.1075 |  -3.847750
  300 |   2.525401 |     1.9225 |     2.1184 |  -3.762738
  350 |   2.417754 |     1.9402 |     2.1181 |  -3.692262
  400 |   2.337300 |     1.9403 |     2.1114 |  -3.629400
  450 |   2.264998 |     1.9325 |     2.1008 |  -3.570778
  500 |   2.196329 |     1.9213 |     2.0881 |  -3.514729
  550 |   2.130126 |     1.9085 |     2.0741 |  -3.460409
  600 |   2.066037 |     1.8953 |     2.0595 |  -3.407385
  650 |   2.003917 |     1.8819 |     2.0444 |  -3.355424
  700 |   1.943679 |     1.8686 |     2.0293 |  -3.304398
  750 |   1.885258 |     1.8555 |     2.0141 |  -3.254230
  800 |   1.828595 |     1.8425 |     1.9990 |  -3.204873
  850 |   1.77

## Simple Example (2 variables with Matrix)

In [17]:
# Same toy data as the previous cell, with both features stacked into a single
# (2, 5) matrix: one row per feature, one column per sample.
x_data = [
    [1., 0., 3., 0., 5.],
    [0., 2., 0., 4., 0.]
]
y_data = [1, 2, 3, 4, 5]

# A single (1, 2) weight matrix replaces the separate W1 / W2 variables.
W = tf.Variable(tf.random.uniform((1, 2), -1.0, 1.0))
b = tf.Variable(tf.random.uniform((1,), -1.0, 1.0))

learning_rate = tf.Variable(0.001)

for i in range(1000+1):
    # Only the forward pass belongs inside the tape context.
    with tf.GradientTape() as tape:
        hypothesis = tf.matmul(W, x_data) + b  # (1, 2) x (2, 5) -> (1, 5)
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))

    # Differentiate and update after leaving the context, as the other
    # training cells in this notebook do; the numerical result is unchanged.
    W_grad, b_grad = tape.gradient(cost, [W, b])
    W.assign_sub(learning_rate * W_grad)
    b.assign_sub(learning_rate * b_grad)

    # Log step, cost (measured before this step's update) and the parameters
    # every 50 steps.
    if i % 50 == 0:
        print("{:5} | {:10.6f} | {:10.4f} | {:10.4f} | {:10.6f}".format(
            i, cost.numpy(), W.numpy()[0][0], W.numpy()[0][1], b.numpy()[0]))

    0 |  24.212524 |    -0.5847 |     0.4861 |  -0.641388
   50 |   5.439573 |     0.2771 |     0.7026 |  -0.346325
  100 |   1.247631 |     0.6764 |     0.8270 |  -0.201596
  150 |   0.296814 |     0.8607 |     0.9001 |  -0.129428
  200 |   0.075051 |     0.9451 |     0.9439 |  -0.092641
  250 |   0.020848 |     0.9834 |     0.9705 |  -0.073335
  300 |   0.006614 |     1.0005 |     0.9870 |  -0.062807
  350 |   0.002502 |     1.0080 |     0.9972 |  -0.056776
  400 |   0.001177 |     1.0110 |     1.0035 |  -0.053104
  450 |   0.000704 |     1.0122 |     1.0075 |  -0.050701
  500 |   0.000518 |     1.0125 |     1.0100 |  -0.049002
  550 |   0.000437 |     1.0124 |     1.0115 |  -0.047706
  600 |   0.000397 |     1.0123 |     1.0123 |  -0.046648
  650 |   0.000375 |     1.0121 |     1.0128 |  -0.045734
  700 |   0.000359 |     1.0119 |     1.0131 |  -0.044913
  750 |   0.000346 |     1.0116 |     1.0132 |  -0.044152
  800 |   0.000335 |     1.0114 |     1.0132 |  -0.043432
  850 |   0.00

## Hypothesis without b

In [27]:
# Fold the bias into the weight matrix: the inputs from the previous cell get
# an extra leading row of ones, whose weight plays the role of b.
x_data = [
    [1., 1., 1., 1., 1.],  # constant input for the bias term
    [1., 0., 3., 0., 5.],
    [0., 2., 0., 4., 0.]
]
y_data = [1, 2, 3, 4, 5]

# The weight matrix is now (1, 3) and there is no separate b variable.
W = tf.Variable(tf.random.uniform((1, 3), -1.0, 1.0))

learning_rate = 0.001
optimizer = tf.keras.optimizers.SGD(learning_rate)

for i in range(1001):
    with tf.GradientTape() as tape:
        hypothesis = tf.matmul(W, x_data)  # no "+ b": the bias is W[0][0]
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))

    # Let the optimizer apply the single (gradient, variable) pair.
    W_grad = tape.gradient(cost, W)
    optimizer.apply_gradients([(W_grad, W)])

    # Log step, cost and the three weights every 50 steps.
    if i % 50 == 0:
        print("{:5} | {:10.6f} | {:10.4f} | {:10.4f} | {:10.4f}".format(
            i, cost.numpy(), *W.numpy()[0]))

    0 |  16.573290 |    -0.3897 |     0.4246 |    -0.6859
   50 |   5.920134 |    -0.1282 |     0.7461 |    -0.1044
  100 |   2.262819 |     0.0180 |     0.8799 |     0.2653
  150 |   0.905716 |     0.1012 |     0.9319 |     0.5018
  200 |   0.374822 |     0.1491 |     0.9495 |     0.6539
  250 |   0.160281 |     0.1767 |     0.9535 |     0.7520
  300 |   0.071896 |     0.1923 |     0.9528 |     0.8156
  350 |   0.035031 |     0.2008 |     0.9510 |     0.8570
  400 |   0.019489 |     0.2049 |     0.9493 |     0.8842
  450 |   0.012836 |     0.2063 |     0.9482 |     0.9021
  500 |   0.009906 |     0.2061 |     0.9476 |     0.9140
  550 |   0.008540 |     0.2048 |     0.9474 |     0.9220
  600 |   0.007835 |     0.2029 |     0.9475 |     0.9276
  650 |   0.007411 |     0.2006 |     0.9478 |     0.9315
  700 |   0.007109 |     0.1980 |     0.9483 |     0.9344
  750 |   0.006863 |     0.1954 |     0.9489 |     0.9366
  800 |   0.006642 |     0.1926 |     0.9495 |     0.9383
  850 |   0.00

## Custom Gradient
- tf.keras.optimizers.SGD(): optimizer (TF 2.x replacement for tf.train.GradientDescentOptimizer())
- optimizer.apply_gradients(): update

In [29]:
# Multi-variable linear regression (1)

# Two samples with two features each, and one target value per sample.
X = tf.constant([[1., 2.],
                 [3., 4.]])
y = tf.constant([[1.5], [3.5]])

# Parameters are initialised from tf.random.normal (default mean 0, stddev 1).
W = tf.Variable(tf.random.normal((2, 1)))
b = tf.Variable(tf.random.normal((1,)))

# SGD optimizer that performs the parameter updates.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

n_epoch = 1000+1
trainable = [W, b]

print("epoch | cost")
for epoch in range(n_epoch):
    # Forward pass under the tape so the cost can be differentiated.
    with tf.GradientTape() as tape:
        y_pred = tf.matmul(X, W) + b
        cost = tf.reduce_mean(tf.square(y_pred - y))

    # Gradients of the cost w.r.t. W and b, applied by the optimizer.
    grads = tape.gradient(cost, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    if epoch % 50 == 0:
        print("{:5} | {:10.6f}".format(epoch, cost.numpy()))

epoch | cost
    0 |   0.656688
   50 |   0.047971
  100 |   0.032798
  150 |   0.022425
  200 |   0.015332
  250 |   0.010483
  300 |   0.007167
  350 |   0.004900
  400 |   0.003350
  450 |   0.002291
  500 |   0.001566
  550 |   0.001071
  600 |   0.000732
  650 |   0.000501
  700 |   0.000342
  750 |   0.000234
  800 |   0.000160
  850 |   0.000109
  900 |   0.000075
  950 |   0.000051
 1000 |   0.000035


## Predicting exam score

regression using three inputs (x1, x2, x3)

![image](https://user-images.githubusercontent.com/37262132/109269255-9572a480-784f-11eb-9ee8-8597fac2dc0e.png)

In [38]:
# Reset TensorFlow's global random seed so that the tf.random.normal weight
# initialisations in the cells below are repeatable.
tf.random.set_seed(0)

In [39]:
# Training data: three input features (x1, x2, x3) and the label Y,
# one entry per sample.
x1 = [ 73.,  93.,  89.,  96.,  73.]
x2 = [ 80.,  88.,  91.,  98.,  66.]
x3 = [ 75.,  93.,  90., 100.,  70.]
Y  = [152., 185., 180., 196., 142.]

# Scalar weights and bias, all starting from the constant 10.
w1 = tf.Variable(10.)
w2 = tf.Variable(10.)
w3 = tf.Variable(10.)
b  = tf.Variable(10.)

learning_rate = 1e-6

weights = [w1, w2, w3, b]
for i in range(1001):
    # Forward pass under the tape so the cost can be differentiated.
    with tf.GradientTape() as tape:
        hypothesis = w1 * x1 + w2 * x2 + w3 * x3 + b
        cost = tf.reduce_mean(tf.square(hypothesis - Y))

    # All gradients are computed first, then each parameter takes one
    # gradient-descent step.
    for weight, grad in zip(weights, tape.gradient(cost, weights)):
        weight.assign_sub(learning_rate * grad)

    if i % 50 == 0:
        print("{:5} | {:12.4f}".format(i, cost.numpy()))

    0 | 5793889.5000
   50 |   64291.1484
  100 |     715.2902
  150 |       9.8462
  200 |       2.0152
  250 |       1.9252
  300 |       1.9210
  350 |       1.9177
  400 |       1.9145
  450 |       1.9114
  500 |       1.9081
  550 |       1.9050
  600 |       1.9018
  650 |       1.8986
  700 |       1.8955
  750 |       1.8923
  800 |       1.8892
  850 |       1.8861
  900 |       1.8829
  950 |       1.8798
 1000 |       1.8767


## Multi-variable linear regression (1)

- random 초기화: tf.random.normal()

In [40]:
# Training data: three input features (x1, x2, x3) and the label Y,
# one entry per sample.
x1 = [ 73.,  93.,  89.,  96.,  73.]
x2 = [ 80.,  88.,  91.,  98.,  66.]
x3 = [ 75.,  93.,  90., 100.,  70.]
Y  = [152., 185., 180., 196., 142.]

# Random initial weights: each is a length-1 vector from tf.random.normal.
w1 = tf.Variable(tf.random.normal((1,)))
w2 = tf.Variable(tf.random.normal((1,)))
w3 = tf.Variable(tf.random.normal((1,)))
b = tf.Variable(tf.random.normal((1,)))

# Defined here so the cell does not depend on a value left over from an
# earlier cell; same step size as the constant-initialised run.
learning_rate = 0.000001

for i in range(1000+1):
    # tf.GradientTape() records the forward pass so the cost can be
    # differentiated.
    with tf.GradientTape() as tape:
        hypothesis = w1 * x1 + w2 * x2 + w3 * x3 + b
        cost = tf.reduce_mean(tf.square(hypothesis - Y))

    # Gradients of the cost with respect to every parameter.
    w1_grad, w2_grad, w3_grad, b_grad = tape.gradient(cost, [w1, w2, w3, b])

    # Gradient-descent update of w1, w2, w3 and b.
    w1.assign_sub(learning_rate * w1_grad)
    w2.assign_sub(learning_rate * w2_grad)
    w3.assign_sub(learning_rate * w3_grad)
    b.assign_sub(learning_rate * b_grad)

    if i % 50 == 0:
        print("{:5} | {:12.4f}".format(i, cost.numpy()))

    0 |   11325.9121
   50 |     135.3618
  100 |      11.1817
  150 |       9.7940
  200 |       9.7687
  250 |       9.7587
  300 |       9.7489
  350 |       9.7389
  400 |       9.7292
  450 |       9.7194
  500 |       9.7096
  550 |       9.6999
  600 |       9.6903
  650 |       9.6806
  700 |       9.6709
  750 |       9.6612
  800 |       9.6517
  850 |       9.6421
  900 |       9.6325
  950 |       9.6229
 1000 |       9.6134


## Multi-variable linear regression (2)

- Matrix 사용

In [51]:
# Each row is one sample: three input features followed by the label.
# Columns: X1, X2, X3, y
data = np.asarray(
    [
        [73., 80., 75., 152.],
        [93., 88., 93., 185.],
        [89., 91., 90., 180.],
        [96., 98., 100., 196.],
        [73., 66., 70., 142.],
    ],
    dtype=np.float32,
)

# Split into the feature matrix and a column vector of labels.
X = data[:, :-1]   # every column except the last -> shape (5, 3)
y = data[:, [-1]]  # list index keeps the result 2-D -> shape (5, 1)

# Model parameters drawn from tf.random.normal: a (3, 1) weight matrix
# (one weight per input feature) and a length-1 bias.
W = tf.Variable(tf.random.normal(shape=(3, 1)))
b = tf.Variable(tf.random.normal(shape=(1,)))

# Step size for the manual gradient-descent updates.
learning_rate = 1e-6

# Hypothesis / prediction function of the linear model.
def predict(X):
    """Return the linear model output ``matmul(X, W) + b`` for a batch of inputs.

    Reads the notebook-level variables ``W`` and ``b``. ``X`` is handed
    straight to ``tf.matmul``, so it needs one row per sample and as many
    columns as ``W`` has rows.
    """
    weighted_sum = tf.matmul(X, W)
    return weighted_sum + b

n_epochs = 2000

print("epoch | cost")
for epoch in range(n_epochs + 1):
    # Forward pass under the tape so the cost can be differentiated.
    with tf.GradientTape() as tape:
        residual = predict(X) - y
        cost = tf.reduce_mean(tf.square(residual))

    # Both gradients are computed first, then W and b each take one
    # gradient-descent step.
    for param, grad in zip((W, b), tape.gradient(cost, [W, b])):
        param.assign_sub(learning_rate * grad)

    # Log the cost (measured before this epoch's update) every 100 epochs.
    if epoch % 100 == 0:
        print("{:5} | {:10.4f}".format(epoch, cost.numpy()))

epoch | cost
    0 |  3743.1343
  100 |     3.6005
  200 |     3.1265
  300 |     3.1128
  400 |     3.0994
  500 |     3.0860
  600 |     3.0726
  700 |     3.0593
  800 |     3.0460
  900 |     3.0329
 1000 |     3.0199
 1100 |     3.0069
 1200 |     2.9939
 1300 |     2.9811
 1400 |     2.9682
 1500 |     2.9555
 1600 |     2.9429
 1700 |     2.9303
 1800 |     2.9177
 1900 |     2.9053
 2000 |     2.8929


In [52]:
# Learned weight matrix as a NumPy array (one row per input feature).
W.numpy()

array([[ 1.0821941 ],
       [ 0.9810675 ],
       [-0.05930979]], dtype=float32)

In [53]:
# Learned bias as a NumPy array.
b.numpy()

array([1.3517823], dtype=float32)

In [54]:
# Model output on the training inputs, written out explicitly
# (the same expression that predict(X) evaluates).
tf.matmul(X, W) + b

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[154.3891 ],
       [182.81396],
       [181.60631],
       [195.45604],
       [140.9507 ]], dtype=float32)>

## predict

In [55]:
# Y is the plain Python list of targets defined in the earlier scalar-weight
# cells; it holds the same values as the last column of `data`.
Y  # labels (actual values)

[152.0, 185.0, 180.0, 196.0, 142.0]

In [56]:
predict(X).numpy()  # predictions (model output) for the training inputs

array([[154.3891 ],
       [182.81396],
       [181.60631],
       [195.45604],
       [140.9507 ]], dtype=float32)

In [57]:
# Predictions for new data: two samples with three features each.
predict([[89., 95., 92], [84., 92., 85]]).numpy()

array([[185.41197],
       [177.47295]], dtype=float32)