# Fully-connected layer

In [1]:
import tensorflow as tf
                                                           # (0) input
# Features: one row per sample, one column per feature (x1, x2, x3).
X = tf.constant([
    [1.0, 2.0, 3.0],    # sample 1
    [4.0, 5.0, 6.0],    # sample 2
])
# Targets: one row per sample, a single value y each.
y = tf.constant([
    [10.0],             # sample 1
    [20.0],             # sample 2
])


class Linear(tf.keras.Model):
    """(1) Linear model: y = w1*x1 + w2*x2 + w3*x3 + b.

    The model wraps a single fully-connected (Dense) layer with one output
    unit and no activation, so its output is a plain affine function of the
    input features. Weights and bias both start at zero.
    """

    def __init__(self):
        super().__init__()

        # Fully-connected layer holding the model's parameters.
        self.dense = tf.keras.layers.Dense(
            units=1,                                    # output dimension: one predicted value per sample
            activation=None,                            # no activation function -> output stays linear
            kernel_initializer=tf.zeros_initializer(),  # weight vector w = [w1, w2, w3] starts at zero
            bias_initializer=tf.zeros_initializer(),    # bias b starts at zero
        )

    def call(self, input):
        """Run the forward pass.

        `input` is the batch of samples to predict for (in this example, X
        with two samples); the result is the dense layer applied to it.
        """
        return self.dense(input)


model = Linear()                                            # (2) create an instance of Linear()
optimizer = tf.keras.optimizers.SGD(learning_rate = 0.01)   # (3) define an optimizer: SGD
for i in range(100):                                        # (4) train for 100 epochs (each step uses the whole batch X)
    with tf.GradientTape() as tape:                         # (5) record the forward pass so it can be differentiated
        y_pred = model(X)                                   #     forward pass: y_pred = model(X)
        loss = tf.reduce_mean(tf.square(y_pred - y))        #     loss function: Mean Squared Error (MSE)

    # Differentiate and update the *trainable* variables only. For this model
    # they are exactly [kernel, bias], but `model.variables` would also include
    # any non-trainable variables, for which the tape returns a None gradient.
    grads = tape.gradient(loss, model.trainable_variables)  # (6) gradients of loss w.r.t. [w1, w2, w3, b]

    optimizer.apply_gradients(grads_and_vars =
                              zip(grads, model.trainable_variables))  # (7) update parameters [w1, w2, w3, b]

print(model.variables)                                      # (8) output [w1, w2, w3, b]

[<tf.Variable 'linear/dense/kernel:0' shape=(3, 1) dtype=float32, numpy=
array([[0.40784496],
       [1.191065  ],
       [1.9742855 ]], dtype=float32)>, <tf.Variable 'linear/dense/bias:0' shape=(1,) dtype=float32, numpy=array([0.78322077], dtype=float32)>]


In [2]:
# Trained kernel values copied from the In [1] output: one single-element row
# per input feature (x1, x2, x3), i.e. a 3x1 nested list.
W = [[weight] for weight in (0.40784496, 1.19106500, 1.97428550)]
W

[[0.40784496], [1.191065], [1.9742855]]

In [3]:
# Trained bias value copied from the `linear/dense/bias:0` variable printed in the In [1] output.
b = [0.78322077]
b

[0.78322077]

**Result:**

$$y = XW + b = 0.40784496 \cdot x_1 + 1.19106500 \cdot x_2 + 1.97428550 \cdot x_3 + 0.78322077$$

Reference: https://tf.wiki/en/basic/models.html#a-basic-example-multilayer-perceptron-mlp