In [2]:
import tensorflow as tf

2025-07-07 13:35:33.286364: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [28]:
class MyDenseLayer(tf.keras.layers.Layer):
    """Fully connected layer with a sigmoid activation: ``sigmoid(x @ w + b)``.

    Args:
        input_dim: Size of the last axis of the inputs (number of rows of ``w``).
        output_dim: Number of units (number of columns of ``w`` and ``b``).
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).

    Attributes:
        w: Kernel variable of shape ``(input_dim, output_dim)``.
        b: Bias variable of shape ``(1, output_dim)``.
    """

    def __init__(self, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)

        # Pass the shape by keyword: the first positional parameter of
        # ``add_weight`` is not the shape in every Keras version.
        self.w = self.add_weight(shape=(input_dim, output_dim), name="w")
        self.b = self.add_weight(shape=(1, output_dim), name="b")

    def call(self, inputs):
        """Return ``sigmoid(inputs @ w + b)``; ``b`` broadcasts over the batch axis."""
        z = tf.matmul(inputs, self.w) + self.b
        return tf.math.sigmoid(z)

    def set_weights(self, weights):
        """Overwrite the kernel and bias.

        Two layouts are accepted:

        * ``[kernel, bias]`` -- a two-element list/tuple whose first element
          has the kernel's shape ``(input_dim, output_dim)``.
        * a single stacked matrix of shape ``(input_dim + 1, output_dim)``
          whose last row is the bias and whose other rows are the kernel.

        Args:
            weights: Array-like in one of the layouts above.

        Raises:
            ValueError: If the values cannot be matched to the variable shapes
                (raised by the underlying TensorFlow conversion/assignment).
        """
        kernel_shape = tuple(self.w.shape)
        bias_shape = tuple(self.b.shape)

        kernel = bias = None
        if isinstance(weights, (list, tuple)) and len(weights) == 2:
            candidate = tf.convert_to_tensor(weights[0])
            # Only treat the pair as [kernel, bias] when the first element
            # really is kernel-shaped; otherwise it is a two-row stacked matrix.
            if tuple(candidate.shape) == kernel_shape:
                kernel, bias = candidate, tf.convert_to_tensor(weights[1])

        if kernel is None:
            stacked = tf.convert_to_tensor(weights)
            kernel, bias = stacked[:-1], stacked[-1]

        self.w.assign(tf.cast(kernel, self.w.dtype))
        # The bias variable is 2-D (1, output_dim); a bare row must be reshaped.
        self.b.assign(tf.reshape(tf.cast(bias, self.b.dtype), bias_shape))

In [5]:
# Push one 3-feature sample through a freshly initialised 3 -> 2 layer.
input_dim, output_dim = 3, 2
layer = MyDenseLayer(input_dim, output_dim)

inputs = tf.constant([[1.0, 2.0, 3.0]])
outputs = layer(inputs)
print(outputs.numpy())

[[0.8867185 0.9546766]]


In [27]:
class SequentialModel(tf.keras.Model):
    """Model that feeds its input through a list of layers, one after another.

    Args:
        layers: Iterable of callables (layers) applied in order by ``call``.

    NOTE(review): ``tf.keras.Model`` already exposes ``layers`` as a property;
    the ``layers`` method below shadows it, so ``model.layers`` is a bound
    method rather than a list -- confirm this is intended.
    """

    def __init__(self, layers):
        super().__init__()
        self.model_layers = layers

    def layers(self):
        """Return the layers stored at construction time."""
        return self.model_layers

    def call(self, inputs):
        """Apply every stored layer in order and return the final output."""
        activations = inputs
        for sublayer in self.model_layers:
            activations = sublayer(activations)
        return activations

In [13]:
# Stack a 3 -> 4 layer and a 4 -> 3 layer, then run one sample through both.
layers = [MyDenseLayer(3, 4), MyDenseLayer(4, 3)]
model = SequentialModel(layers)

inputs = tf.constant([[1.0, 2.0, 3.0]])
outputs = model(inputs)
print(outputs.numpy())

[[0.70461786 0.30614072 0.6488198 ]]


In [14]:
def backprop(inputs, outputs, model, loss_fn, learning_rate=0.01, tol=1e-6,
             max_steps=None):
    """Train ``model`` with plain SGD until the gradients (almost) vanish.

    Args:
        inputs: Batch fed to ``model``.
        outputs: Targets passed to ``loss_fn`` as its first argument.
        model: Callable exposing ``trainable_variables``.
        loss_fn: Callable ``loss_fn(targets, predictions)`` returning the loss.
        learning_rate: SGD step size.
        tol: Stop once the summed absolute value of all gradients is below this.
        max_steps: Optional cap on the number of updates; ``None`` (default)
            loops until the tolerance is reached.

    Returns:
        The gradients from the last update step, ordered like
        ``model.trainable_variables`` (an empty list if no step was run).
    """
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    gradients = []
    step = 0
    while max_steps is None or step < max_steps:
        # The forward pass must run inside the tape; operations executed
        # outside it are not recorded and would yield ``None`` gradients.
        with tf.GradientTape() as tape:
            predictions = model(inputs)
            loss = loss_fn(outputs, predictions)

        variables = model.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        step += 1

        # Gradients have different shapes, so reduce each one separately
        # before adding the scalars together.
        total_abs_grad = sum(
            float(tf.reduce_sum(tf.abs(grad)))
            for grad in gradients
            if grad is not None
        )
        if total_abs_grad < tol:
            break
    return gradients

#### Two-layer SGD

In [19]:
import numpy as np
# XOR truth table: each row of X is an input pair, y holds the XOR of that pair.
X = np.array([(first, second) for first in (0, 1) for second in (0, 1)])
y = np.array([0, 1, 1, 0]).reshape(-1, 1)

# Step size and uniformly initialised weights in [0, 1) for a 2-2-1 network.
lr = 0.01
weights_input_hidden = np.random.uniform(size=(2, 2))
weights_hidden_output = np.random.uniform(size=(2, 1))

In [23]:
# Sigmoid activation function
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [24]:
def sigmoid_derivative(x):
    """Derivative of the logistic function at ``x``: ``s * (1 - s)`` with ``s = sigmoid(x)``.

    ``x`` is the pre-activation value; the sigmoid is applied here.
    """
    activation = sigmoid(x)
    return activation * (1 - activation)

In [None]:
def manual_SGD(final_output, weights_hidden_output, hidden_output=None):
    """Back-propagate the output error to the input-to-hidden weights.

    Reads the module-level training data ``X`` (inputs) and ``y`` (targets).

    Args:
        final_output: Network predictions, i.e. the output *after* the sigmoid.
        weights_hidden_output: Hidden-to-output weight matrix.
        hidden_output: Optional hidden-layer activations (after the sigmoid).
            When given, the propagated error is also scaled by the hidden
            layer's sigmoid derivative; when omitted that factor is left out.

    Returns:
        ``X.T @ error_hidden_layer``. Because the error is ``y - final_output``
        this points in the descent direction, so it is meant to be *added* to
        the input-to-hidden weights (scaled by a learning rate).
    """
    error = y - final_output

    # ``final_output`` is already activated, so the sigmoid derivative is
    # a * (1 - a); running it through the sigmoid again would be wrong.
    d_predicted_output = error * final_output * (1 - final_output)
    error_hidden_layer = d_predicted_output.dot(weights_hidden_output.T)
    if hidden_output is not None:
        error_hidden_layer = error_hidden_layer * hidden_output * (1 - hidden_output)
    return X.T.dot(error_hidden_layer)

In [30]:
def binary_cross_entropy(y_true, y_pred):
    """Mean binary cross-entropy between targets and predicted probabilities.

    Predictions are clipped to ``[1e-15, 1 - 1e-15]`` so the logarithms stay finite.
    """
    epsilon = 1e-15
    clipped = np.clip(y_pred, epsilon, 1 - epsilon)
    positive_term = y_true * np.log(clipped)
    negative_term = (1 - y_true) * np.log(1 - clipped)
    return -np.mean(positive_term + negative_term)


def binary_cross_entropy_derivative(y_true, y_pred):
    """Return ``y_pred - y_true`` element-wise.

    For ``y_pred = sigmoid(z)`` this is the per-sample gradient of the binary
    cross-entropy with respect to the pre-activation ``z``.
    """
    difference = y_pred - y_true
    return difference

In [32]:
def train_two_layer(X, y, hidden_size=4, lr=0.1, epochs=1000, seed=0,
                    log_every=100):
    """Train a two-layer sigmoid network with full-batch gradient descent.

    The network is ``sigmoid(sigmoid(X @ W1 + b1) @ W2 + b2)`` and is trained
    on the binary cross-entropy loss. Gradients are summed (not averaged)
    over the batch.

    Args:
        X: Input matrix of shape ``(batch, input_size)``.
        y: Targets of shape ``(batch, 1)``.
        hidden_size: Number of hidden units.
        lr: Learning rate.
        epochs: Number of full-batch updates.
        seed: Seed passed to ``np.random.seed`` before the weights are drawn.
            Note that this reseeds NumPy's *global* random state.
        log_every: Print the loss every ``log_every`` epochs; a falsy value
            disables logging.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with the trained parameters.
    """
    input_size = X.shape[1]
    output_size = 1

    # Initialize weights (normal) and biases (zero).
    np.random.seed(seed)
    W1 = np.random.randn(input_size, hidden_size)
    b1 = np.zeros((1, hidden_size))

    W2 = np.random.randn(hidden_size, output_size)
    b2 = np.zeros((1, output_size))

    for epoch in range(epochs):
        # Forward pass.
        z1 = np.dot(X, W1) + b1
        a1 = sigmoid(z1)

        z2 = np.dot(a1, W2) + b2
        a2 = sigmoid(z2)
        loss = binary_cross_entropy(y, a2)

        # Output layer: for sigmoid + cross-entropy, dL/dz2 = a2 - y.
        dz2 = binary_cross_entropy_derivative(y, a2)  # (batch, 1)
        dW2 = np.dot(a1.T, dz2)  # (hidden, 1)
        db2 = np.sum(dz2, axis=0, keepdims=True)

        # Hidden layer. ``a1`` is already sigmoid(z1), so its derivative is
        # a1 * (1 - a1); applying the sigmoid to ``a1`` again would be wrong.
        da1 = np.dot(dz2, W2.T)  # (batch, hidden)
        dz1 = da1 * a1 * (1 - a1)  # (batch, hidden)
        dW1 = np.dot(X.T, dz1)  # (input, hidden)
        db1 = np.sum(dz1, axis=0, keepdims=True)

        # Gradient-descent update.
        W2 -= lr * dW2
        b2 -= lr * db2
        W1 -= lr * dW1
        b1 -= lr * db1

        # Optional logging
        if log_every and epoch % log_every == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")

    return W1, b1, W2, b2