In [1]:
#ID : 20200090 + 20200232 + 20200516
#
# S: 5
import numpy as np
from keras.datasets import mnist

#load
(X_train, y_train), (X_test, y_test) = mnist.load_data()

#Standardize
std=np.std(X_train, axis=0)
std=np.where(std==0,1,std)
X_train=(X_train - np.mean(X_train,axis=0))/std

std1=np.std(X_test, axis=0)
std1=np.where(std1==0,1,std1)
X_test=(X_test - np.mean(X_test,axis=0))/std1
# reshape
X_train = X_train.reshape(-1, 28*28)
X_test = X_test.reshape(-1, 28*28)

# Divide data 
def train_val_split(X_train,y_train,split_ratio =0.8):
    np.random.seed(1)
    shuffle_indices = np.random.permutation(len(y_train))
    X_shuffled = X_train[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    split_index = int(len(y_train) * split_ratio)
    # Split the data
    X_train = X_shuffled[:split_index]
    y_train = y_shuffled[:split_index]
    X_val = X_shuffled[split_index:]
    y_val = y_shuffled[split_index:]

    return X_train, X_val, y_train, y_val
X_train, X_val, y_train, y_val =train_val_split(X_train,y_train, 0.8)

num_classes = 10
"""
#one hot vector
y_train_encoded = np.eye(num_classes)[y_train.astype(int)]
y_val_encoded = np.eye(num_classes)[y_val.astype(int)]
y_test_encoded = np.eye(num_classes)[y_test.astype(int)]
"""

# implement one hot vector
def one_hot_encode(labels, num_classes):
    num_samples = len(labels)
    encoded_labels = np.zeros((num_samples, num_classes))
    encoded_labels[np.arange(num_samples), labels] = 1
    return encoded_labels
y_train_encoded = one_hot_encode(y_train, num_classes)
y_val_encoded = one_hot_encode(y_val, num_classes)
y_test_encoded = one_hot_encode(y_test, num_classes)


def sigmoid(z):
    return 1 / (1 + np.exp(-z))
def sigmoid_derivative(x):
    return x * (1 - x)

def mse_loss(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def initialize_weights(layer_sizes):
    np.random.seed(1)
    w = []
    b = []
    for i in range(1, len(layer_sizes)):
        input_size = layer_sizes[i - 1]
        output_size = layer_sizes[i]
        w_matrix = np.random.randn(input_size, output_size)
        b_matrix = np.random.randn(output_size)
        w.append(w_matrix)
        b.append(b_matrix)
    return w, b

def softmax(x):
    exp_scores = np.exp(x)
    probabilities = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return probabilities

def forward(X, w,b):
    a = [X] #a[0] for layer zero
    for i in range(len(w)):#num_layers
        phiz = sigmoid(np.dot(a[i], w[i]) + b[i])
        a.append(phiz)
    return a

def backward(X, y_true, a, w, b):
    num_layers = len(w)
    num_samples = X.shape[0]
    dw = []
    db = []
    error = y_true - a[-1] #at the last layer = dL_da
    dz = error * sigmoid_derivative(a[-1]) # dL_dz = (dL_da * da_dz)
    dw.insert(0, np.dot(a[-2].T, dz)) #d_w=(dL_dz * x)   respect to   # x=dz_dw
    db.insert(0, np.sum(dz, axis=0))
    # use chain rule to back
    for i in range(num_layers - 2, -1, -1):
        error = np.dot(dz, w[i + 1].T) # dz_da
        dz = error * sigmoid_derivative(a[i + 1])
        dw.insert(0, np.dot(a[i].T, dz))
        db.insert(0, np.sum(dz, axis=0))

    return dw, db



def update_weights(w, b, dw, db, lr):
    num_layers = len(w)
    for i in range(num_layers):
        w[i] += lr * dw[i]
        b[i] += lr * db[i]
    return w, b

def train(X, y, num_of_layers, size_of_layers, lr, num_epochs, batch_size, Xy_val=None):
    w, b = initialize_weights(size_of_layers)
    num_batches = X.shape[0] // batch_size

    for epoch in range(num_epochs):
        for batch in range(num_batches):
            start_idx = batch * batch_size
            end_idx = start_idx + batch_size
            X_batch = X[start_idx:end_idx]
            y_batch = y[start_idx:end_idx]

            # Forward 
            a = forward(X_batch, w, b)

            # Backward
            dw, db = backward(X_batch, y_batch, a, w, b)

            # Update w and b
            w, b = update_weights(w, b, dw, db, lr)

        # Compute training loss
        train_yp = forward(X, w, b) #a=yp
        train_loss = mse_loss(y, train_yp[-1])

        if Xy_val:
            # Compute validation loss
            val_yp = forward(Xy_val[0], w, b)
            val_loss = mse_loss(Xy_val[1], val_yp[-1])
            print(f"Epoch {epoch + 1}/{num_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")
        else:
            print(f"Epoch {epoch + 1}/{num_epochs} - Train Loss: {train_loss:.4f}")

    return w, b

def NN(x, y, num_of_layers, size_of_layers):
    num_features = x.shape[1]
    layer_sizes = [num_features] + size_of_layers #[784, size_of_layers]
    print(layer_sizes)
    w_final,b_final = train(x, y, num_of_layers, layer_sizes, lr=0.1, num_epochs=10, batch_size=32, Xy_val=(X_val, y_val_encoded))

    # Testing
    test_yp = forward(X_test, w_final,b_final)
    test_loss = mse_loss(y_test_encoded, test_yp[-1])
    #accuracy = np.mean(y_test_encoded== test_yp[-1])
    accuracy = 1 - test_loss
    
    #test_yp[-1] =softmax(test_yp[-1])
    test_yp[-1]=np.argmax(test_yp[-1], axis=1)
    print("y_true")
    print(np.argmax(y_test_encoded, axis=1))
    print("Y_pred")
    print(test_yp[-1])
    test_yp[-1]=np.eye(10)[test_yp[-1]]
    print("Y_pred_encoded")
    print(test_yp[-1])
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Accuracy: {accuracy:.2%}")
    return accuracy


architectures = [
    (2, [20,10 ], "Architecture 1"),
    (3, [20, 15,10], "Architecture 2"),
    (3, [15, 20,10], "Architecture 3")
]

for num_layers, layer_sizes, architecture_name in architectures:
    print(f"\n{architecture_name}")
    accuracy = NN(X_train, y_train_encoded, num_layers, layer_sizes)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz

Architecture 1
[784, 20, 10]


  return 1 / (1 + np.exp(-z))


Epoch 1/10 - Train Loss: 0.0280 - Val Loss: 0.0289
Epoch 2/10 - Train Loss: 0.0204 - Val Loss: 0.0221
Epoch 3/10 - Train Loss: 0.0177 - Val Loss: 0.0197
Epoch 4/10 - Train Loss: 0.0162 - Val Loss: 0.0184
Epoch 5/10 - Train Loss: 0.0151 - Val Loss: 0.0175
Epoch 6/10 - Train Loss: 0.0143 - Val Loss: 0.0170
Epoch 7/10 - Train Loss: 0.0138 - Val Loss: 0.0167
Epoch 8/10 - Train Loss: 0.0132 - Val Loss: 0.0161
Epoch 9/10 - Train Loss: 0.0128 - Val Loss: 0.0160
Epoch 10/10 - Train Loss: 0.0124 - Val Loss: 0.0158
y_true
[7 2 1 ... 4 5 6]
Y_pred
[7 2 1 ... 4 5 6]
Y_pred_encoded
[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Test Loss: 0.0149
Accuracy: 98.51%

Architecture 2
[784, 20, 15, 10]
Epoch 1/10 - Train Loss: 0.0261 - Val Loss: 0.0273
Epoch 2/10 - Train Loss: 0.0192 - Val Loss: 0.0206
Epoch 3/10 - Train Loss: 0.0168 - Val Loss: 0.0185
Epoch 4/10 - Train Loss: 0.0152 - Val Loss: 0.

Architecture 2 have higher Accuracy than Architecture 3 :
Because of this reason

#1-Dimensionality reduction: 
By reducing the number of nodes in a layer, you effectively reduce the dimensionality of the representation in that layer. This can help in compressing the information and extracting the most relevant features, allowing for a more compact and efficient representation of the data.

#2-Computational efficiency: 
A smaller layer requires fewer computations compared to a larger layer. This can lead to faster training and inference times, especially when dealing with large-scale neural networks or limited computational resources.

#3-Improved generalization: 
With a smaller layer, the model is encouraged to learn more abstract and generalizable representations of the data. This can enhance the model's ability to generalize well to unseen data and perform better on test or validation datasets.

In [None]:
#cross entropy in backward

def backward(X, y_true, a, w, b):
    num_layers = len(w)
    num_samples = X.shape[0]
    dw = []
    db = []
    error =(- y_true / a[-1])+((1 - y_true) / (1 - a[-1])) #at the last layer = dL_da
    dz = error * sigmoid_derivative(a[-1]) # dL_dz = (dL_da * da_dz)
    dw.insert(0, np.dot(a[-2].T, dz)) #d_w=(dL_dz * x)   respect to   # x=dz_dw
    db.insert(0, np.sum(dz, axis=0))
    # use chain rule to back
    for i in range(num_layers - 2, -1, -1):
        error = np.dot(dz, w[i + 1].T) # dz_da
        dz = error * sigmoid_derivative(a[i + 1])
        dw.insert(0, np.dot(a[i].T, dz))
        db.insert(0, np.sum(dz, axis=0))

    return dw, db

