In [5]:
import pandas as pd
import numpy as np

# Load the feature matrices and convert them straight to NumPy arrays.
x_train = pd.read_csv(r"C:/Users/manha/Downloads/train_X.csv").to_numpy()
x_test = pd.read_csv(r"C:/Users/manha/Downloads/test_X.csv").to_numpy()

# Labels are cast to int before conversion.
# NOTE(review): the rest of the script treats labels as one-hot rows
# (samples x classes) -- confirm against the CSV contents.
y_train = (
    pd.read_csv(r"C:/Users/manha/Downloads/train_label.csv")
    .astype(int)
    .to_numpy()
)
y_test = (
    pd.read_csv(r"C:/Users/manha/Downloads/test_label.csv")
    .astype(int)
    .to_numpy()
)

def initialize_parameters(layer_dims):
    """Create the weight and bias arrays for every layer of the network.

    The global NumPy random generator is re-seeded with 1 on every call, so
    repeated calls with the same ``layer_dims`` return identical values.

    Args:
        layer_dims: Sequence of layer sizes, input layer first.

    Returns:
        Dict with ``'W<l>'`` of shape ``(layer_dims[l], layer_dims[l-1])``
        (standard-normal samples scaled by 0.01) and ``'b<l>'`` of shape
        ``(layer_dims[l], 1)`` (zeros) for ``l = 1 .. len(layer_dims) - 1``.
    """
    np.random.seed(1)
    parameters = {}
    # Walk over consecutive (previous size, current size) pairs.
    size_pairs = zip(layer_dims, layer_dims[1:])
    for idx, (n_prev, n_curr) in enumerate(size_pairs, start=1):
        parameters[f"W{idx}"] = 0.01 * np.random.randn(n_curr, n_prev)
        parameters[f"b{idx}"] = np.zeros((n_curr, 1))
    return parameters

def relu(Z):
    """Return the element-wise rectified linear unit ``max(0, Z)``."""
    floor = 0
    rectified = np.maximum(floor, Z)
    return rectified

def softmax(Z):
    """Apply softmax along axis 0, so each column sums to one.

    The per-column maximum is subtracted before exponentiating, which keeps
    ``np.exp`` from overflowing without changing the result.
    """
    column_max = np.max(Z, axis=0, keepdims=True)
    exponentials = np.exp(Z - column_max)
    normaliser = exponentials.sum(axis=0, keepdims=True)
    return exponentials / normaliser

def forward_propagation(X, parameters):
    """Run the network forward: ReLU hidden layers, softmax output layer.

    Args:
        X: Input array; it is transposed before the first layer, so rows of
            ``X`` become columns of every activation.
        parameters: Dict holding ``'W<l>'`` and ``'b<l>'`` for each layer.

    Returns:
        Tuple ``(AL, caches)`` where ``AL`` is the softmax output and
        ``caches`` maps ``'A<l>'`` / ``'Z<l>'`` to the activation and
        pre-activation of every layer ``l``.
    """
    num_layers = len(parameters) // 2
    caches = {}
    activation = X.T

    # Hidden layers 1 .. L-1 use ReLU.
    for idx in range(1, num_layers):
        pre_activation = (
            parameters[f"W{idx}"].dot(activation) + parameters[f"b{idx}"]
        )
        activation = relu(pre_activation)
        caches[f"A{idx}"] = activation
        caches[f"Z{idx}"] = pre_activation

    # The final layer uses softmax.
    output_pre_activation = (
        parameters[f"W{num_layers}"].dot(activation)
        + parameters[f"b{num_layers}"]
    )
    output = softmax(output_pre_activation)
    caches[f"A{num_layers}"] = output
    caches[f"Z{num_layers}"] = output_pre_activation

    return output, caches

# Step 4: Compute cost
def compute_cost(AL, Y):
    """Return the mean cross-entropy between predictions and targets.

    Args:
        AL: Predicted probabilities; must be multiplicable element-wise with
            ``Y.T``.
        Y: Targets with one row per sample (``Y.shape[0]`` is used as the
            sample count).

    Returns:
        The cost, passed through ``np.squeeze``. A constant 1e-8 is added
        inside the logarithm so that a probability of zero does not yield
        ``-inf``.
    """
    num_samples = Y.shape[0]
    log_probabilities = np.log(AL + 1e-8)
    weighted_sum = np.sum(Y.T * log_probabilities)
    return np.squeeze(-weighted_sum / num_samples)

# Step 5: Backward propagation
def relu_derivative(Z):
    """Return a mask that is true where ``Z`` is strictly positive.

    Multiplying by this mask applies the ReLU gradient (1 where ``Z > 0``,
    0 elsewhere).
    """
    is_positive = Z > 0
    return is_positive

def backward_propagation(X, Y, caches, parameters):
    """Compute the gradients of the cost with respect to every W and b.

    The output-layer error is taken as ``A_L - Y.T``, which is the gradient
    of a softmax output combined with a cross-entropy cost. Hidden layers
    are back-propagated through ReLU via ``relu_derivative``.

    Works for any number of layers, including a single-layer network: the
    input to layer 1 is always ``X.T`` (it is not stored in ``caches``).

    Args:
        X: Input array with one sample per row (``X.shape[0]`` is used as
            the sample count).
        Y: Targets with one row per sample; ``Y.T`` must match the shape of
            the output activation.
        caches: Dict with ``'A<l>'`` and ``'Z<l>'`` for every layer, as
            returned by ``forward_propagation``.
        parameters: Dict with ``'W<l>'`` and ``'b<l>'`` for every layer.

    Returns:
        Dict with ``'dW<l>'`` and ``'db<l>'`` for every layer ``l``.
    """
    grads = {}
    L = len(parameters) // 2
    m = X.shape[0]
    inputs = X.T

    # Error at the output layer.
    dZ = caches['A' + str(L)] - Y.T

    for l in range(L, 0, -1):
        # Layer 1 is fed by the raw inputs; deeper layers by the cached
        # activation of the layer before them.
        A_prev = inputs if l == 1 else caches['A' + str(l - 1)]
        grads['dW' + str(l)] = dZ.dot(A_prev.T) / m
        grads['db' + str(l)] = np.sum(dZ, axis=1, keepdims=True) / m

        if l > 1:
            # Push the error back through W_l and the ReLU of layer l-1.
            dA_prev = parameters['W' + str(l)].T.dot(dZ)
            dZ = dA_prev * relu_derivative(caches['Z' + str(l - 1)])

    return grads

def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    The arrays stored in ``parameters`` are modified in place, and the same
    dict is returned.

    Args:
        parameters: Dict with ``'W<l>'`` and ``'b<l>'`` for every layer.
        grads: Dict with the matching ``'dW<l>'`` and ``'db<l>'`` entries.
        learning_rate: Step size multiplied into each gradient.

    Returns:
        The ``parameters`` dict that was passed in.
    """
    num_layers = len(parameters) // 2
    for idx in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = f"{prefix}{idx}"
            parameters[key] -= learning_rate * grads["d" + key]
    return parameters

def model(X, Y, layer_dims, learning_rate=0.01, num_epochs=1000):
    """Train the network with full-batch gradient descent.

    Each epoch runs a forward pass over all of ``X``, computes the cost,
    back-propagates, and updates the parameters. The cost is printed every
    10th epoch and is the value measured before that epoch's update.

    Args:
        X: Training inputs.
        Y: Training targets.
        layer_dims: Layer sizes passed to ``initialize_parameters``.
        learning_rate: Step size for ``update_parameters``.
        num_epochs: Number of passes over the data.

    Returns:
        The trained parameter dict.
    """
    parameters = initialize_parameters(layer_dims)

    for epoch in range(num_epochs):
        # Forward pass and loss.
        probabilities, cache = forward_propagation(X, parameters)
        cost = compute_cost(probabilities, Y)

        # Backward pass and gradient step.
        gradients = backward_propagation(X, Y, cache, parameters)
        parameters = update_parameters(parameters, gradients, learning_rate)

        if epoch % 10 != 0:
            continue
        print(f"Cost after epoch {epoch}: {cost:.4f}")

    return parameters

def predict(X, parameters):
    """Return the index of the highest-probability class for each sample.

    Runs ``forward_propagation`` and takes the arg-max over axis 0 of the
    output activations.
    """
    probabilities = forward_propagation(X, parameters)[0]
    return np.argmax(probabilities, axis=0)

def accuracy(predictions, Y):
    """Return the fraction of predictions that match the targets.

    Args:
        predictions: Predicted class indices, one per sample.
        Y: Targets with one row per sample; the true class is taken as the
            arg-max of each row.

    Returns:
        Mean of the element-wise equality, between 0 and 1.
    """
    expected = np.argmax(Y, axis=1)
    is_correct = predictions == expected
    return np.mean(is_correct)



# Architecture: 784 inputs, two hidden layers of 256 units, 10 outputs.
layer_dims = [784, 256, 256, 10]

# Train on the full training set.
parameters = model(
    x_train,
    y_train,
    layer_dims,
    learning_rate=0.005,
    num_epochs=100,
)

# Predicted class indices for both splits.
train_preds = predict(x_train, parameters)
test_preds = predict(x_test, parameters)

# Report accuracy as a percentage with two decimals.
train_accuracy = accuracy(train_preds, y_train)
print(f"Train accuracy: {train_accuracy*100:.2f}")
test_accuracy = accuracy(test_preds, y_test)
print(f"Test accuracy: {test_accuracy*100:.2f}")


Cost after epoch 0: 2.3310
Cost after epoch 10: 1.2853
Cost after epoch 20: 0.7105
Cost after epoch 30: 0.5306
Cost after epoch 40: 0.4833
Cost after epoch 50: 0.3453
Cost after epoch 60: 0.4642
Cost after epoch 70: 0.2337
Cost after epoch 80: 0.1974
Cost after epoch 90: 0.1694
Train accuracy: 97.50
Test accuracy: 87.39
