In [1]:
import torch
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np

In [2]:
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
trainset = datasets.MNIST('', download=True, train=True, transform=transform)
testset = datasets.MNIST('', download=True, train=False, transform=transform)

In [3]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=10, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=10, shuffle=True)

Here we are performing mini-batch gradient with batch size of 10.

Below is the code for tanh activation function with epoch size of 25, 50, 100, 150 and fluctuating hidden layer size with number of neurons as 25, 50, 100, 150.

In [4]:
learning_rate = 0.001
epochs_list = [25, 50, 100, 150]
hidden_sizes = [25, 50, 100, 150]

In [5]:
input_size = 784
output_size = 10 

In [6]:
def forward(input, w1, w2, activation='tanh'):
    h = input.mm(w1)
    if activation == 'tanh':
        h_activation = h.tanh()
    elif activation == 'relu':
        h_activation = h.clamp(min=0)
    y_pred = h_activation.mm(w2)
    y_pred_sf = y_pred.softmax(dim=1)
    return y_pred_sf, h_activation

In [7]:
def backward(grad_output, h_activation, w2, activation='tanh'):
    if activation == 'tanh':
        grad_h = grad_output.mm(w2.t()) * (1 - h_activation.tanh() ** 2)
    elif activation == 'relu':
        grad_h = grad_output.mm(w2.t()) * (h_activation > 0).float()
    return grad_h

In [None]:
for activation_function in ['tanh', 'relu']:
    all_accuracies_epoch = []

    for hidden_size in hidden_sizes:
        accuracies_epoch_for_hidden = []

        for epochs in epochs_list:
            # Initialize weights
            w1 = torch.randn(input_size, hidden_size, requires_grad=True)
            w2 = torch.randn(hidden_size, output_size, requires_grad=True)

            for epoch in range(epochs):
                # Training loop
                for images, labels in trainloader:
                    images = images.view(images.shape[0], -1)
                    y_pred_sf, h_activation = forward(images, w1, w2, activation=activation_function)
                    loss = -torch.log(y_pred_sf[range(images.shape[0]), labels]).mean()

                    # Backward pass
                    grad_y_pred = y_pred_sf.clone()
                    grad_y_pred[range(images.shape[0]), labels] -= 1
                    grad_w2 = h_activation.t().mm(grad_y_pred)
                    grad_h = backward(grad_y_pred, h_activation, w2, activation=activation_function)
                    grad_w1 = images.t().mm(grad_h)

                    # Update weights
                    with torch.no_grad():
                        w1 = w1 - learning_rate * grad_w1
                        w2 = w2 - learning_rate * grad_w2

                # Check for every 50 epochs to print the progress
                if (epoch + 1) % 10 == 0 or epoch == 0 or epoch + 1 == epochs:
                    print(f'Activation: {activation_function}, Epoch: {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')


            # Evaluation loop
            correct = 0
            total = 0
            for images, labels in testloader:
                images = images.view(images.shape[0], -1)
                y_pred_sf, _ = forward(images, w1, w2, activation=activation_function)
                predictions = y_pred_sf.argmax(dim=1)
                total += labels.size(0)
                correct += (predictions == labels).sum().item()
            
            accuracy = (correct / total) * 100
            accuracies_epoch_for_hidden.append(accuracy)
        
        all_accuracies_epoch.append(accuracies_epoch_for_hidden)
    
    # Plot accuracy vs. number of epochs for different hidden layer sizes
    plt.figure(figsize=(12, 6))
    for i, hidden_size in enumerate(hidden_sizes):
        plt.plot(epochs_list, all_accuracies_epoch[i], marker='o', label=f'Hidden Layers = {hidden_size}')
    plt.xlabel('Number of Epochs')
    plt.ylabel('Accuracy (%)')
    plt.title(f'Accuracy vs. Number of Epochs for Different Hidden Layer Sizes using {activation_function.upper()} activation')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot accuracy vs. number of hidden layers for different epochs
    plt.figure(figsize=(12, 6))
    for i, epoch in enumerate(epochs_list):
        plt.plot(hidden_sizes, [all_accuracies_epoch[j][i] for j in range(len(hidden_sizes))], marker='o', label=f'Epochs = {epoch}')
    plt.xlabel('Number of Hidden Layers')
    plt.ylabel('Accuracy (%)')
    plt.title(f'Accuracy vs. Number of Hidden Layers for Different Epochs using {activation_function.upper()} activation')
    plt.legend()
    plt.grid(True)
    plt.show()

Activation: tanh, Epoch: 1/25, Loss: 0.6945
Activation: tanh, Epoch: 10/25, Loss: 0.9142
Activation: tanh, Epoch: 20/25, Loss: 0.6249
Activation: tanh, Epoch: 25/25, Loss: 0.6739
Activation: tanh, Epoch: 1/50, Loss: 0.5501
Activation: tanh, Epoch: 10/50, Loss: 0.1502
Activation: tanh, Epoch: 20/50, Loss: 0.3340
Activation: tanh, Epoch: 30/50, Loss: 2.1500
Activation: tanh, Epoch: 40/50, Loss: 0.1789
Activation: tanh, Epoch: 50/50, Loss: 0.2024
Activation: tanh, Epoch: 1/100, Loss: 0.6815
Activation: tanh, Epoch: 10/100, Loss: 0.1112
Activation: tanh, Epoch: 20/100, Loss: 0.4918
Activation: tanh, Epoch: 30/100, Loss: 0.2053
Activation: tanh, Epoch: 40/100, Loss: 1.0827


In [4]:
learning_rate = 0.001
epochs_list = [25, 50, 100, 150]
hidden_sizes = [25, 50, 100, 150]
accuracies_epoch = []
accuracies_hidden = []

# Define the model architecture
input_size = 784
output_size = len(trainloader.dataset.classes)

for hidden_size in hidden_sizes:
    accuracies_epoch_for_hidden = []
    for epochs in epochs_list:
        # Initialize weights with random values
        w1 = torch.randn(input_size, hidden_size, requires_grad=True)
        w2 = torch.randn(hidden_size, output_size, requires_grad=True)

        for epoch in range(epochs):
            # Training loop
            total_loss = 0.0
            for images, labels in trainloader:
                images = images.view(images.shape[0], -1)

                # Forward pass
                h = images.mm(w1)
                h_tan = h.tanh()
                y_pred = h_tan.mm(w2)
                y_pred_sf = y_pred.softmax(dim=1)

                # Compute the loss
                loss = -torch.log(y_pred_sf[range(images.shape[0]), labels]).mean()

                # Backpropagation
                grad_y_pred = y_pred_sf.clone()
                grad_y_pred[range(images.shape[0]), labels] -= 1
                grad_w2 = h_tan.t().mm(grad_y_pred)
                grad_h_tan = grad_y_pred.mm(w2.t())
                grad_h = grad_h_tan * (1 - h_tan**2)
                grad_w1 = images.t().mm(grad_h)

                # Update weights manually
                with torch.no_grad():
                    w1 -= learning_rate * grad_w1
                    w2 -= learning_rate * grad_w2

                total_loss += loss.item()

            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(trainloader)}')

        # Evaluation
        correct = 0
        total = 0
        with torch.no_grad():
            for x, y in testloader:
                x = x.view(x.shape[0], -1)
                h_tan = x.mm(w1).tanh()
                y_pred = h_tan.mm(w2)
                predictions = torch.argmax(y_pred, dim=1)
                total += y.size(0)
                correct += (predictions == y).sum().item()

        accuracy = (correct / total) * 100
        accuracies_epoch_for_hidden.append(accuracy)

        print(f"Epoch {epoch+1} Accuracy = {accuracy:.2f}%, Loss = {loss:.4f}")

    accuracies_epoch.append(accuracies_epoch_for_hidden)

# Plot the accuracy against the number of epochs
plt.figure(figsize=(12, 6))
for i, hidden_size in enumerate(hidden_sizes):
    plt.plot(epochs_list, accuracies_epoch[i], marker='o', label=f'Hidden Layers = {hidden_size}')

plt.xlabel('Number of Epochs')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy vs. Number of Epochs for Different Hidden Layer Sizes')
plt.legend()
plt.grid(True)
plt.show()

# Plot the accuracy against the number of hidden layers
plt.figure(figsize=(12, 6))
for i, epoch in enumerate(epochs_list):
    plt.plot(hidden_sizes, [accuracies_epoch[j][i] for j in range(len(hidden_sizes))], marker='o', label=f'Epochs = {epoch}')

plt.xlabel('Number of Hidden Layers')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy vs. Number of Hidden Layers for Different Epochs')
plt.legend()
plt.grid(True)
plt.show()

Epoch [1/25], Loss: 2.0974880752364795
Epoch [2/25], Loss: 1.3303016740878424
Epoch [3/25], Loss: 1.0831014469688138
Epoch [4/25], Loss: 0.9503337954630454
Epoch [5/25], Loss: 0.8667799733765423
Epoch [6/25], Loss: 0.8049362244481841
Epoch [7/25], Loss: 0.7573151997886598
Epoch [8/25], Loss: 0.715014147259295
Epoch [9/25], Loss: 0.68192821407939
Epoch [10/25], Loss: 0.6532106630106767
Epoch [11/25], Loss: 0.626706531536455
Epoch [12/25], Loss: 0.6059293484377364
Epoch [13/25], Loss: 0.5900384186127533
Epoch [14/25], Loss: 0.5732725455220788
Epoch [15/25], Loss: 0.5610282144378871
Epoch [16/25], Loss: 0.5489829223540922
Epoch [17/25], Loss: 0.5376967935990542
Epoch [18/25], Loss: 0.5270814560881505
Epoch [19/25], Loss: 0.5175004238349696


KeyboardInterrupt: 

Below is the code for relu activation function with mini-batch gradient - batch size is 10. Epochs are 25, 50, 100, 150 and Hidden layer neurons count as  25, 50, 100, 150.

In [None]:
# Define hyperparameters
learning_rate = 0.001
epochs_list = [25, 50, 100, 150]
hidden_sizes = [25, 50, 100, 150]
accuracies_epoch = []
accuracies_hidden = []

# Define the model architecture
input_size = 784
output_size = len(trainloader.dataset.classes)

for hidden_size in hidden_sizes:
    accuracies_epoch_for_hidden = []
    for epochs in epochs_list:
        # Initialize weights with random values
        w1 = torch.randn(input_size, hidden_size, requires_grad=True)
        w2 = torch.randn(hidden_size, output_size, requires_grad=True)

        for epoch in range(epochs):
            # Training loop
            total_loss = 0.0
            for images, labels in trainloader:
                images = images.view(images.shape[0], -1)

                # Forward pass
                h = images.mm(w1)
                h_relu = h.clamp(min=0)
                y_pred = h_relu.mm(w2)
                y_pred_sf = y_pred.softmax(dim=1)

                # Compute the loss
                loss = -torch.log(y_pred_sf[range(images.shape[0]), labels]).mean()

                # Backpropagation
                grad_y_pred = y_pred_sf.clone()
                grad_y_pred[range(images.shape[0]), labels] -= 1  # Derivative of cross-entropy loss w.r.t. y_pred
                grad_w2 = h_relu.t().mm(grad_y_pred)
                grad_h_relu = grad_y_pred.mm(w2.t())
                grad_h = grad_h_relu * (h > 0).float()  # Derivative of ReLU activation
                grad_w1 = images.t().mm(grad_h)

                # Update weights manually
                with torch.no_grad():
                    w1 -= learning_rate * grad_w1
                    w2 -= learning_rate * grad_w2

                total_loss += loss.item()

            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(trainloader)}')

        # Evaluation
        correct = 0
        total = 0
        with torch.no_grad():
            for x, y in testloader:
                x = x.view(x.shape[0], -1)
                # Forward pass with ReLU activation
                h = x.mm(w1)
                h_relu = h.clamp(min=0)
                y_pred = h_relu.mm(w2)
                predictions = torch.argmax(y_pred, dim=1)
                total += y.size(0)
                correct += (predictions == y).sum().item()

        accuracy = (correct / total) * 100
        accuracies_epoch_for_hidden.append(accuracy)

        print(f"Epoch {epoch+1} Accuracy = {accuracy:.2f}%, Loss = {loss:.4f}")

    accuracies_epoch.append(accuracies_epoch_for_hidden)

# Plot the accuracy against the number of epochs
plt.figure(figsize=(12, 6))
for i, hidden_size in enumerate(hidden_sizes):
    plt.plot(epochs_list, accuracies_epoch[i], marker='o', label=f'Hidden Layers = {hidden_size}')

plt.xlabel('Number of Epochs')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy vs. Number of Epochs for Different Hidden Layer Sizes')
plt.legend()
plt.grid(True)
plt.show()

# Plot the accuracy against the number of hidden layers
plt.figure(figsize=(12, 6))
for i, epoch in enumerate(epochs_list):
    plt.plot(hidden_sizes, [accuracies_epoch[j][i] for j in range(len(hidden_sizes))], marker='o', label=f'Epochs = {epoch}')

plt.xlabel('Number of Hidden Layers')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy vs. Number of Hidden Layers for Different Epochs')
plt.legend()
plt.grid(True)
plt.show()


Now after finishing mini-batch gradient we are moving to stochastic gradient.
Similar to mini-batch we are using Epoch size as 25, 50, 100, 150 and Hidden layer neurons as 25, 50, 100, 150

In [None]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=1, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=True)

Further we will code for:

Activation function - Tanh

Epoch - 25, 50, 100, 150

Hidden layer - 25, 50, 100, 150

Stochastic Gradient descent

In [None]:
learning_rate = 0.001
epochs_list = [25, 50, 100, 150]
hidden_sizes = [25, 50, 100, 150]
accuracies_epoch = []
accuracies_hidden = []

# Define the model architecture
input_size = 784
output_size = len(trainloader.dataset.classes)

for hidden_size in hidden_sizes:
    accuracies_epoch_for_hidden = []
    for epochs in epochs_list:
        # Initialize weights with random values
        w1 = torch.randn(input_size, hidden_size, requires_grad=True)
        w2 = torch.randn(hidden_size, output_size, requires_grad=True)

        for epoch in range(epochs):
            # Training loop
            total_loss = 0.0
            for images, labels in trainloader:
                images = images.view(images.shape[0], -1)

                # Forward pass
                h = images.mm(w1)
                h_tan = h.tanh()
                y_pred = h_tan.mm(w2)
                y_pred_sf = y_pred.softmax(dim=1)

                # Compute the loss
                loss = -torch.log(y_pred_sf[range(images.shape[0]), labels]).mean()

                # Backpropagation
                grad_y_pred = y_pred_sf.clone()
                grad_y_pred[range(images.shape[0]), labels] -= 1
                grad_w2 = h_tan.t().mm(grad_y_pred)
                grad_h_tan = grad_y_pred.mm(w2.t())
                grad_h = grad_h_tan * (1 - h_tan**2)
                grad_w1 = images.t().mm(grad_h)

                # Update weights manually
                with torch.no_grad():
                    w1 -= learning_rate * grad_w1
                    w2 -= learning_rate * grad_w2

                total_loss += loss.item()

            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(trainloader)}')

        # Evaluation
        correct = 0
        total = 0
        with torch.no_grad():
            for x, y in testloader:
                x = x.view(x.shape[0], -1)
                h_tan = x.mm(w1).tanh()
                y_pred = h_tan.mm(w2)
                predictions = torch.argmax(y_pred, dim=1)
                total += y.size(0)
                correct += (predictions == y).sum().item()

        accuracy = (correct / total) * 100
        accuracies_epoch_for_hidden.append(accuracy)

        print(f"Epoch {epoch+1} Accuracy = {accuracy:.2f}%, Loss = {loss:.4f}")

    accuracies_epoch.append(accuracies_epoch_for_hidden)

# Plot the accuracy against the number of epochs
plt.figure(figsize=(12, 6))
for i, hidden_size in enumerate(hidden_sizes):
    plt.plot(epochs_list, accuracies_epoch[i], marker='o', label=f'Hidden Layers = {hidden_size}')

plt.xlabel('Number of Epochs')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy vs. Number of Epochs for Different Hidden Layer Sizes')
plt.legend()
plt.grid(True)
plt.show()

# Plot the accuracy against the number of hidden layers
plt.figure(figsize=(12, 6))
for i, epoch in enumerate(epochs_list):
    plt.plot(hidden_sizes, [accuracies_epoch[j][i] for j in range(len(hidden_sizes))], marker='o', label=f'Epochs = {epoch}')

plt.xlabel('Number of Hidden Layers')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy vs. Number of Hidden Layers for Different Epochs')
plt.legend()
plt.grid(True)
plt.show()


Activation function - Relu

Epoch - 25, 50, 100, 150

Hidden layer - 25, 50, 100, 150

Stochastic Gradient descent

In [None]:
# Define hyperparameters
learning_rate = 0.001
epochs_list = [25, 50, 100, 150]
hidden_sizes = [25, 50, 100, 150]
accuracies_epoch = []
accuracies_hidden = []

# Define the model architecture
input_size = 784
output_size = len(trainloader.dataset.classes)

for hidden_size in hidden_sizes:
    accuracies_epoch_for_hidden = []
    for epochs in epochs_list:
        # Initialize weights with random values
        w1 = torch.randn(input_size, hidden_size, requires_grad=True)
        w2 = torch.randn(hidden_size, output_size, requires_grad=True)

        for epoch in range(epochs):
            # Training loop
            total_loss = 0.0
            for images, labels in trainloader:
                images = images.view(images.shape[0], -1)

                # Forward pass
                h = images.mm(w1)
                h_relu = h.clamp(min=0)
                y_pred = h_relu.mm(w2)
                y_pred_sf = y_pred.softmax(dim=1)

                # Compute the loss
                loss = -torch.log(y_pred_sf[range(images.shape[0]), labels]).mean()

                # Backpropagation
                grad_y_pred = y_pred_sf.clone()
                grad_y_pred[range(images.shape[0]), labels] -= 1  # Derivative of cross-entropy loss w.r.t. y_pred
                grad_w2 = h_relu.t().mm(grad_y_pred)
                grad_h_relu = grad_y_pred.mm(w2.t())
                grad_h = grad_h_relu * (h > 0).float()  # Derivative of ReLU activation
                grad_w1 = images.t().mm(grad_h)

                # Update weights manually
                with torch.no_grad():
                    w1 -= learning_rate * grad_w1
                    w2 -= learning_rate * grad_w2

                total_loss += loss.item()

            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(trainloader)}')

        # Evaluation
        correct = 0
        total = 0
        with torch.no_grad():
            for x, y in testloader:
                x = x.view(x.shape[0], -1)
                # Forward pass with ReLU activation
                h = x.mm(w1)
                h_relu = h.clamp(min=0)
                y_pred = h_relu.mm(w2)
                predictions = torch.argmax(y_pred, dim=1)
                total += y.size(0)
                correct += (predictions == y).sum().item()

        accuracy = (correct / total) * 100
        accuracies_epoch_for_hidden.append(accuracy)

        print(f"Epoch {epoch+1} Accuracy = {accuracy:.2f}%, Loss = {loss:.4f}")

    accuracies_epoch.append(accuracies_epoch_for_hidden)

# Plot the accuracy against the number of epochs
plt.figure(figsize=(12, 6))
for i, hidden_size in enumerate(hidden_sizes):
    plt.plot(epochs_list, accuracies_epoch[i], marker='o', label=f'Hidden Layers = {hidden_size}')

plt.xlabel('Number of Epochs')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy vs. Number of Epochs for Different Hidden Layer Sizes')
plt.legend()
plt.grid(True)
plt.show()

# Plot the accuracy against the number of hidden layers
plt.figure(figsize=(12, 6))
for i, epoch in enumerate(epochs_list):
    plt.plot(hidden_sizes, [accuracies_epoch[j][i] for j in range(len(hidden_sizes))], marker='o', label=f'Epochs = {epoch}')

plt.xlabel('Number of Hidden Layers')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy vs. Number of Hidden Layers for Different Epochs')
plt.legend()
plt.grid(True)
plt.show()
