In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# This new configuration allows for the use of GPU acceleration on Apple Silicon Macs
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.has_mps:
    device = 'mps'
print(device)
torch.manual_seed(189898) # Last 6 digits of my A# without the leading zeros

mps


<torch._C.Generator at 0x109895a90>

In [2]:
# Check your Current Working Directory
!pwd

/Users/jagardiner/Desktop/STAT-6685/HW3


In [3]:
#Set Batch Size
batch_size = 32

# Download the MNIST dataset to local drive. A new folder "data" will be created in teh current directory to store data
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transforms.ToTensor())

# Use a data loader to shuffle and batch
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [4]:
# Hyperparameters
# Network Architecture
input_size = 784
num_classes = 10

# Training Parameters
num_epochs = 6

# Fully connected neural network with two hidden layers
class NeuralNet(nn.Module):
    def __init__(self, in_size, h1, h2, n_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(in_size, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.relu = nn.ReLU()
        self.fc3 = nn.Linear(h2, n_classes)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out
    
# Define the Loss Function and Optimizer
criterion = torch.nn.CrossEntropyLoss()

In [5]:
# Train the model
learning_rates = [0.05, 0.1]
hidden_sizes = [[1568, 1568], [3136, 1568], [1568, 784]] # These values should all produce 97+% accuracy
best_performer = 0

for hidden_size in hidden_sizes:
    for learning_rate in learning_rates:
        total_step = len(train_loader)
        # Define the model object and the optimizer
        model = NeuralNet(input_size, hidden_size[0], hidden_size[1], num_classes).to(device)
        optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)
        for epoch in range(num_epochs):
            for i, (images, labels) in enumerate(train_loader):
                # Move tensors to the configured device
                images = images.reshape(-1, 28*28).to(device)
                labels = labels.to(device)

                # Forward pass
                outputs = model.forward(images)
                loss = criterion(outputs, labels)

                # Backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Print progress every 100 steps
                if (i+1) % 100 == 0:
                    print(f'Epoch[{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {round(float(loss.item()), 4)}')

    # Test the model once you finish training
        with torch.no_grad(): # In test phase we don't need to compute gradients (for memory efficiency)
            correct = 0
            total = 0
            for images, labels in test_loader:
                images = images.reshape(-1, 28*28).to(device)
                labels = labels.to(device)

                # get network outputs
                outputs = model.forward(images)
                throwaway, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            
            if correct > best_performer:
                # Save the best performing model for future use
                best_performer = correct
                torch.save(model.state_dict(), 'model.ckpt')
            print(f"Accuracy of the network for the 10,000 test images: {(100 * correct / total)}%, with learning rate: {learning_rate}, and {hidden_size} hidden neurons")
            print("\n\n")


Epoch[1/6], Step [100/1875], Loss: 0.7621
Epoch[1/6], Step [200/1875], Loss: 0.3125
Epoch[1/6], Step [300/1875], Loss: 0.4455
Epoch[1/6], Step [400/1875], Loss: 0.3216
Epoch[1/6], Step [500/1875], Loss: 0.5484
Epoch[1/6], Step [600/1875], Loss: 0.2955
Epoch[1/6], Step [700/1875], Loss: 0.18
Epoch[1/6], Step [800/1875], Loss: 0.4373
Epoch[1/6], Step [900/1875], Loss: 0.1879
Epoch[1/6], Step [1000/1875], Loss: 0.1108
Epoch[1/6], Step [1100/1875], Loss: 0.2354
Epoch[1/6], Step [1200/1875], Loss: 0.2111
Epoch[1/6], Step [1300/1875], Loss: 0.4518
Epoch[1/6], Step [1400/1875], Loss: 0.3937
Epoch[1/6], Step [1500/1875], Loss: 0.27
Epoch[1/6], Step [1600/1875], Loss: 0.0849
Epoch[1/6], Step [1700/1875], Loss: 0.2242
Epoch[1/6], Step [1800/1875], Loss: 0.2007
Epoch[2/6], Step [100/1875], Loss: 0.129
Epoch[2/6], Step [200/1875], Loss: 0.1488
Epoch[2/6], Step [300/1875], Loss: 0.1768
Epoch[2/6], Step [400/1875], Loss: 0.2273
Epoch[2/6], Step [500/1875], Loss: 0.1346
Epoch[2/6], Step [600/1875], L

In [6]:
# originally tested a range from 0.001 to 10 with order of magnitude increases for lr, 1578*1578, 784*1568, and 1568*784 hidden layer sizes with batch size of 20 and trained for 10 epochs
# For lr >= 1, loss=nan, result=random guessing
# 98.16%, 0.1, [1568,784]
# 96.57%, 0.01, [1568,784]
# 91.27%, 0.001, [1568,784]
# 97.62%, 0.1, [784, 1568]
# 96.53%, 0.01, [784, 1568]
# 91.20%, 0.001, [784, 1568]
# 98.09%, 0.1, [1568, 1568]
# 96.89%, 0.01, [1568, 1568]
# 91.24%, 0.001, [1568, 1568]

