# **Softmax Regression from Scratch**
We use the Fashion-MNIST data set with batch size 256.

In [1]:
%matplotlib inline
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
import numpy as np
import torchvision
import torchvision.datasets as datasets
from torchvision import transforms
import torch.optim as optim
import time

batch_size = 256
transform = transforms.Compose([transforms.ToTensor(),]) 

mnist_train = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
mnist_test = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size,
    shuffle=True, num_workers=0)
test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size,
    shuffle=True, num_workers=0)

num_epochs, lr = 5, 0.1

# **The Softmax**
We can now define the softmax function. For that we rst exponentiate each term using $\exp$ and then sum each row to get the normalization constant. Last we divide each row by its normalization constant and return the result.

$\textrm{softmax}(\mathbf{X})_{ij}=\frac{\exp(X_{ij})}{\sum_k\exp (X_{ik})}$

In [2]:
def softmax(X):
    X_exp = X.exp()
    partition = X_exp.sum(axis=1, keepdims=True)
    return X_exp / partition # The broadcast mechanism is applied here.

In [3]:
X = torch.normal(0, 1, size=(2, 5))
X_prob = softmax(X)
print(X_prob, X_prob.sum(axis=1))

tensor([[0.0943, 0.3198, 0.2472, 0.1737, 0.1650],
        [0.2237, 0.4788, 0.0664, 0.0415, 0.1896]]) tensor([1.0000, 1.0000])


# **The Model and Parameters Initialization**
Since each example is an image with $28\times 28$ pixels we can store it as a 784 dimensional
vector. Moreover, since we have 10 categories, the single layer network has an output
dimension of 10.

In [4]:
class Flatten(torch.nn.Module):
    def forward(self, x):
        return x.view(-1,784)

net = nn.Sequential(Flatten(), nn.Linear(784, 10))

def init_weights(m):
    if type(m) == nn.Linear:
        # Initialize weight parameter by a normal distribition 
        # with a mean of 0 and standard deviation of 0.01.
        nn.init.normal_(m.weight.data, std=0.01)
        # The bias parameter is initialized to zero by default.
        m.bias.data.fill_(0.0)

net.apply(init_weights)

Sequential(
  (0): Flatten()
  (1): Linear(in_features=784, out_features=10, bias=True)
)

# **The Loss Function**

In [5]:
loss = nn.CrossEntropyLoss()

# **Optimization Algorithm**
We use SGD with a learning rate of 0.1, just as in linear regression.

In [6]:
trainer = optim.SGD(net.parameters(), lr=lr)

# **Classication Accuracy**
Given a class of predicted probability distributions *y_hat* , we use the one with the highest
predicted probability as the output category.

In [7]:
def evaluate_accuracy(data_iter, net):
    """Evaluate accuracy of a model on the given data set."""
    acc_sum,n = 0,0
    for (imgs, labels) in data_iter:
        net.eval()
        with torch.no_grad():
            labels = labels.float()
            acc_sum += torch.sum((torch.argmax(net(imgs), dim=1) == labels)).float()
            n += labels.shape[0]
    return acc_sum.item()/n

Because we initialized the net model with random weights, the accuracy of this model should be close to random guessing, i.e. 0.1 for 10 classes.

In [8]:
evaluate_accuracy(test_loader, net)

0.056

# **Model Training**

In [9]:
for epoch in range(1, num_epochs+1):
    train_loader_iter = iter(train_loader)
    train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
    
    for (imgs, labels) in train_loader_iter:
        net.train()
        y_hat = net(imgs)
        l = loss(y_hat, labels)
        # Backprobagation
        l.backward()
        trainer.step()

        # Calculate tarining error
        with torch.no_grad():
            labels = labels.float()
            train_l_sum += l.item()
            train_acc_sum += (torch.sum(torch.argmax(y_hat, dim=1) == labels)).float().item()
            n += labels.shape[0]
    # calculate testing error every epoch.
    test_acc = evaluate_accuracy(iter(test_loader), net)
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
          % (epoch, train_l_sum/n, train_acc_sum/n, test_acc,
            time.time() - start))

epoch 1, loss 0.0097, train acc 0.768, test acc 0.814, time 5.5 sec
epoch 2, loss 0.0184, train acc 0.804, test acc 0.799, time 5.8 sec
epoch 3, loss 0.0291, train acc 0.804, test acc 0.798, time 8.8 sec
epoch 4, loss 0.0365, train acc 0.814, test acc 0.804, time 5.5 sec
epoch 5, loss 0.0366, train acc 0.814, test acc 0.794, time 5.6 sec
