# Softmax Regression from Scratch

We use the Fashion-MNIST data set with batch size 256.

In [2]:
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
import numpy as np
import torchvision
import torchvision.datasets as datasets
from torchvision import transforms
import torch.optim as optim
import time
from torch.utils.data import DataLoader
# Seed PyTorch's global RNG so that the shuffled mini-batches, the random demo
# tensor and the random weight initialization used later in this notebook are
# repeatable under Restart Kernel -> Run All.
SEED = 0
torch.manual_seed(SEED)

# Number of examples per mini-batch.
batch_size = 256

# Load Fashion MNIST dataset
# Convert every image to a tensor when it is read from the dataset.
transform = transforms.Compose([
    transforms.ToTensor()
])

# Training and test splits; files are downloaded into ./data if missing.
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

# Shuffle only the training data; keep the test order fixed.
train_iter = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_iter = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training hyper-parameters: number of passes over the data and SGD step size.
num_epochs, lr = 5, 0.1

100%|██████████| 26.4M/26.4M [00:13<00:00, 2.01MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 197kB/s]
100%|██████████| 4.42M/4.42M [00:04<00:00, 990kB/s] 
100%|██████████| 5.15k/5.15k [00:00<00:00, 6.13MB/s]


## The Softmax

We can now define the softmax function. For that we first exponentiate each term using `exp` and then sum each row to get the normalization constant. Last, we divide each row by its normalization constant and return the result.



In [3]:
def softmax(X):
    """Apply the softmax function to every row of a 2-D tensor.

    Args:
        X: Tensor whose rows hold the scores of one example each; the
            normalization runs along dimension 1.

    Returns:
        Tensor of the same shape as ``X`` whose rows are non-negative and
        sum to 1.
    """
    # Subtracting the row maximum does not change the result mathematically
    # (the factor exp(-max) cancels in the ratio) but keeps exp() from
    # overflowing to inf, which would otherwise produce NaN via inf / inf.
    row_max = X.max(dim=1, keepdim=True).values
    X_exp = (X - row_max).exp()
    partition = X_exp.sum(dim=1, keepdim=True)
    return X_exp / partition  # The broadcast mechanism is applied here.
     

In [4]:
# Sanity check: draw a small random matrix, push it through softmax and
# confirm that every row of the result sums to 1.
X = torch.normal(mean=0, std=1, size=(2, 5))
X_prob = softmax(X)
row_totals = X_prob.sum(dim=1)
print(X_prob, row_totals)

tensor([[0.3210, 0.1204, 0.2654, 0.1163, 0.1770],
        [0.0546, 0.0445, 0.2507, 0.6212, 0.0289]]) tensor([1.0000, 1.0000])


## The Model and Parameters Initialization

Since each example is an image with $28 \times 28$ pixels, we can store it as a 784-dimensional vector. Moreover, since we have 10 categories, the single-layer network has an output dimension of 10.

In [5]:
class Flatten(torch.nn.Module):
    """Reshape the input into rows of 784 values for the linear layer."""

    def forward(self, x):
        """Return ``x`` reshaped to ``(-1, 784)``.

        Args:
            x: Tensor whose total number of elements is a multiple of 784.

        Returns:
            Tensor of shape ``(N, 784)`` holding the same elements as ``x``.
        """
        # reshape behaves like view for contiguous inputs but also accepts
        # non-contiguous ones (e.g. a transposed batch), where view raises.
        return x.reshape(-1, 784)

# Single-layer network: flatten each input, then apply one affine map from
# the 784 input values to the 10 output scores.
net = nn.Sequential(
    Flatten(),
    nn.Linear(in_features=784, out_features=10),
)

def init_weights(m):
    """Initialize a linear layer in place; leave every other module untouched.

    Intended to be passed to ``Module.apply``, which calls it once per
    sub-module.

    Args:
        m: The module to (possibly) initialize.
    """
    # Exact type match, as before, so subclasses of nn.Linear are not touched.
    if type(m) is nn.Linear:
        # Initialize the weight parameter from a normal distribution
        # with a mean of 0 and a standard deviation of 0.01.
        nn.init.normal_(m.weight, std=0.01)
        # Explicitly zero the bias. Layers built with bias=False have
        # m.bias set to None, so there is nothing to initialize for them.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Run init_weights on every sub-module of net (and on net itself).
net.apply(fn=init_weights)

Sequential(
  (0): Flatten()
  (1): Linear(in_features=784, out_features=10, bias=True)
)

## Loss Function


In [6]:

# Cross-entropy loss averaged over the examples of a mini-batch.
# NOTE(review): per the PyTorch docs this criterion expects raw, unnormalized
# scores, so the hand-written softmax defined earlier should not be applied to
# the network output before calling it - confirm in the training loop.
loss = nn.CrossEntropyLoss(reduction="mean")
     

## Optimization Algorithm

We use SGD with a learning rate of 0.1, just as in linear regression.

In [7]:
# Plain stochastic gradient descent over all parameters of net, using the
# learning rate lr defined in the setup cell.
trainer = optim.SGD(params=net.parameters(), lr=lr)

## Classification Accuracy
Given the predicted probability distribution `y_hat` for an example, we use the class with the highest predicted probability as the output category.