In [98]:
import torch
import torch.nn as nn
import torchvision.datasets as data
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.functional as F

### Downloading MNIST Datasets 

In [99]:
# Training split: stored under ./data, with download requested; every image
# goes through ToTensor().
train_data = data.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
# Test split: read from the same root with the same transform; no download
# flag is passed here.
test_data = data.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

In [100]:
# Display the dataset summary (number of datapoints, split, root, transforms).
train_data

Dataset MNIST
    Number of datapoints: 60000
    Split: train
    Root Location: ./data
    Transforms (if any): ToTensor()
    Target Transforms (if any): None

### About Datasets
We have 60,000 training samples, so we split them into small batches to reduce the memory and compute load on the RAM and CPU.

In [101]:
# Mini-batch size and the total number of optimizer steps we want to run.
batch_size = 100
n_iters = 3000

# Convert the iteration budget into whole epochs:
# epochs = n_iters / (iterations per epoch), truncated to an integer.
iters_per_epoch = len(train_data) / batch_size
num_epochs = int(n_iters / iters_per_epoch)

# Training batches are reshuffled so each epoch sees the samples in a new order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
)
# Accuracy over the full test set does not depend on sample order, so the
# evaluation loader is left unshuffled (deterministic batches, no wasted
# permutation work).
test_loader = torch.utils.data.DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    shuffle=False,
)

### Epochs
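One epoch means the whole training set (all 60,000 samples) has been passed through the model once. With a batch size of 100, one epoch is 600 iterations, so 3,000 iterations correspond to 5 epochs.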

In [102]:
class Net(nn.Module):
    """Feed-forward classifier with two hidden ReLU layers.

    Layout:
        Linear(input_dim -> hidden_dim) -> ReLU
        -> Linear(hidden_dim -> hidden_dim) -> ReLU
        -> Linear(hidden_dim -> output_dim)

    The last layer returns raw scores; no softmax is applied inside the model.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of both hidden layers.
        output_dim: number of output scores per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Net, self).__init__()
        # Layers are registered in the same order forward() applies them.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map a batch of feature vectors to a batch of raw class scores."""
        first_hidden = self.relu(self.fc1(x))
        second_hidden = self.relu2(self.fc2(first_hidden))
        return self.fc3(second_hidden)

### Model Class
Input dim:784
->size of image=28*28(width *height)

Output dim:10
->0,1,2,3,4,5,6,7,8,9

Hidden dim:100
->can be any number 
->No. of neurons 

The interesting part is choosing the number of neurons. Intuitively, we think a bigger model means a better model, but a bigger model requires more training samples to learn and converge to a good model. Since this is a digit-recognition problem, we typically do not need a big model to achieve the required result.

On the other hand, too small a hidden size means the model has insufficient capacity to predict competently. Too small a capacity is like a smaller brain: no matter how many training samples we give it, its predictive power is capped.


In [103]:
# Model and optimizer hyperparameters.
input_dim = 28 * 28      # flattened 28x28 image
hidden_dim = 100         # number of neurons in each hidden layer
output_dim = 10          # one score per digit class 0-9
learning_rate = 0.001    # optimizer step size

In [104]:
# Build the network from the hyperparameters defined above.
net = Net(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [109]:
# Loss: cross-entropy computed on the network's raw class scores.
criteria = nn.CrossEntropyLoss()

# Optimizer: plain SGD over every parameter of the network.
# Alternative kept for experimentation:
#   torch.optim.Adam(net.parameters(), lr=learning_rate)
optimizer = torch.optim.SGD(params=net.parameters(), lr=learning_rate)

### Loss and Optimizers
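Cross-entropy loss measures the error between the model's predictions and the true class labels. `nn.CrossEntropyLoss` applies the softmax internally, so the model outputs raw logits and the labels are integer class indices (0–9), not one-hot vectors.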

OPTIMIZER:
Equation: $\theta \leftarrow \theta - \eta \, \nabla_\theta L$ (parameters = parameters − learning_rate × parameter_gradients)



At every iteration, we update our parameters.
The learning rate determines how fast the algorithm learns. Too small and the algorithm learns too slowly; too large and the algorithm learns too fast, resulting in instabilities.


In [110]:
# net.parameters() is a generator, so printing it only shows the generator object.
print(net.parameters())

# Materialise the generator once to count the tensors and inspect their shapes.
param_list = list(net.parameters())
print(len(param_list))

# Sizes of the first four parameter tensors, in registration order.
for idx in range(4):
    print(param_list[idx].size())

<generator object Module.parameters at 0x000001F82809AD00>
6
torch.Size([100, 784])
torch.Size([100])
torch.Size([100, 100])
torch.Size([100])


In [111]:
def _evaluate_accuracy(model, loader):
    """Return the classification accuracy of `model` over `loader`, in percent.

    Each batch of images is flattened to (batch, features) before the forward
    pass. Gradients are disabled because evaluation never calls backward(),
    which avoids building an autograd graph for every test batch.

    Args:
        model: module mapping a (batch, features) tensor to per-class scores.
        loader: iterable yielding (images, labels) batches.

    Returns:
        Accuracy as a Python float in [0, 100]; 0.0 if the loader is empty.
    """
    correct = 0
    total = 0
    was_training = model.training
    model.eval()
    with torch.no_grad():
        for batch_images, batch_labels in loader:
            flat_images = batch_images.view(batch_images.size(0), -1)
            logits = model(flat_images)
            # Predicted class = index of the largest score in each row.
            predicted = logits.argmax(dim=1)
            total += batch_labels.size(0)
            # .item() keeps the running count a plain int rather than a tensor,
            # so the final percentage is an ordinary float on every torch version.
            correct += (predicted == batch_labels).sum().item()
    # Restore whatever mode the model was in before evaluation.
    model.train(was_training)
    return 100.0 * correct / total if total else 0.0


# Global step counter (named `iteration` to avoid shadowing the builtin `iter`).
iteration = 0
for epoch in range(num_epochs):
    for images, labels in train_loader:
        # Flatten each 1x28x28 image into a single feature vector.
        # Inputs do not need requires_grad: only the parameters are optimised.
        images = images.view(images.size(0), -1)

        # Clear gradients w.r.t. parameters left over from the previous step.
        optimizer.zero_grad()

        # Forward pass to get output/logits.
        outputs = net(images)

        # Calculate loss: softmax --> cross entropy loss.
        loss = criteria(outputs, labels)

        # Back-propagate to get gradients w.r.t. parameters.
        loss.backward()

        # Update parameters.
        optimizer.step()

        iteration += 1

        # Every 500 steps, report the last training-batch loss and the
        # accuracy over the whole test set.
        if iteration % 500 == 0:
            accuracy = _evaluate_accuracy(net, test_loader)
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iteration, loss.item(), accuracy))

Iteration: 500. Loss: 0.025853915140032768. Accuracy: 97
Iteration: 1000. Loss: 0.0422731414437294. Accuracy: 97
Iteration: 1500. Loss: 0.016388077288866043. Accuracy: 97
Iteration: 2000. Loss: 0.02876640111207962. Accuracy: 97
Iteration: 2500. Loss: 0.032493989914655685. Accuracy: 97
Iteration: 3000. Loss: 0.015239179134368896. Accuracy: 97
