In [6]:
import torch
import torchvision #library for working with images
from torchvision.datasets import MNIST
import matplotlib.pyplot as plt
%matplotlib inline 
#matplotlib inline signifies that we want to see output in our ide, not as a popup
import torchvision.transforms as transforms #contains predefined functions to convert images to tensors
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler #allows us to sample elements randomly from list of indices but not needed with new shuffle feature
from torch.utils.data.dataloader import DataLoader
import torch.nn as nn
import torch.nn.functional as F

#load the MNIST training split; no transform is given here, so each item is an (image, label) pair holding the raw image
dataset = MNIST(root = 'data/', download = True) #root is the folder the data is stored in, download = True fetches the files if they are not already there
# print("size of training/validation dataset")
# print(len(dataset)) #gives size of dataset

testDataset = MNIST(root = 'data/', train = False) #by passing train = False, we signify that we want the other images (the testing ones)
# print("size of testing set")
# print(len(testDataset))

dataset[0] #indexes the first (image, label) pair of the training set; the result is not stored

#show an individual image
image, label, = dataset[0] #unpack the image and its label from the item at index 0
# plt.imshow(image, cmap = 'gray')
# print('Label :', label)

#reload the training split, this time converting every image to a tensor because the model cannot take a raw image
dataset = MNIST(root = 'data/', train = True, transform = transforms.ToTensor()) #root = data folder, train = whether it is the training set, transform = conversion applied to each image

#look at one image after it has been converted to a tensor
imgTensor, label = dataset[0] #unpack the image tensor and its label from the item at index 0
# print(imgTensor.shape, label) #prints the tensor shape and the label, e.g. torch.Size([1, 28, 28]) 5
#the first dimension is the number of colour channels (1 because the image is grayscale, it would be 3 for RGB); the other two are the height and width

# print(imgTensor[:, 10:15, 10:15]) #prints the pixel values of the 5x5 patch covering rows 10-14 and columns 10-14
# print(torch.max(imgTensor), torch.min(imgTensor)) #prints 1 and 0: 1 is the maximum (white), 0 is the minimum (black)

# plt.imshow(imgTensor[0, 10:15, 10:15], cmap = 'gray') #displays that patch as an image; it also replaces the earlier plt.imshow output

#training set: used to train the model
#validation set: used to evaluate the model during training (e.g. to compute a metric) and to tune hyperparameters such as the learning rate, so the best version of the model can be picked
#test set: used to compare different models and report the final accuracy

def splitIndices(n, valPct):
    """Randomly partition the indices 0..n-1 into a training part and a validation part.

    n is the number of samples and valPct is the fraction of them reserved for
    validation (the count is truncated to an int).
    Returns (trainIndices, valIndices): two slices of one random permutation,
    so together they cover every index exactly once.
    """
    valCount = int(valPct * n) #how many samples go to the validation part
    shuffled = np.random.permutation(n) #the indices 0..n-1 in random order
    #the head of the shuffled indices is the validation part, everything after it is the training part
    valPart, trainPart = shuffled[:valCount], shuffled[valCount:]
    return trainPart, valPart

trainIndices, valIndices = splitIndices(len(dataset), valPct = 0.2) #split the dataset's indices, holding out 20 percent of them for validation

#we shuffle because the data might be stored in order; keeping the order only matters for problems where it carries meaning, e.g. anything to do with time

#trainIndices should be 48k, valIndices is 12k 

batchSize = 100 #training in batches is more efficient and lets the data fit in memory if that is a problem

#data loaders help us load data in batches
#first attempt, kept for reference (note that it hands the index arrays themselves to the loader, not the images):
# trainLoader = DataLoader(trainIndices, batchSize, shuffle = True) #takes in a dataset, the batch size, and whether the data should be shuffled
# valLoader = DataLoader(valIndices, batchSize, shuffle = True)

#what is used instead: each loader reads from the full dataset, and a sampler restricts it to one side of the split, visiting those indices in random order
trainSampler = SubsetRandomSampler(trainIndices)
trainLoader = DataLoader(dataset, batchSize, sampler = trainSampler)

valSampler = SubsetRandomSampler(valIndices)
valLoader = DataLoader(dataset, batchSize, sampler = valSampler)

inputSize = 28*28 #one input feature per pixel of a 28x28 image
numClasses = 10 #digits 0-9

model = nn.Linear(inputSize, numClasses) #create model (logistic regression): a single linear layer from pixels to class scores

# print(model.weight.shape)
# model.weight

# print(model.bias.shape)
# model.bias

# for images, labels in trainLoader:
#     print(labels)
#     print(images.shape)
#     outputs = model(images) #raises a size mismatch error: the layer expects a batch of vectors of size 28*28 but gets torch.Size([100, 1, 28, 28]), so each image has to be flattened into a single vector first
#     break

class MnistModel(nn.Module):
    """Logistic-regression classifier: flattens each image and applies one linear layer.

    The layer is sized from the module-level inputSize and numClasses.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(inputSize, numClasses) #instantiate the weights and biases using nn.Linear

    def forward(self, xb):
        """Return the raw class scores (one row per image) for the batch xb.

        xb is reshaped to (-1, in_features), so it may have any shape whose
        element count is a multiple of the layer's input width, e.g.
        (batch, 1, 28, 28) for the default 784-feature layer.
        """
        #flatten with the layer's own input width instead of a hard-coded 784, so the reshape always agrees with the layer built in __init__
        flattened = xb.reshape(-1, self.linear.in_features)
        return self.linear(flattened)

model = MnistModel() #replaces the bare nn.Linear model with one that flattens its input first

# print(model.linear.weight.shape, model.linear.bias.shape)
# print(model.parameters())

#push a single batch through the untrained model so its raw outputs can be inspected
for images, labels in trainLoader:
    outputs = model(images)
    break

# print('outputs.shape : ', outputs.shape) #one row per image in the batch and one column per class (100 x 10 for a full batch)
# print('Sample outputs : \n', outputs[:2].data) #10 raw scores per image; they are plain numbers, not probabilities, so softmax is applied next

probs = F.softmax(outputs, dim = 1) #dim = 1 normalises across the class scores of each image, so every row becomes a probability distribution
#other activations that were tried; they do not produce probabilities, so they stay disabled
# probs = torch.sigmoid(outputs)
# probs = torch.relu(outputs)
# probs = F.silu(outputs) #was left active and silently overwrote the softmax result above; SiLU is not monotonic, so it could even change which class scores highest

maxProbs, preds = torch.max(probs, dim = 1) #highest probability per image and the class index it belongs to (the predicted digit)

# def accuracy (l1, l2): #gives accuracy of predictions
#     return torch.sum(l1 == l2).item() / len(l1)

#accuracy(preds, labels) * 100 = percentage of correct answers

#accuracy cannot be used as a loss function: it only looks at the single predicted class, not at the probabilities or how close they were

#a good loss function here is cross-entropy: minus the natural log of the probability the model assigned to the correct label

lossFn = F.cross_entropy #takes the raw outputs (not probs) together with the labels

loss = lossFn(outputs, labels) #roughly -ln(probability given to the right class), so about 2.3 corresponds to a probability of about 0.1, i.e. guessing among 10 classes

learningRate = 0.001 #example of a hyperparameter
optimizer = torch.optim.SGD(model.parameters(), lr = learningRate) #updates the weights and biases during gradient descent

#also optionally computes a metric like accuracy
def lossBatch(model, lossFunc, xb, yb, opt = None, metric = None):
    """Compute the loss for one batch and, when an optimizer is given, take one update step.

    model    -- callable applied to xb to produce predictions
    lossFunc -- called as lossFunc(predictions, yb); its result must support backward() and item()
    xb, yb   -- the batch inputs and their actual labels
    opt      -- optional optimizer; when given, gradients are computed, one step is taken and the gradients are reset
    metric   -- optional function called as metric(predictions, yb)

    Returns (loss as a Python number, number of samples in the batch, metric result or None).
    """
    predictions = model(xb)
    batchLoss = lossFunc(predictions, yb)

    if opt is not None:
        batchLoss.backward() #compute gradients
        opt.step() #update the parameters
        opt.zero_grad() #reset the gradients so they do not accumulate into the next batch

    #the metric is evaluated on the predictions made before any update step
    metricResult = metric(predictions, yb) if metric is not None else None

    return batchLoss.item(), len(xb), metricResult

#calculates overall loss and a metric and also outputs total size of all batches together 
def evaluate(model, lossFn, validDl, metric = None):
    """Run the model over every batch of validDl without tracking gradients.

    Each batch goes through lossBatch with no optimizer, so nothing is updated.
    Returns (avgLoss, total, avgMetric): the loss averaged over all samples,
    the total number of samples seen, and the metric averaged over all
    samples (None when no metric is given).
    Raises ValueError when validDl yields no batches.
    """
    def weightedMean(values, weights, weightSum):
        #batches can differ in size, so each per-batch value counts in proportion to its batch size
        return np.sum(np.multiply(values, weights)) / weightSum

    with torch.no_grad(): #gradients are not needed for evaluation
        perBatch = [lossBatch(model, lossFn, xb, yb, metric = metric)
                    for xb, yb in validDl]

        #turn the list of (loss, size, metric) triples into three parallel tuples
        batchLosses, batchSizes, batchMetrics = zip(*perBatch)

        total = np.sum(batchSizes) #total size is the sum of all batch sizes
        avgLoss = weightedMean(batchLosses, batchSizes, total)
        avgMetric = weightedMean(batchMetrics, batchSizes, total) if metric is not None else None

    return avgLoss, total, avgMetric

#redefine accuracy to work with an entire batch
def accuracy(outputs, labels):
    """Return the fraction of rows in outputs whose highest-scoring column matches the label.

    outputs holds one row of class scores per sample; labels holds the correct
    class index of each sample.
    """
    predicted = torch.max(outputs, dim = 1).indices #column with the highest score in each row (the class the model picked)
    numCorrect = (predicted == labels).sum().item() #how many picks are correct
    return numCorrect / len(predicted)

#current results before training valLoss, total, valAcc = evaluate(model, lossFn, valLoader, metric = accuracy)

#training 
def fit(epochs, model, lossFn, opt, trainDl, validDl, metric = None):
    """Train the model for the given number of epochs, printing validation results after each one.

    Every epoch runs lossBatch with the optimizer on each batch of trainDl,
    then calls evaluate on validDl and prints the validation loss (plus the
    metric value, labelled "Accuracy", when a metric is given).
    Returns nothing.
    """
    for epochIdx in range(epochs):
        #training pass: one optimizer step per batch
        for inputs, targets in trainDl:
            lossBatch(model, lossFn, inputs, targets, opt)

        #validation pass
        valLoss, _, valMetric = evaluate(model, lossFn, validDl, metric)

        #report progress through training
        if metric is not None:
            report = 'Epoch [{}/{}], Loss: {:.4f}, {}: {:.4f}'.format(
                epochIdx + 1, epochs, valLoss, "Accuracy", valMetric)
        else:
            report = 'Epoch [{}/{}], Loss: {:.4f}'.format(epochIdx + 1, epochs, valLoss)
        print(report)

fit(20, model, F.cross_entropy, optimizer, trainLoader, valLoader, accuracy)   #train for 20 epochs, printing the validation loss and accuracy after each one

#after 100 epochs it achieved an accuracy of 89.26%. it isn't higher because the model isn't that powerful:
#NOTE(review): the call above trains for 20 epochs; the 100-epoch figure presumably comes from a longer run - confirm
#"The more likely reason that the model just isn't powerful enough. If you remember our initial hypothesis, we have assumed
#that the output (in this case the class probabilities) is a linear function of the input (pixel intensities), obtained by
#performing a matrix multiplication with the weights matrix and adding the bias. This is a fairly weak assumption, as there
#may not actually exist a linear relationship between the pixel intensities in an image and the digit it represents. While
#it works reasonably well for a simple dataset like MNIST (getting us to 85% accuracy), we need more sophisticated models
#that can capture non-linear relationships between image pixels and labels for complex tasks like recognizing everyday
#objects, animals etc."


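# doesn't work yet: unsqueeze() needs a dim argument (e.g. img.unsqueeze(0)), and testDataset was created without transform = transforms.ToTensor(), so img is a raw image rather than a tensor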
# def predictImage(img, model):
#     xb = img.unsqueeze()
#     yb = model(xb)
#     _, preds = torch.max(yb, dim = 1)
#     return preds[0].item()
    
# img, label = testDataset[0]
# plt.imshow(img, cmap = 'gray')
# print('Label:', label, ', Predicted:', predictImage(img, model)) s





# (notebook cell output, not code) TypeError: linear() missing 1 required positional arguments: "weight"