<a href="https://colab.research.google.com/github/JacobAshoo/NNFS/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [120]:
import numpy as np
import torch
import torchvision
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [121]:
class Relu:
  def __init__(self):
    pass

  def __call__(self, x):
    self.x = x
    return torch.maximum(x, torch.tensor(0.0, device=x.device))

  def backward(self, grad):
    return grad * (self.x > 0).float()



class Softmax:
  def __init__(self, dim=-1, device=None):
    self.dim = dim
    self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")

  def __call__(self, x):
    x = x.to(self.device)
    x_max = torch.max(x, dim=self.dim, keepdim=True)[0]
    exp_x = torch.exp(x - x_max)
    sum_exp_x = torch.sum(exp_x, dim=self.dim, keepdim=True)
    self.softmax_output = exp_x / sum_exp_x
    return self.softmax_output

  def backward(self, grad_output):
    batch_size, num_classes = self.softmax_output.shape
    eye = torch.eye(num_classes, device=self.device).unsqueeze(0)
    softmax_diag = self.softmax_output.unsqueeze(2) * eye
    softmax_outer = torch.matmul(self.softmax_output.unsqueeze(2), self.softmax_output.unsqueeze(1))
    jacobian = softmax_diag - softmax_outer
    grad_input = torch.matmul(jacobian, grad_output.unsqueeze(2)).squeeze(2)
    return grad_input



In [122]:
class CrossEntropy:
  def __init__(self):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  def __call__(self, ypred, y, l2_reg=False, l=0.01, weights=[]):
    self.l2_reg = l2_reg
    self.l = l
    self.weights = weights
    ypred = torch.clamp(ypred, min=1e-10).to(self.device)
    y = y.to(self.device)
    loss = -torch.mean(torch.sum(y * torch.log(ypred), dim=1))
    if l2_reg and weights:
      loss += (l / 2) * sum(torch.sum(w ** 2) for w in weights)
    return loss

  def backward(self, ypred, y):
    ypred = ypred.to(self.device)
    y = y.to(self.device)
    grad_output = -y / ypred
    if self.l2_reg and self.weights:
      for w in self.weights:
        grad_output += self.l * w
    return grad_output


In [123]:
class Linear:
  def __init__(self, input_size, output_size, activation, device=None, dropout=1.0):
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if device is None else device
    self.input_size = input_size
    self.output_size = output_size
    self.w = torch.randn(output_size, input_size, device=self.device, requires_grad=True) * 0.01
    self.b = torch.zeros(output_size, device=self.device, requires_grad=True)
    self.activation = activation
    self.dropout = dropout

  def __call__(self, x, training=True):
    self.x = x.to(self.device)
    self.z = torch.matmul(self.x, self.w.T) + self.b
    self.a = self.activation(self.z)


    if training and self.dropout < 1.0:
      mask = (torch.rand(self.a.shape, device=self.device) < self.dropout).float()
      self.a *= mask
      self.a /= self.dropout

    return self.a

  def backward(self, grad):
    grad = self.activation.backward(grad)
    dw = torch.matmul(grad.T, self.x)
    db = torch.sum(grad, dim=0)
    dx = torch.matmul(grad, self.w)
    return dx, dw, db

In [124]:
class GradientDescent:
  def __init__(self, learning_rate, beta):
    self.lr = learning_rate
    self.beta = beta

  def optimize(self, dw, db, layer):
    if not hasattr(self, 'vw') or self.vw.shape != dw.shape:
        self.vw = torch.zeros_like(dw)
        self.vb = torch.zeros_like(db)

    self.vw = (self.beta * self.vw) + ((1 - self.beta) * dw)
    self.vb = (self.beta * self.vb) + ((1 - self.beta) * db)

    with torch.no_grad():
      layer.w -= self.lr * self.vw
      layer.b -= self.lr * self.vb

In [125]:
class Adam:
  def __init__(self, learning_rate, beta1, beta2):
    self.lr = learning_rate
    self.beta1 = beta1
    self.beta2 = beta2

  def optimize(self, dw, db, layer, t):
    if not hasattr(self, 'vw') or self.vw.shape != dw.shape:
        self.vw = torch.zeros_like(dw)
        self.vb = torch.zeros_like(db)
        self.sw = torch.zeros_like(dw)
        self.sb = torch.zeros_like(db)

    self.vw = (self.beta1 * self.vw) + ((1 - self.beta1) * dw)
    self.vb = (self.beta1 * self.vb) + ((1 - self.beta1) * db)
    self.sw = (self.beta2 * self.sw) + ((1 - self.beta2) * dw * dw)
    self.sb = (self.beta2 * self.sb) + ((1 - self.beta2) * db * db)

    self.vw = self.vw/(1-pow(self.beta1,t+1))
    self.vb = self.vb/(1-pow(self.beta1,t+1))
    self.sw = self.sw/(1-pow(self.beta2,t+1))
    self.sb = self.sb/(1-pow(self.beta2,t+1))

    with torch.no_grad():
      layer.w -= self.lr * (self.vw/(torch.sqrt(self.sw) + 1e-8))
      layer.b -= self.lr * (self.vb/(torch.sqrt(self.sb) + 1e-8))

In [126]:
class FCNN:
  def __init__(self, dropout=False):
    self.l1 = Linear(3072, 1024, Relu())
    self.l2 = Linear(1024, 512, Relu())
    self.l3 = Linear(512, 64, Relu())
    self.l4 = Linear(64, 10, Softmax())
    self.layers = [self.l1, self.l2, self.l3, self.l4]


  def forward(self, x):
    x = self.l1(x)
    x = self.l2(x)
    x = self.l3(x)
    x = self.l4(x)
    return x

  def __call__(self, x):
    return self.forward(x)




In [127]:
data = torchvision.datasets.CIFAR10(root="/content",download=True)

xs = []
labels = []

for i in range(1,6):
  with open(f"/content/cifar-10-batches-py/data_batch_{i}", 'rb') as f:
    dict = pickle.load(f, encoding='bytes')
    xs.append(dict[b'data'])
    labels += dict[b'labels']


labels = np.array(labels).reshape(-1,1)
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(labels)

xs = np.array(xs)
x = xs.reshape(50000, 3072) / 255

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.166, random_state=42)
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)

train_dataset = TensorDataset(X_train, y_train)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)




Files already downloaded and verified


In [128]:
def get_lr(epoch, initial_lr=1e-3):
    if epoch < 5:
        return initial_lr * min(1.0, float(epoch + 1) / 5)
    elif epoch < 30:
        return initial_lr
    elif epoch < 40:
        return initial_lr * 0.1
    else:
        return initial_lr * 0.01

In [129]:
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using {device}")


model = FCNN()
lr = 2e-5


loss_function = CrossEntropy()
optimizer = Adam(learning_rate=lr, beta1=.9, beta2=.99)
#optimizer = GradientDescent(learning_rate=1e-3, beta=.9)



epochs = 50

for epoch in range(epochs):
  losses = []
  n_total = 0
  n_correct = 0

  optimizer.lr = get_lr(epoch, lr)

  for (iteration, (x,y)) in enumerate(train_loader):
    ypred = model(x)
    loss = loss_function(ypred, y)
    losses.append(loss)

    n_total += batch_size
    guesses = torch.argmax(ypred, dim=1)
    truths = torch.argmax(y, dim=1)

    for i in range(len(guesses)):
      if(guesses[i]==truths[i]):
        n_correct += 1


    # Backpropagation
    grad = loss_function.backward(ypred, y)
    dx = grad

    for layer in reversed(model.layers):
      dx, dw, db = layer.backward(dx)
      optimizer.optimize(dw, db, layer, iteration)

  print(f"epoch {epoch} loss: {sum(losses)/len(losses)} accuracy: {n_correct/n_total}")


Using cuda
epoch 0 loss: 2.2982895374298096 accuracy: 0.18570264570552147
epoch 1 loss: 2.204258441925049 accuracy: 0.1875
epoch 2 loss: 2.0863397121429443 accuracy: 0.19037576687116564
epoch 3 loss: 2.0453364849090576 accuracy: 0.22136215490797545
epoch 4 loss: 1.9854316711425781 accuracy: 0.26754217791411045
epoch 5 loss: 1.9249241352081299 accuracy: 0.29426284509202455
epoch 6 loss: 1.8925057649612427 accuracy: 0.31120590490797545
epoch 7 loss: 1.8689824342727661 accuracy: 0.31731690950920244
epoch 8 loss: 1.8494713306427002 accuracy: 0.3252971625766871
epoch 9 loss: 1.8294126987457275 accuracy: 0.33217503834355827
epoch 10 loss: 1.8093458414077759 accuracy: 0.34176092791411045
epoch 11 loss: 1.7902758121490479 accuracy: 0.3482553680981595
epoch 12 loss: 1.7712310552597046 accuracy: 0.35611579754601225
epoch 13 loss: 1.753353476524353 accuracy: 0.3631134969325153
epoch 14 loss: 1.7387148141860962 accuracy: 0.368888995398773
epoch 15 loss: 1.7233991622924805 accuracy: 0.3739455521472

KeyboardInterrupt: 