<a href="https://colab.research.google.com/github/JacobAshoo/NNFS/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
TODO:

padding



"""

'\nTODO:\n\npadding\n\n\n\n'

In [2]:
!pip install dill
import numpy as np
import torch
import torchvision.datasets
import dill
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from math import cos, pi
import wandb

torch.manual_seed(42)




<torch._C.Generator at 0x79704e7123b0>

In [3]:
for i in range(torch.cuda.device_count()):
  print(f"Device {i}: {torch.cuda.get_device_name(i)}")

In [269]:

!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mjacob-ashoo[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [270]:
class Relu:
  def __init__(self):
    pass

  def __call__(self, x):
    self.x = x
    return torch.maximum(x, torch.tensor(0.0, device=x.device))

  def backward(self, grad):
    return grad * (self.x > 0).float()



class Softmax:
  def __init__(self, dim=-1, device=None):
    self.dim = dim
    self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")

  def __call__(self, x):
    x = x.to(self.device)
    x_max = torch.max(x, dim=self.dim, keepdim=True)[0]
    exp_x = torch.exp(x - x_max)
    sum_exp_x = torch.sum(exp_x, dim=self.dim, keepdim=True)
    self.softmax_output = exp_x / sum_exp_x
    return self.softmax_output

  def backward(self, grad_output):
    batch_size, num_classes = self.softmax_output.shape
    eye = torch.eye(num_classes, device=self.device).unsqueeze(0)
    softmax_diag = self.softmax_output.unsqueeze(2) * eye
    softmax_outer = torch.matmul(self.softmax_output.unsqueeze(2), self.softmax_output.unsqueeze(1))
    jacobian = softmax_diag - softmax_outer
    grad_input = torch.matmul(jacobian, grad_output.unsqueeze(2)).squeeze(2)
    return grad_input


In [271]:
class CrossEntropy:
  def __init__(self, device=None):
    self.device = device
    self.l2_reg = False
    self.l = 0.01
    self.weights = None
    self.softmax = Softmax(dim=-1, device=self.device)

  def __call__(self, logits, y, l2_reg=False, l=0.01, weights=None):
    self.l2_reg = l2_reg
    self.l = l
    self.weights = weights if weights is not None else []

    probs = self.softmax(logits)
    self.probs = torch.clamp(probs, min=1e-8)

    loss = -torch.mean(torch.sum(y * torch.log(self.probs), dim=1))

    if self.l2_reg and self.weights:
      loss += (self.l / 2) * sum(torch.sum(w ** 2) for w in self.weights)

    return loss

  def backward(self, logits, y):
    batch_size = logits.shape[0]
    grad = (self.probs - y) / batch_size

    if self.l2_reg and self.weights:
      for w in self.weights:
        w.grad = self.l * w

    return grad

In [272]:
class Linear:
  def __init__(self, input_size, output_size, activation, device=None, dropout=1.0):
    self.device = device
    self.input_size = input_size
    self.output_size = output_size
    self.w = torch.randn(output_size, input_size, device=self.device) * .1
    self.b = torch.zeros(output_size, device=self.device)
    self.activation = activation
    self.dropout = dropout

  def __call__(self, x, training=True):
    self.x = x.to(self.device)
    self.z = torch.matmul(self.x, self.w.T) + self.b
    self.a = self.activation(self.z)


    if training and self.dropout < 1.0:
      mask = (torch.rand(self.a.shape, device=self.device) < self.dropout).float()
      self.a *= mask
      self.a /= self.dropout

    return self.a

  def backward(self, grad):
    if type(self.activation) == Softmax:
      grad = self.activation.backward(grad)
    dw = torch.matmul(grad.T, self.x)
    db = torch.sum(grad, dim=0)
    dx = torch.matmul(grad, self.w)
    return dx, dw, db

In [274]:
class Conv2D:
  def __init__(self, in_channels, out_channels, kernel_size, activation, stride=1, padding=0, device=None):
    self.device = device
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = kernel_size
    self.activation = activation
    self.stride = stride
    self.padding = padding
    self.w = torch.randn(out_channels, in_channels, kernel_size, kernel_size, device=self.device) * 0.1
    self.b = torch.zeros(out_channels, device=self.device)
    self.original_shape = None

  def __call__(self, x):
    return self.forward(x)

  def forward(self, x):
    self.original_shape = x.shape
    self.x = x.to(self.device)
    batch_size, in_channels, in_height, in_width = x.shape

    padded_x = F.pad(self.x, (self.padding, self.padding, self.padding, self.padding))

    out_height = (in_height + 2 * self.padding - self.kernel_size) // self.stride + 1
    out_width = (in_width + 2 * self.padding - self.kernel_size) // self.stride + 1

    x_unfolded = F.unfold(padded_x, kernel_size=self.kernel_size, stride=self.stride).to(self.device)
    w_reshaped = self.w.view(self.out_channels, -1)

    out = torch.matmul(w_reshaped, x_unfolded) + self.b.view(-1, 1)
    out = out.view(batch_size, self.out_channels, out_height, out_width)
    self.a = self.activation(out)
    return self.a

  def backward(self, grad):
    if grad.dim() == 2:
      batch_size = self.original_shape[0]
      out_height = (self.original_shape[2] + 2 * self.padding - self.kernel_size) // self.stride + 1
      out_width = (self.original_shape[3] + 2 * self.padding - self.kernel_size) // self.stride + 1
      grad = grad.reshape(batch_size, self.out_channels, out_height, out_width)

    batch_size, _, out_height, out_width = grad.shape
    grad_flattened = grad.reshape(batch_size, self.out_channels, -1)
    padded_x = F.pad(self.x, (self.padding, self.padding, self.padding, self.padding))
    x_unfolded = F.unfold(padded_x, kernel_size=self.kernel_size, stride=self.stride)

    dw = torch.matmul(grad_flattened, x_unfolded.transpose(1, 2))
    dw = dw.sum(dim=0).view(self.w.shape)
    db = grad_flattened.sum(dim=(0, 2))

    w_reshaped = self.w.view(self.out_channels, -1)
    dx_unfolded = torch.matmul(w_reshaped.T, grad_flattened)
    dx = F.fold(dx_unfolded, output_size=(self.original_shape[2] + 2 * self.padding, self.original_shape[3] + 2 * self.padding), kernel_size=self.kernel_size, stride=self.stride)

    if self.padding > 0:
      dx = dx[:, :, self.padding:-self.padding, self.padding:-self.padding]

    return dx, dw, db


In [275]:
class GradientDescent:
  def __init__(self, learning_rate, beta):
    self.lr = learning_rate
    self.beta1 = beta

  def optimize(self, dw, db, layer):
    if not hasattr(self, 'vw') or self.vw.shape != dw.shape:
        self.vw = torch.zeros_like(dw)
        self.vb = torch.zeros_like(db)

    self.vw = (self.beta1 * self.vw) + ((1 - self.beta1) * dw)
    self.vb = (self.beta1 * self.vb) + ((1 - self.beta1) * db)

    layer.w -= self.lr * self.vw
    layer.b -= self.lr * self.vb

In [276]:
class Adam:
  def __init__(self, learning_rate, beta1, beta2):
    self.lr = learning_rate
    self.beta1 = beta1
    self.beta2 = beta2
    self.t = 1

  def optimize(self, dw, db, layer):
    if not hasattr(self, 'vw') or self.vw.shape != dw.shape:
        self.vw = torch.zeros_like(dw)
        self.vb = torch.zeros_like(db)
        self.sw = torch.zeros_like(dw)
        self.sb = torch.zeros_like(db)

    self.vw = (self.beta1 * self.vw) + ((1 - self.beta1) * dw)
    self.vb = (self.beta1 * self.vb) + ((1 - self.beta1) * db)
    self.sw = (self.beta2 * self.sw) + ((1 - self.beta2) * dw * dw)
    self.sb = (self.beta2 * self.sb) + ((1 - self.beta2) * db * db)

    self.vw = self.vw/(1-pow(self.beta1,self.t))
    self.vb = self.vb/(1-pow(self.beta1,self.t))
    self.sw = self.sw/(1-pow(self.beta2,self.t))
    self.sb = self.sb/(1-pow(self.beta2,self.t))


    layer.w -= self.lr * (self.vw/(torch.sqrt(self.sw) + 1e-3))
    layer.b -= self.lr * (self.vb/(torch.sqrt(self.sb) + 1e-3))

    self.t += 1


In [277]:
def step_decay(epoch, initial_lr=1e-3):
    if epoch < 3:
        return initial_lr * min(1.0, float(epoch + 1) / 5)
    elif epoch < 10:
        return initial_lr
    elif epoch < 15:
        return initial_lr * 0.5
    else:
        return initial_lr * 0.01

def cosine_decay(epoch, num_epochs, initial_lr):
  return .5 *initial_lr * (1 + cos((epoch * pi)/num_epochs))

def warmup(epoch, lr):
  if epoch < 5:
    return lr * (epoch + 1) / 5
  else:
    return lr

In [278]:
class FCNN:
  def __init__(self, device=None):
    self.l1 = Conv2D(3, 32, 3, Relu(), stride=3, device=device, padding=1)
    self.l2 = Conv2D(32, 64, 3, Relu(), stride=3, device=device, padding=1)
    self.l3 = Linear(1024, 256, Relu(), device=device)
    self.l4 = Linear(256, 10, lambda x:x, device=device)

    self.layers = [self.l1, self.l2, self.l3, self.l4]


  def forward(self, x):
    x = self.l1(x)
    x = self.l2(x)
    x = torch.flatten(x, 1)
    x = self.l3(x)
    x = self.l4(x)

    return x

  def __call__(self, x):
    return self.forward(x)






In [279]:
class NN:
  def __init__(self, conv_layers, fc_layers):
    self.layers = []
    for layer in conv_layers:
      #[in, out, kernal_size, stride, padding]
      self.layers.append(Conv2D(layer[0], layer[1], layer[2], Relu(), stride=layer[3], padding=layer[4]))
    for layer in fc_layers:
      #[input_size, output_size, dropout]
      self.layers.append(Linear(layer[0], layer[1], Relu(), dropout=layer[2]))
    self.layers[-1].activation = lambda x:x

  def forward(self, x):
    for layer in self.layers:
      if(type(layer)==Linear and x.dim()>2):
        x = torch.flatten(x, 1)
      x = layer(x)

    return x

  def __call__(self, x):
    return self.forward(x)

In [280]:
def save_model(model, name):
  with open(f"{name}.dill", "wb") as f:
    dill.dump(model, f)



In [281]:
def import_data():
  data = torchvision.datasets.CIFAR10(root="/content",download=True)

  xs = []
  labels = []
  for i in range(1,6):
    with open(f"/content/cifar-10-batches-py/data_batch_{i}", 'rb') as f:
      dict = pickle.load(f, encoding='bytes')
      xs.append(dict[b'data'])
      labels += dict[b'labels']

  labels = np.array(labels).reshape(-1,1)
  encoder = OneHotEncoder(sparse_output=False)
  y = encoder.fit_transform(labels)

  xs = np.array(xs)
  x = xs.reshape(50000, 3, 32, 32) / 255

  return x, y


def load_data(x, y, device=None):
  X_train, X_test, y_train, y_test = train_test_split(
      x, y, test_size=0.166, random_state=42)
  X_train = torch.tensor(X_train, dtype=torch.float32, device=device)
  y_train = torch.tensor(y_train, dtype=torch.float32, device=device)
  X_test = torch.tensor(X_test, dtype=torch.float32, device=device)
  y_test = torch.tensor(y_test, dtype=torch.float32, device=device)

  train_dataset = TensorDataset(X_train, y_train)
  test_dataset = TensorDataset(X_test, y_test)

  batch_size = 32
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

  return train_loader, test_loader

In [282]:
#@title train


def train(model, epochs, lr, loss_function, optimizer, name, train_loader, test_loader, decay_algo=None, use_wandb=True, save=True, device=None):
  softmax = Softmax(device=device)
  if use_wandb:
    wandb.init(project="ConvNet", entity="jacob-ashoo", name=name, config={
        "name" : name,
        "learning_rate":lr,
        "epochs":epochs,
        "optimizer" : type(optimizer).__name__,
        "beta1" : optimizer.beta1,
        "beta2" : optimizer.beta2 if type(optimizer)==Adam else 0,
        "l2_reg" : "l2" in name.lower(),
        "l" : loss_function.l if "l2" in name.lower() else 0,
        "dropout" : "dropout" in name.lower(),
        "decay_algo" : decay_algo
    })
  for epoch in range(epochs):
    losses = []
    n_total = 0
    n_correct = 0

    if decay_algo == "cosine":
      optimizer.lr = cosine_decay(epoch, epochs, lr)
    if decay_algo == "step":
      optimizer.lr = step_decay(epoch, lr)
    if(decay_algo=="warmup"):
      optimizer.lr = warmup(epoch, lr)

    for (iteration, (x,y)) in enumerate(train_loader):
      ypred = model(x)
      loss = loss_function(ypred, y)
      losses.append(loss)

      n_total += y.size(dim=0)
      ypred = softmax(ypred)
      guesses = torch.argmax(ypred, dim=1)
      truths = torch.argmax(y, dim=1)

      for i in range(len(guesses)):
        if(guesses[i]==truths[i]):
          n_correct += 1


      # Backpropagation
      grad = loss_function.backward(ypred, y)
      dx = grad

      for layer in reversed(model.layers):
        dx, dw, db = layer.backward(dx)
        optimizer.optimize(dw, db, layer)

    loss = sum(losses)/len(losses)
    accuracy = n_correct/n_total

    print(f"epoch {epoch} loss: {loss} accuracy: {accuracy}")
    if use_wandb:
      wandb.log({"train_loss":loss, "accuracy":accuracy})

  losses = []
  n_total = 0
  n_correct = 0
  for (iteration, (x,y)) in enumerate(test_loader):
      ypred = model(x)
      loss = loss_function(ypred, y)
      losses.append(loss)

      n_total += batch_size
      ypred = softmax(ypred)
      guesses = torch.argmax(ypred, dim=1)
      truths = torch.argmax(y, dim=1)

      for i in range(len(guesses)):
        if(guesses[i]==truths[i]):
          n_correct += 1

  loss = sum(losses)/len(losses)
  accuracy = n_correct/n_total
  if save:
    save_model(model, name)
  if use_wandb:
    wandb.log({"test_loss":loss, "test_accuracy":accuracy})
    wandb.save(f"{name}.dill")
    wandb.finish()



In [283]:
#Ablation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using {device}")

x, y = import_data()
train_loader, test_loader = load_data(x, y, device=device)


gd_lr = 5e-3
adam_lr = 1e-5
epochs = 30



model = FCNN(device=device)
train(model, epochs, gd_lr, CrossEntropy(device=device), Adam(adam_lr, .9, .999), "Adam_test", train_loader, test_loader, decay_algo="step", use_wandb=False, save=False, device=device)










Using cpu
Files already downloaded and verified
epoch 0 loss: 1.8685208559036255 accuracy: 0.33700239808153476
epoch 1 loss: 2.017793655395508 accuracy: 0.32016786570743405


KeyboardInterrupt: 