In [1]:
import math
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
class Tensor:

  def __init__(self, value, label='', children=(), operator=None):
    self.value = value
    self.children = set(children)
    self.operator = operator
    self.grad = 0
    self._backward = lambda  : None
    self.label = label

  def __repr__(self) -> str:
    return f"Tensor(value={self.value})"

  def __mul__(self, other):
    other = other if isinstance(other, Tensor) else Tensor(other)
    
    out = Tensor(self.value*other.value, children= (self, other), operator='*')
    
    def backward():
      self.grad += other.value * out.grad
      other.grad += self.value * out.grad
    
    out._backward = backward
    
    return out

  def __add__(self, other):
    other = other if isinstance(other, Tensor) else Tensor(other)
    
    out = Tensor(self.value+other.value, children= (self, other), operator='+')
    
    def backward():
      self.grad += 1 * out.grad
      other.grad += 1 * out.grad
    
    out._backward = backward
    
    return out
  def __pow__(self, other):

    input_value = self.value
    output_value = pow(input_value, other)

    out = Tensor(output_value, children=(self,), operator='pow')

    def backward():
      self.grad = other * pow(self.value, other-1) * out.grad

    out._backward = backward

    return out
  
  def __sub__(self, other):
    return self + (-other)
  
  def __truediv__(self, other):
    other = other if isinstance(other, Tensor) else Tensor(other)
    if other.value == 0:
        raise ValueError("Division by zero")
    out = Tensor(self.value / other.value, children=(self, other), operator='/')

    def backward():
        self.grad += (1 / other.value) * out.grad
        other.grad += (-self.value / (other.value ** 2)) * out.grad

    out._backward = backward
    return out

  def backward(self):

    topo_sort = []
    visited = set()

    def build_topo(v):
      if v not in visited:
        visited.add(v)
        for child in v.children:
          build_topo(child)
        topo_sort.append(v)
    
    build_topo(self)

    self.grad = 1
    for v in reversed(topo_sort):
      v._backward()


  def __radd__(self, other):
    return self + other

In [3]:
class Neuron:

  def __init__(self, input_size):
    self.weights = [Tensor(1) for i in range(input_size)]
    self.bias = Tensor(1)

  def forward(self, x):
    res = sum([w_i * x_i for w_i, x_i in zip(self.weights, x)]) + self.bias
    res.value = 1 / (1 + math.exp(res.value))
    return res

  def __call__(self, x):
    return self.forward(x)

  def parameters(self) -> list[Tensor]:
    return self.weights + [self.bias]  

In [4]:
class Layer:

  def __init__(self, input_size, output_size):
    self.neurons = [Neuron(input_size) for _ in range(output_size)]

  def forward(self, x):
    out = [neuron(x) for neuron in self.neurons]
    return out[0] if len(out)==1 else out

  def __call__(self, x):
    return self.forward(x)
  
  def parameters(self) -> list[Tensor]:
    return[param for neuron in self.neurons for param in neuron.parameters()]

In [5]:
class MLP:

  def __init__(self, input_size, layer_sizes):
    layers_total = [input_size] + layer_sizes
    self.layers = [Layer(layers_total[i], layers_total[i+1]) for i in range(len(layer_sizes))]

  def forward(self, x):
    for layer in self.layers:
      x = layer(x)
    return x

  def __call__(self, x):
    return self.forward(x)

  def parameters(self) -> list[Tensor]:
    return[param for layer in self.layers for param in layer.parameters()]

In [6]:
input_size = 3
layer_sizes = [2, 3, 1]
model = MLP(input_size, layer_sizes)

In [7]:
def criterion(y_hat: list[Tensor], Y:list[int]) -> Tensor:

  return sum([ (y_hat-Y)**2 for y_hat, Y in zip(y_hat, Y)])/len(Y)

In [8]:
X = [[2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5], 
    [0.5, 1.0, 1.0], 
    [1.0, 1.0, -1.0]]
    
Y = [1.0, -1.0, -1.0, 1.0]


In [9]:
class Optimizer:
  def __init__(self, parameters: list[Tensor], lr: float):
    self.parameters = parameters
    self.lr = lr

  def zero_grad(self):
    for parameter in self.parameters:
      parameter.grad = 0

  def step(self):
    for parameter in self.parameters:
      parameter.value += self.lr * parameter.grad
    

In [10]:
optim = Optimizer(model.parameters(), 0.01)

In [11]:
n_epochs = 20

for i in range(n_epochs):
  y_hats = [model(x) for x in X]
  loss = criterion(y_hats, Y)

  optim.zero_grad()
  
  loss.backward()
  
  optim.step()
  print(f'epoch: {i + 1}, loss: {loss}')

epoch: 1, loss: Tensor(value=1.017091683001651)
epoch: 2, loss: Tensor(value=1.0161398719826054)
epoch: 3, loss: Tensor(value=1.0152021909648021)
epoch: 4, loss: Tensor(value=1.0142710818241834)
epoch: 5, loss: Tensor(value=1.0133395674449681)
epoch: 6, loss: Tensor(value=1.0124011773632082)
epoch: 7, loss: Tensor(value=1.0114498727156216)
epoch: 8, loss: Tensor(value=1.0104799729402296)
epoch: 9, loss: Tensor(value=1.009486086326592)
epoch: 10, loss: Tensor(value=1.008463046518633)
epoch: 11, loss: Tensor(value=1.0074058574026938)
epoch: 12, loss: Tensor(value=1.0063096494381938)
epoch: 13, loss: Tensor(value=1.005169651367922)
epoch: 14, loss: Tensor(value=1.0039811823125226)
epoch: 15, loss: Tensor(value=1.0027396703947955)
epoch: 16, loss: Tensor(value=1.0014407050673357)
epoch: 17, loss: Tensor(value=1.0000801309471685)
epoch: 18, loss: Tensor(value=0.9986541907915023)
epoch: 19, loss: Tensor(value=0.9971597237608154)
epoch: 20, loss: Tensor(value=0.9955944217155644)
