### Lesson 1: The spelled-out intro to neural networks and backpropagation: building micrograd

[Youtube video lecture](https://www.youtube.com/watch?v=VMj-3S1tku0)  
[micrograd repo](https://github.com/karpathy/micrograd)  
[Jupyter notebook files](https://github.com/karpathy/nn-zero-to-hero/tree/master/lectures/micrograd)

Video upload date: Aug 17, 2022 (length: 2 hr 25 min)  
Watched on: Nov 12 and 13, 2025  
Reproduced on: Nov 17, 2025  

Content:
- Backpropagation

In [140]:
import math

In [141]:
class Value:
    """A scalar that remembers how it was computed so gradients can be backpropagated.

    Every arithmetic operation returns a new ``Value`` whose ``_prev`` holds its
    operands and whose ``_backward`` closure applies the local chain rule,
    accumulating (``+=``) into the operands' ``grad``.

    Attributes:
        data: the wrapped number.
        grad: accumulated gradient, starts at 0.0 and is only ever added to;
            reset it yourself before re-running ``backward()`` on reused nodes.
        label: optional human-readable name.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        # Leaf nodes have nothing to propagate, hence the no-op default.
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # z = x + y  ->  dz/dx = 1, dz/dy = 1
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward
        return out

    def __radd__(self, other):
        """Return ``other + self`` (addition commutes)."""
        return self + other

    def __neg__(self):
        """Return ``-self``.

        Unary minus takes no operand besides ``self``; the previous extra
        parameter made ``-v`` (and therefore ``Value - Value``) raise TypeError.
        """
        return self * (-1)

    def __sub__(self, other):
        """Return ``self - other``, expressed as ``self + (-other)``."""
        return self + (-other)

    def __rsub__(self, other):
        """Return ``other - self``.

        Subtraction does not commute, so the reflected form must negate
        ``self`` rather than ``other``.
        """
        return other + (-self)

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a Value."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # z = x * y  ->  dz/dx = y, dz/dy = x
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):
        """Return ``other * self`` (multiplication commutes)."""
        return self * other

    def tanh(self):
        """Return ``tanh(self)`` as a new Value."""
        try:
            e_pos = math.exp(self.data)
            e_neg = math.exp(-self.data)
            out_data = (e_pos - e_neg) / (e_pos + e_neg)
        except OverflowError:
            # math.exp overflows for |x| above roughly 709; tanh has long since
            # saturated to +/-1 in double precision by then.
            out_data = math.copysign(1.0, self.data)
        out = Value(out_data, (self,), 'tanh')

        def _backward():
            # z = tanh(x)  ->  dz/dx = 1 - tanh(x)^2
            self.grad += (1 - out_data ** 2) * out.grad
        out._backward = _backward
        return out

    def __pow__(self, power):
        """Return ``self ** power``.

        ``power`` is used as a plain number: no gradient is tracked through
        the exponent.
        """
        out = Value(self.data ** power, (self, ), f'**{power}')

        def _backward():
            # z = x ** a  ->  dz/dx = a * x ** (a - 1)
            self.grad += power * (self.data ** (power - 1)) * out.grad
        out._backward = _backward
        return out

    def backward(self):
        """Backpropagate from this node, accumulating into every ancestor's ``grad``.

        Sets ``self.grad`` to 1.0, then runs each node's ``_backward`` in reverse
        topological order so a node's gradient is complete before it is pushed
        on to its operands.
        """
        # Iterative post-order DFS: a recursive walk exceeds Python's recursion
        # limit on long chains (e.g. a sum over a few thousand terms).
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # Every operand of `node` has been emitted already.
                topo.append(node)
                stack.pop()

        self.grad = 1.0

        for node in reversed(topo):
            node._backward()

In [142]:
a = Value(2.0, label='a')
# a - 3: a Value minus a plain number goes through Value.__sub__, i.e. a + (-3)
a - 3

Value(data=-1.0)

In [143]:
# Leaf nodes of the expression L = (a * b + c) * f
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
f = Value(-2.0, label='f')

# Intermediate nodes, labelled as they are created
e = a * b
e.label = 'e'
d = e + c
d.label = 'd'
L = d * f
L.label = 'L'

print("L =", L.data)
L.backward()

# Report the gradient that backprop left on each leaf
for text, node in (("a.grad =", a), ("b.grad =", b), ("c.grad =", c), ("f.grad =", f)):
    print(text, node.grad)

L = -8.0
a.grad = 6.0
b.grad = -4.0
c.grad = -2.0
f.grad = 4.0


In [144]:
def forward(a_val, b_val, c_val, f_val):
    """Wrap the four numbers in fresh Values and return the node (a * b + c) * f."""
    a, b, c, f = (Value(raw) for raw in (a_val, b_val, c_val, f_val))
    product = a * b
    return (product + c) * f

In [145]:
# Finite-difference estimate of dL/db: nudge b by h and see how much L moves.
h = 1e-4
L1 = forward(2.0, -3.0, 10.0, -2.0).data  # baseline
L2 = forward(2.0, -3.0 + h, 10.0, -2.0).data  # same inputs with b shifted by h
numeric_grad_b = (L2 - L1) / h
numeric_grad_b

-4.000000000008441

In [146]:
# Same expression as forward(), built inline so the leaf `b` stays reachable
# and its backprop gradient can be read off afterwards.
a, b, c, f = Value(2.0), Value(-3.0), Value(10.0), Value(-2.0)
L = ((a * b) + c) * f
L.backward()
b.grad

-4.0

In [147]:
# Two inputs, two weights and a bias for a hand-built neuron
x1, x2 = Value(2.0, label='x1'), Value(0.0, label='x2')
w1, w2 = Value(-3.0, label='w1'), Value(1.0, label='w2')
b = Value(6.8813735870195432, label='b')

# Pre-activation n = x1*w1 + x2*w2 + b, labelled step by step
x1w1 = x1 * w1
x1w1.label = 'x1*w1'
x2w2 = x2 * w2
x2w2.label = 'x2*w2'
s = x1w1 + x2w2
s.label = 'x1*w1 + x2*w2'
n = s + b
n.label = 'n'

# Squash through tanh to get the neuron output
o = n.tanh()
o.label = 'o'
print(o.data)

0.7071067811865477


In [148]:
# Backpropagate from the neuron output, then list each leaf's gradient
o.backward()

for text, node in (
    ("x1.grad =", x1),
    ("x2.grad =", x2),
    ("w1.grad =", w1),
    ("w2.grad =", w2),
    ("b.grad  =", b),
):
    print(text, node.grad)

x1.grad = -1.4999999999999993
x2.grad = 0.4999999999999998
w1.grad = 0.9999999999999996
w2.grad = 0.0
b.grad  = 0.4999999999999998


In [149]:
# build a Neuron using Value
import random 

class Neuron:
    '''Mathematically: out = tanh(w1*x1 + w2*x2 + ... + b); tanh is skipped when nonlin is False.'''

    def __init__(self, nin, nonlin=True):
        # nin = number of inputs; one weight per input, each drawn from [-1, 1]
        self.w = []
        for i in range(nin):
            self.w.append(Value(random.uniform(-1, 1), label=f'w{i}'))
        self.b = Value(random.uniform(-1, 1), label='b')
        self.nonlin = nonlin

    def __call__(self, x):
        # x holds one entry per weight; start from the bias and add each w_i * x_i
        act = self.b
        for wi, xi in zip(self.w, x):
            act = act + wi * xi
        if self.nonlin:
            return act.tanh()
        return act

    def parameters(self):
        # weights first, bias last
        return [*self.w, self.b]

In [150]:
# Layer: stack of neurons
class Layer:
    """A group of `nout` neurons that all receive the same `nin` inputs."""

    def __init__(self, nin, nout, **kwargs):
        # extra keyword arguments are handed to every Neuron unchanged
        self.neurons = []
        for _ in range(nout):
            self.neurons.append(Neuron(nin, **kwargs))

    def __call__(self, x):
        # feed the same input to each neuron; a lone output is returned bare
        outs = [neuron(x) for neuron in self.neurons]
        if len(outs) == 1:
            return outs[0]
        return outs

    def parameters(self):
        # concatenate the neurons' parameter lists in neuron order
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params

In [151]:
# MLP: stack of layers
class MLP:
    """Multi-layer perceptron: layers applied one after another."""

    def __init__(self, nin, nouts):
        # nouts lists the layer widths, e.g. [4, 4, 1]
        widths = [nin] + nouts
        last = len(nouts) - 1
        # every layer uses tanh except the final one, which stays linear
        self.layers = [
            Layer(widths[i], widths[i + 1], nonlin=(i != last))
            for i in range(len(nouts))
        ]

    def __call__(self, x):
        # x: list of inputs of length nin
        for layer in self.layers:
            out = layer(x)
            # a one-neuron layer returns a bare Value; re-wrap it for the next layer
            x = [out] if isinstance(out, Value) else out
        # a single final output is handed back unwrapped
        if len(x) == 1:
            return x[0]
        return x

    def parameters(self):
        # concatenate the layers' parameter lists in layer order
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params

In [152]:
# toy dataset: 4 samples, 3 features each (xs[i] is paired with ys[i])
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0]  # target labels, one per sample

In [153]:
# 3 inputs -> 4 -> 4 -> 1
mlp = MLP(nin=3, nouts=[4, 4, 1])
# how many trainable parameters the network holds
len(mlp.parameters())

41

In [154]:
# One forward pass over the dataset with the untrained network:
# wrap each sample's features in Values and collect the predictions.
y_preds = [mlp([Value(v) for v in sample]) for sample in xs]

for idx, pair in enumerate(zip(y_preds, ys)):
    pred, target = pair
    print(idx, "pred:", pred.data, "target:", target)

0 pred: 0.661265132870346 target: 1.0
1 pred: 0.42390507279389184 target: -1.0
2 pred: 0.36570337269176856 target: -1.0
3 pred: 0.47450992689917526 target: 1.0


In [155]:
# Mean squared error (MSE): loss = (1/N) * sum((pred_i - y_i)^2)
squared_errors = [(pred - target) ** 2 for pred, target in zip(y_preds, ys)]
loss = sum(squared_errors) * (1.0 / len(ys))

print("loss before backward:", loss.data)

# Backprop, step 1: zero the parameter gradients.
# Value accumulates with `self.grad += ...`, so calling .backward() again
# without a reset would add the new gradients on top of the old ones.
# (PyTorch training loops call optimizer.zero_grad() each iteration for the same reason.)
for param in mlp.parameters():
    param.grad = 0.0

# Step 2: backward from the loss
loss.backward()

# Step 3: look at one gradient as a sanity check
first_param = mlp.parameters()[0]
print("Example param grad:", first_param.grad)

loss before backward: 1.070883121411676
Example param grad: -0.06489143265599923


In [156]:
# One gradient-descent step: move each parameter against its gradient
learning_rate = 0.05
for param in mlp.parameters():
    param.data -= learning_rate * param.grad

In [157]:
# Fresh network so training starts from new random weights
mlp = MLP(3, [4, 4, 1])

# toy dataset: 4 samples, 3 features each
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0]  # target labels

learning_rate = 0.1

for step in range(50):  # 50 gradient-descent iterations
    # forward: predictions for every sample, then the MSE loss
    y_preds = [mlp([Value(v) for v in sample]) for sample in xs]
    squared_errors = [(pred - target) ** 2 for pred, target in zip(y_preds, ys)]
    loss = sum(squared_errors) * (1.0 / len(ys))

    # backward: reset parameter gradients, then backprop from the loss
    for param in mlp.parameters():
        param.grad = 0.0
    loss.backward()

    # update: step each parameter against its gradient
    for param in mlp.parameters():
        param.data -= learning_rate * param.grad

    if step % 10 == 0:
        print(f"step {step}: loss = {loss.data}")

step 0: loss = 1.6339807835771145
step 10: loss = 0.6513930830353224
step 20: loss = 0.006127195122047765
step 30: loss = 0.0011804985627266121
step 40: loss = 0.0006520176885363108


In [158]:
# Run the trained network on every sample and show it next to its target
print("Final predictions:")
for sample, target in zip(xs, ys):
    prediction = mlp([Value(v) for v in sample])
    print(sample, "→", prediction.data, "(target:", target, ")")

Final predictions:
[2.0, 3.0, -1.0] → 0.9723864094929136 (target: 1.0 )
[3.0, -1.0, 0.5] → -1.0018202373100626 (target: -1.0 )
[0.5, 1.0, 1.0] → -0.9962792565049707 (target: -1.0 )
[1.0, 1.0, -1.0] → 1.0271177500904705 (target: 1.0 )
