In [None]:
from sklearn.datasets import make_moons
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Toy binary classification data: 100 noisy points forming two interleaving
# half-moons, generated with a fixed seed so the dataset is reproducible.
X, y = make_moons(n_samples=100, noise=0.25, random_state=0)

In [None]:
# One scatter call per class so each class gets its own default colour.
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])

# With sklearn

In [None]:
# Baseline: a multilayer perceptron with scikit-learn's default settings.
model = MLPClassifier()
model = model.fit(X, y)

In [None]:
# One hidden layer of 32 units trained with plain SGD. Setting batch_size to
# the dataset size makes every update use all samples (full-batch gradient
# descent). random_state pins the weight initialisation so the loss curve and
# decision boundary are reproducible, matching the seeded make_moons call.
model = MLPClassifier(
    hidden_layer_sizes=(32,),
    solver="sgd",
    batch_size=len(X),
    learning_rate_init=0.5,
    random_state=0,
).fit(X, y)

In [None]:
# Training loss recorded by scikit-learn at each iteration of fit().
plt.plot(model.loss_curve_)

In [None]:
def visualize_classifier(predict, xmin, xmax, ymin, ymax, *, n_points=100, **kwargs):
    """Draw a filled contour plot of a classifier's output over a 2-D region.

    Parameters
    ----------
    predict : callable
        Receives an array of shape ``(n_points**2, 2)`` holding the (x, y)
        coordinates of every grid point and returns one score per row. The
        result may be a NumPy array or anything ``np.asarray`` can convert,
        with ``n_points**2`` elements in total (e.g. shape ``(N,)`` or
        ``(N, 1)``).
    xmin, xmax, ymin, ymax : float
        Bounds of the rectangular region that is evaluated.
    n_points : int, default 100
        Number of grid points along each axis (keyword-only).
    **kwargs
        Forwarded unchanged to ``plt.contourf`` (e.g. ``cmap``).
    """
    xx, yy = np.meshgrid(
        np.linspace(xmin, xmax, n_points),
        np.linspace(ymin, ymax, n_points),
    )
    # One row per grid point, in the same order as xx / yy flattened.
    grid = np.stack([xx, yy], axis=-1).reshape(-1, 2)
    # Coerce to an ndarray so array-, tensor- and list-valued predictors are
    # all reshaped the same way before plotting.
    zz = np.asarray(predict(grid)).reshape(xx.shape)
    plt.contourf(xx, yy, zz, levels=100, **kwargs)

In [None]:
# Shade the plane by the predicted probability of class 1, then overlay the data.
visualize_classifier(
    lambda pts: model.predict_proba(pts)[:, 1],
    xmin=-2, xmax=2.5, ymin=-2, ymax=2.5,
    cmap="RdBu",
)
plt.scatter(X[y == 0, 0], X[y == 0, 1], color="red")
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="blue")

# With pytorch

In [None]:
import torch
from torch import nn

In [None]:
# Convert the data to float32 tensors for PyTorch; the labels become floats
# because they are used as targets of the binary cross-entropy loss below.
# as_tensor accepts both NumPy arrays and tensors, so re-running this cell
# after X and y were already converted does not trigger the copy-construct
# warning that torch.tensor(tensor) emits.
X, y = torch.as_tensor(X, dtype=torch.float32), torch.as_tensor(y, dtype=torch.float32)

In [None]:
# Network: 2 inputs -> `neurons` hidden ReLU units -> 1 sigmoid output, which
# can be read as the probability of class 1. Optimised with plain SGD.
neurons = 32
model = nn.Sequential(
    nn.Linear(2, neurons),
    nn.ReLU(),
    nn.Linear(neurons, 1),
    nn.Sigmoid(),
)
optimizer = torch.optim.SGD(params=model.parameters(), lr=1.0)

In [None]:
# Display the layer structure of the network.
model

In [None]:
# Full-batch gradient descent: every iteration uses the whole dataset.
history = []
for i in range(1000):
    # Gradients accumulate across backward() calls, so reset them first.
    optimizer.zero_grad()
    # (N, 1) -> (N,) so the predictions match the shape of y.
    y_pred = model(X).squeeze(1)
    loss = nn.functional.binary_cross_entropy(y_pred, y)
    loss.backward()
    # Record a plain Python float (not a 0-d NumPy array) so the history is a
    # simple list of numbers, as in the other training loops of this notebook.
    history.append(loss.item())
    optimizer.step()

In [None]:
# Loss per training iteration.
plt.plot(history)

In [None]:
# Inference only, so gradient tracking is switched off while the grid is
# evaluated; the model output is converted back to NumPy for plotting.
with torch.no_grad():
    visualize_classifier(
        lambda pts: model(torch.from_numpy(pts).float()).squeeze().numpy(),
        xmin=-2, xmax=2.5, ymin=-2, ymax=2.5,
        cmap="RdBu",
    )
plt.scatter(X[y == 0, 0], X[y == 0, 1], color="red")
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="blue")

# With torch (manual optimizer)

TODO: explain backpropagation and vector-Jacobian products (VJPs).

In [None]:
# 100 points on [0, 2*pi]; requires_grad=True makes autograd track operations on x.
x = torch.linspace(0, 2*np.pi, 100, requires_grad=True)

In [None]:
# Element-wise sine; the result stays connected to x in the autograd graph.
yy = x.sin()

In [None]:
# yy is not a scalar, so backward() needs the vector of the vector-Jacobian
# product. An all-ones vector gives the gradient of sum(yy) with respect to x.
yy.backward(gradient=torch.ones_like(yy))

In [None]:
# sin(x) together with the gradient that autograd stored in x.grad.
plt.plot(x.detach(), yy.detach())
plt.plot(x.detach(), x.grad)

In [None]:
# Fresh, untrained network for the hand-written optimizer.
model = nn.Sequential(
    nn.Linear(2, neurons),
    nn.ReLU(),
    nn.Linear(neurons, 1),
    nn.Sigmoid(),
)

In [None]:
def step(lr=1, params=None):
    """Apply one plain gradient-descent update in place.

    Each parameter ``p`` is replaced by ``p - lr * p.grad``.

    Parameters
    ----------
    lr : float, default 1
        Learning rate (step size).
    params : iterable of tensors, optional
        Parameters to update. Defaults to the parameters of the
        notebook-level ``model``.

    Parameters whose ``grad`` is ``None`` (they took no part in the last
    backward pass) are left untouched instead of raising a ``TypeError``.
    """
    if params is None:
        params = model.parameters()
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for par in params:
            if par.grad is None:
                continue
            par.add_(-lr * par.grad)

In [None]:
# Same training loop as with torch.optim.SGD, but the parameter update is
# done by the hand-written step() function.
history = []
for epoch in range(1000):
    model.zero_grad()  # clear gradients left over from the previous iteration
    y_pred = model(X)[:, 0]  # drop the trailing axis: (N, 1) -> (N,)
    loss = nn.functional.binary_cross_entropy(y_pred, y)
    loss.backward()
    step()
    history.append(loss.item())

In [None]:
# Loss per training iteration.
plt.plot(history)

In [None]:
# Evaluate the trained network on the grid without tracking gradients.
with torch.no_grad():
    visualize_classifier(
        lambda pts: model(torch.from_numpy(pts).float()).squeeze(1),
        xmin=-2, xmax=2.5, ymin=-2, ymax=2.5,
        cmap="RdBu",
    )
plt.scatter(X[y == 0, 0], X[y == 0, 1], color="red")
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="blue")

# With torch (manual backpropagation)

To see this more clearly, we also implement the model step by step.

In [None]:
# Small random first-layer weights, shape (2, neurons); requires_grad_() turns
# the scaled tensor into a leaf that autograd tracks.
w1 = (torch.randn(2, neurons) * 0.1).requires_grad_()

In [None]:
# Pre-activation of the hidden layer. retain_grad() keeps the gradient of
# this non-leaf tensor so it can be inspected after backward().
z1 = torch.matmul(X, w1)
z1.retain_grad()
z1.shape

In [None]:
# Hidden activation; keep its gradient as well.
a1 = torch.relu(z1)
a1.retain_grad()

In [None]:
# Small random output-layer weights, shape (neurons, 1), tracked by autograd.
w2 = (torch.randn(neurons, 1) * 0.1).requires_grad_()

In [None]:
# Network output before squeezing; one column per sample row.
z2 = torch.matmul(a1, w2)
z2.retain_grad()
z2.shape

In [None]:
# Drop the trailing axis so z2 has the same shape as y. The squeezed tensor
# is a new non-leaf node, so retain_grad() has to be requested again.
z2 = z2.squeeze(1)
z2.retain_grad()
z2.shape

In [None]:
# Mean squared error between the targets and the raw network output.
loss = ((y - z2) ** 2).mean()
loss

In [None]:
# Let autograd compute the reference gradients (stored in the .grad attributes).
loss.backward()

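Notation: a `d` prefix denotes the gradient of the loss with respect to a tensor, e.g. `dz2` is the gradient of the loss with respect to all components of `z2`.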

In [None]:
# Derivative of mean((y - z2) ** 2) with respect to z2.
dz2 = - 2 / len(z2) * (y - z2)

In [None]:
# Compare with a floating-point tolerance: the manual and the autograd
# gradient are mathematically equal, but they may differ in the last bits
# because the operations are carried out in a different order.
torch.allclose(z2.grad, dz2)

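TODO: work out the gradient of a matrix multiplication. For `Z = A @ W` with incoming gradient `dZ`, the gradients are `dW = A.T @ dZ` and `dA = dZ @ W.T`.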

In [None]:
# Shapes of the tensors involved in the second matrix multiplication.
dz2.shape, a1.shape, w2.shape

In [None]:
# unsqueeze(1) turns dz2 into a column so it can take part in matrix products.
dz2.unsqueeze(1).shape

In [None]:
# Gradient of the loss with respect to w2 (z2 = a1 @ w2): a1^T times dz2 as a column.
dw2 = torch.matmul(a1.T, dz2[:, None])

In [None]:
# The gradient must have the same shape as w2.
dw2.shape

In [None]:
# Tolerance-based comparison: exact equality of floats is too strict because
# the manual and the autograd result may differ by rounding error.
torch.allclose(dw2, w2.grad)

In [None]:
# Gradient of the loss with respect to the hidden activation a1.
da1 = torch.matmul(dz2[:, None], w2.T)
da1.shape

In [None]:
# Tolerance-based comparison: exact equality of floats is too strict because
# the manual and the autograd result may differ by rounding error.
torch.allclose(da1, a1.grad)

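What is the derivative of ReLU? It is 1 where the input is positive and 0 elsewhere, so the incoming gradient is simply masked with `z1 > 0`.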

In [None]:
# The boolean mask lets the gradient through only where z1 was positive.
dz1 = (z1 > 0) * da1

In [None]:
# Tolerance-based comparison: exact equality of floats is too strict because
# the manual and the autograd result may differ by rounding error.
torch.allclose(dz1, z1.grad)

In [None]:
# Shapes of the tensors involved in the first matrix multiplication.
w1.shape, dz1.shape, X.shape

In [None]:
# Gradient of the loss with respect to w1 (z1 = X @ w1).
dw1 = torch.matmul(X.T, dz1)
dw1.shape

In [None]:
# Tolerance-based comparison: exact equality of floats is too strict because
# the manual and the autograd result may differ by rounding error.
torch.allclose(dw1, w1.grad)

In [None]:
# Parameters: small random weights. They are plain tensors without autograd,
# because every gradient below is computed by hand.
w1 = 0.1 * torch.randn(2, neurons)
w2 = 0.1 * torch.randn(neurons, 1)

# Full-batch gradient descent on the mean squared error.
lr = 0.1
history = []
for epoch in range(100):
    # forward pass
    z1 = torch.matmul(X, w1)
    a1 = z1.relu()
    z2 = torch.matmul(a1, w2).squeeze(1)
    residual = y - z2
    loss = (residual ** 2).mean()

    history.append(loss.item())

    # backward pass: walk through the forward steps in reverse order
    dz2 = - 2 / len(z2) * residual
    dz2_col = dz2.unsqueeze(1)
    dw2 = a1.T @ dz2_col
    da1 = dz2_col @ w2.T
    dz1 = (z1 > 0) * da1
    dw1 = X.T @ dz1

    # in-place gradient descent update
    w1.sub_(lr * dw1)
    w2.sub_(lr * dw2)

In [None]:
# Loss per training iteration.
plt.plot(history)

In [None]:
# Decision surface of the hand-trained network (raw output, no bias terms).
with torch.no_grad():
    visualize_classifier(
        lambda pts: torch.matmul(torch.relu(torch.matmul(torch.from_numpy(pts).float(), w1)), w2),
        xmin=-2, xmax=2.5, ymin=-2, ymax=2.5,
        cmap="RdBu",
    )
plt.scatter(X[y == 0, 0], X[y == 0, 1], color="red")
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="blue")

Next, implement the bias terms and the sigmoid activation (biases first; the sigmoid follows further below).

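Bias VJP: the derivative of a pre-activation with respect to its bias is always one, so the bias gradient is simply the incoming gradient summed over the batch dimension (one value per neuron).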

In [None]:
# initialize parameters (plain tensors: all gradients are derived by hand below)
w1 = torch.randn(2, neurons) * 0.1
w2 = torch.randn(neurons, 1) * 0.1
b1 = torch.randn(neurons) * 0.1
b2 = torch.randn(1) * 0.1

# training loop: full-batch gradient descent on the mean squared error
lr = 0.1
history = []
for i in range(100):
    # forward
    z1 = X @ w1 + b1
    a1 = z1.relu()
    z2 = a1 @ w2 + b2
    z2 = z2.squeeze(1)
    loss = torch.mean((y - z2) ** 2)

    history.append(loss.item())

    # backward
    dz2 = - 2 / len(z2) * (y - z2)
    dw2 = a1.T @ dz2.unsqueeze(1)
    # b2 is shared by all samples, so its gradient sums over the batch.
    db2 = dz2.sum()
    da1 = dz2.unsqueeze(1) @ w2.T
    dz1 = (z1 > 0) * da1
    dw1 = X.T @ dz1
    # Each hidden neuron has its own bias: sum over the batch axis only, which
    # gives one gradient value per neuron. Summing over every element would
    # assign all biases the same, incorrect gradient.
    db1 = dz1.sum(axis=0)

    # gradient update (in place)
    for par, grad in [(w1, dw1), (w2, dw2), (b1, db1), (b2, db2)]:
        par.add_(-lr * grad)

In [None]:
# Loss per training iteration.
plt.plot(history)

In [None]:
# Decision surface of the hand-trained network with bias terms (raw output).
with torch.no_grad():
    visualize_classifier(
        lambda pts: (torch.from_numpy(pts).float() @ w1 + b1).relu() @ w2 + b2,
        xmin=-2, xmax=2.5, ymin=-2, ymax=2.5,
        cmap="RdBu",
    )
plt.scatter(X[y == 0, 0], X[y == 0, 1], color="red")
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="blue")

Better, but still not great. Let's switch to a sigmoid output and the binary cross-entropy loss.

TODO: explain the gradients of the sigmoid and of the binary cross-entropy.

In [None]:
# Parameters: small random weights and biases, kept as plain tensors because
# every gradient is computed by hand.
w1 = 0.1 * torch.randn(2, neurons)
w2 = 0.1 * torch.randn(neurons, 1)
b1 = 0.1 * torch.randn(neurons)
b2 = 0.1 * torch.randn(1)

# Full-batch gradient descent on the binary cross-entropy.
lr = 1.0
eps = 1e-8  # keeps the logarithms and divisions away from zero
history = []
for epoch in range(1000):
    # forward pass
    z1 = X @ w1 + b1
    a1 = z1.relu()
    z2 = a1 @ w2 + b2
    a2 = z2.squeeze(1).sigmoid()
    log_likelihood = y * torch.log(a2 + eps) + (1 - y) * torch.log(1 - a2 + eps)
    loss = -log_likelihood.mean()

    history.append(loss.item())

    # backward pass: cross-entropy -> sigmoid -> layer 2 -> ReLU -> layer 1
    da2 = -1 / len(a2) * (y / (a2 + eps) - (1 - y) / (1 - a2 + eps))
    dz2 = (da2 * a2 * (1 - a2)).reshape(z2.shape)  # sigmoid' = a2 * (1 - a2)
    dw2 = a1.T @ dz2
    db2 = dz2.sum(dim=0)
    da1 = dz2 @ w2.T
    dz1 = (z1 > 0) * da1
    dw1 = X.T @ dz1
    db1 = dz1.sum(dim=0)  # one value per hidden neuron, same shape as b1

    # in-place gradient descent update
    for par, grad in zip((w1, w2, b1, b2), (dw1, dw2, db1, db2)):
        par.sub_(lr * grad)

In [None]:
# Loss per training iteration.
plt.plot(history)

In [None]:
# Decision surface of the hand-trained network with a sigmoid output.
with torch.no_grad():
    visualize_classifier(
        lambda pts: torch.sigmoid(torch.relu(torch.from_numpy(pts).float() @ w1 + b1) @ w2 + b2),
        xmin=-2, xmax=2.5, ymin=-2, ymax=2.5,
        cmap="RdBu",
    )
plt.scatter(X[y == 0, 0], X[y == 0, 1], color="red")
plt.scatter(X[y == 1, 0], X[y == 1, 1], color="blue")