In [None]:
import math
import random
import requests
import time

import numpy as np
import torch
import torchvision

import matplotlib.pyplot as plt
import torch.nn.functional as F

from io import BytesIO
from PIL import Image

# Lakota AI Code Camp Lesson 10: Introduction to Neural Networks 0

## XOR Problem

XOR ("exclusive or") is a fundamental logic gate in computing.
It takes two binary inputs and outputs a binary value.

|Input 1 | Input 2 | XOR Value |
|--------|---------|-----------|
| 1 | 1 | 0 |
| 1 | 0 | 1 |
| 0 | 1 | 1 |
| 0 | 0 | 0 |

In 1969, Marvin L. Minsky and Seymour A. Papert wrote a book called *Perceptrons* that looked at the XOR problem for single-layer neural networks (also called perceptrons).
What they showed was that a perceptron cannot learn the XOR function.

We're going to make a small neural network.
It's going to have two input "neurons" (one per bit) feeding a single output, so it takes in a binary vector of dimension 2.

First, we have to initialize our values and we do this by drawing each of our parameters from what's called a normal distribution.


In [None]:
# Draw the two weights (a, b) and the bias (c) independently from a
# standard normal distribution, in that order.
a, b, c = (random.gauss(mu=0.0, sigma=1.0) for _ in range(3))

We need to determine how big our steps are in training and how long we're going to train.

In [None]:
# Training schedule: number of single-example updates and the step size.
num_epochs = 1_000_000
lr = 0.0001

Now we train:

In [None]:
for e in range(num_epochs):
    # forward step: sample one random XOR example and run it through the neuron
    x1 = random.randint(0, 1)
    x2 = random.randint(0, 1)
    xor = x1 ^ x2
    y = a * x1 + b * x2 + c
    exp_y = math.exp(-y)
    pred = 1 / (1 + exp_y)  # sigmoid(y)

    # Binary cross-entropy (negative log-likelihood) written in terms of y:
    #   -[xor * log(pred) + (1 - xor) * log(1 - pred)]
    #     = (1 - xor) * y + log(1 + exp(-y))
    # It is never negative, and it is the quantity gradient descent should
    # minimise. (Subtracting the gradient of the log-likelihood itself would
    # push the parameters away from the data.)
    loss = (1 - xor) * y + math.log1p(exp_y)

    if e % 100000 == 99999:
        print(f"Loss at epoch {e + 1}: {loss}")

    # gradient calculation: d(loss)/dy = pred - xor, chained through
    # y = a * x1 + b * x2 + c
    dy = pred - xor
    da = dy * x1
    db = dy * x2
    dc = dy

    # gradient descent step
    a -= lr * da
    b -= lr * db
    c -= lr * dc

Loss at epoch 100000: -1.123947260950331e-05
Loss at epoch 200000: -15.161469792340748
Loss at epoch 300000: -2.1316282072803006e-14
Loss at epoch 400000: -30.157562160683632
Loss at epoch 500000: -4.359179683888215e-12
Loss at epoch 600000: -2.842170943040401e-14
Loss at epoch 700000: -52.63106216046873
Loss at epoch 800000: 0.0
Loss at epoch 900000: 0.0
Loss at epoch 1000000: 0.0


In [None]:
# Inspect the trained weights (a, b) and bias (c).
a, b, c

(-26.23662682495234, -23.95919608441167, -51.14066607653695)

Now, let's see how well our function does.

In [None]:
# All four input combinations, stored column-wise.
x1 = [1, 1, 0, 0]
x2 = [1, 0, 1, 0]

header = "Input 1 | Input 2 | XOR"
print(header)
print("-" * 23)

for x, y in zip(x1, x2):
    row = f"{x:7} | {y:7} | {x ^ y:3}"
    print(row)


Input 1 | Input 2 | XOR
-----------------------
      1 |       1 |   0
      1 |       0 |   1
      0 |       1 |   1
      0 |       0 |   0


In [None]:
# Compare the target XOR value with the trained neuron's output for each
# input pair.
print(f"XOR | Prediction")
for x, y in zip(x1, x2):
    # Use the same sigmoid as the training loop: 1 / (1 + exp(-(a*x + b*y + c))).
    # Dropping the minus sign would print 1 - sigmoid instead.
    pred = 1 / (1 + math.exp(-(x * a + y * b + c)))
    print(f"{x ^ y:3} | {pred:10e}")

XOR | Prediction
  0 | 1.000000e+00
  1 | 1.000000e+00
  1 | 1.000000e+00
  0 | 1.000000e+00


Do you notice anything?

Since that didn't work, we're going to make a slightly more complicated neural network.
It'll be a two layer neural network.

First, we have to initialize.

In [None]:
def initialize(mu=0, sigma=1.0):
    """Create fresh parameters for the two-layer network.

    Args:
        mu: Mean of the Gaussian the weights are drawn from.
        sigma: Standard deviation of that Gaussian.

    Returns:
        A pair ``(layer1, layer2)``. ``layer1`` is four random weights
        followed by two zero biases; ``layer2`` is two random weights
        followed by one zero bias.
    """
    def draw():
        return random.gauss(mu=mu, sigma=sigma)

    # Weights are drawn first-layer first, so the random stream is consumed
    # in a fixed order.
    layer1 = [draw() for _ in range(4)] + [0, 0]
    layer2 = [draw() for _ in range(2)] + [0]

    return layer1, layer2

In [None]:
# Draw an initial set of weights (biases start at zero) for both layers.
layer1, layer2 = initialize()

We're going to define a different activation function:

In [None]:
def relu(x):
    """Rectified linear unit: pass positive values through, map the rest to 0."""
    if x > 0:
        return x
    return 0

We're going to define a helper function that gives us a prediction:

In [None]:
def forward(layer1, layer2, x):
    """Return the network's prediction for a single input pair.

    Args:
        layer1: Six numbers. Indices 0-1 are the weights and index 4 the bias
            of the first hidden unit; indices 2-3 and 5 are the same for the
            second hidden unit.
        layer2: Three numbers: two output weights followed by the output bias.
        x: A pair ``(x1, x2)`` of inputs.

    Returns:
        The sigmoid of the output unit, a float in ``[0, 1]``.
    """
    x1, x2 = x

    # hidden layer: affine map followed by ReLU
    y1 = layer1[0] * x1 + layer1[1] * x2 + layer1[4]
    y2 = layer1[2] * x1 + layer1[3] * x2 + layer1[5]

    z1 = relu(y1)
    z2 = relu(y2)

    # output layer pre-activation
    out = layer2[0] * z1 + layer2[1] * z2 + layer2[2]

    try:
        return 1 / (1 + math.exp(-1 * out))
    except OverflowError:
        # exp(-out) leaves the float range only when `out` is hugely negative,
        # where the sigmoid is 0 to machine precision.
        return 0.0

We're going to define a train function.

In [None]:
def train(layer1, layer2, x, actual, lr, eps=1e-4):
    """Run one gradient-descent step on a single example.

    Args:
        layer1: Six numbers. Indices 0-1 are the weights and index 4 the bias
            of the first hidden unit; indices 2-3 and 5 are the same for the
            second hidden unit. Updated in place.
        layer2: Three numbers: two output weights followed by the output bias.
            Updated in place.
        x: A pair ``(x1, x2)`` of inputs.
        actual: Target value (0 or 1) for this example.
        lr: Learning rate (step size).
        eps: Amount by which a prediction of exactly 0.0 or 1.0 is nudged
            back inside the open interval so the logarithms stay finite.

    Returns:
        ``(layer1, layer2, loss)``: the same two lists after the update, and
        the cross-entropy loss measured before the update.
    """
    x1, x2 = x

    # ---- forward pass ----
    y1 = layer1[0] * x1 + layer1[1] * x2 + layer1[4]
    y2 = layer1[2] * x1 + layer1[3] * x2 + layer1[5]

    z1 = relu(y1)
    z2 = relu(y2)

    out = layer2[0] * z1 + layer2[1] * z2 + layer2[2]

    try:
        pred = 1 / (1 + math.exp(-out))
    except OverflowError:
        # exp(-out) leaves the float range only when `out` is hugely negative,
        # where the sigmoid is 0 to machine precision.
        pred = 0.0

    # Keep pred strictly inside (0, 1): log(0) below would raise ValueError.
    if pred == 1.0:
        pred -= eps
    elif pred == 0.0:
        pred += eps

    # binary cross-entropy
    loss = -actual * math.log(pred) - (1 - actual) * math.log(1 - pred)

    # ---- backward pass ----
    # For a sigmoid output with cross-entropy loss, d(loss)/d(out) = pred - actual.
    dloss = (pred - actual)

    # d(out)/d(z): read the output weights BEFORE they are updated below.
    dout1 = layer2[0]
    dout2 = layer2[1]

    # ReLU derivative: 1 where the unit is active, 0 otherwise.
    dz1 = 1 if z1 > 0 else 0
    dz2 = 1 if z2 > 0 else 0

    # output layer update
    layer2[0] -= lr * dloss * z1
    layer2[1] -= lr * dloss * z2
    layer2[2] -= lr * dloss

    # hidden layer update (d(y)/d(weight) is the matching input, 1 for a bias)
    layer1[0] -= lr * dloss * dout1 * dz1 * x1
    layer1[1] -= lr * dloss * dout1 * dz1 * x2
    layer1[2] -= lr * dloss * dout2 * dz2 * x1
    layer1[3] -= lr * dloss * dout2 * dz2 * x2
    layer1[4] -= lr * dloss * dout1 * dz1
    layer1[5] -= lr * dloss * dout2 * dz2

    return layer1, layer2, loss

Next we set how long we want to train and how big our steps are:

In [None]:
# Training schedule for the two-layer network: updates per attempt and step size.
num_epochs = 10 ** 5
lr = 0.1

In [None]:
# test_loss starts above the 0.1 stopping threshold so the training loop runs
# at least once; running_loss accumulates per-example losses between reports.
test_loss, running_loss = 1, 0

In [None]:
# Start from fresh random parameters. (The training cell below also calls
# initialize() at the start of every attempt.)
layer1, layer2 = initialize()

In [None]:
# Keep retraining from a fresh random initialisation until the average loss
# over the final reporting window is at most 0.1.
while test_loss > 0.1:
    layer1, layer2 = initialize()
    test_loss = 0

    # a report is printed ten times per attempt
    report_every = num_epochs // 10

    for e in range(num_epochs):
        # sample one random XOR example
        x1 = random.randint(0, 1)
        x2 = random.randint(0, 1)
        actual = x1 ^ x2
        x = [x1, x2]

        layer1, layer2, loss = train(layer1, layer2, x, actual, lr)
        running_loss += loss

        if (e + 1) % report_every == 0:
            # average over the window, report it, then start a new window
            running_loss /= report_every
            test_loss = running_loss
            print(f"Loss at epoch {e + 1}: {running_loss}")
            running_loss = 0

Loss at epoch 10000: 0.18150946930839604
Loss at epoch 20000: 0.0013468871865798075
Loss at epoch 30000: 0.0007358036644605053
Loss at epoch 40000: 0.0005092385231366276
Loss at epoch 50000: 0.0003915583640164301
Loss at epoch 60000: 0.0003078076824204115
Loss at epoch 70000: 0.00026158743607763687
Loss at epoch 80000: 0.0002224250286844402
Loss at epoch 90000: 0.0001957796004063435
Loss at epoch 100000: 0.00017479327729565494


In [None]:
# Trained first-layer parameters: four weights followed by two biases.
layer1

[4.6799630617102626,
 4.679992115182438,
 3.5345614024069403,
 3.53577051920848,
 -4.679801342666812,
 -0.00013595629654433863]

In [None]:
# Trained output-layer parameters: two weights followed by one bias.
layer2

[-7.941923015772793, 4.906560869727129, -7.6254645920139685]

In [None]:
# Evaluate the trained two-layer network on all four XOR inputs.
x1 = [1, 1, 0, 0]
x2 = [1, 0, 1, 0]

print("x | y | XOR | Prediction")
for x, y in zip(x1, x2):
    target = x ^ y
    pred = forward(layer1, layer2, [x, y])
    print(f"{x} | {y} | {target:3} | {pred:10f}")

x | y | XOR | Prediction
1 | 1 |   0 |   0.000041
1 | 0 |   1 |   0.999940
0 | 1 |   1 |   0.999940
0 | 0 |   0 |   0.000488


In [None]:
# Same evaluation, with the network output thresholded at 0.5 to give a hard 0/1 answer.
x1 = [1, 1, 0, 0]
x2 = [1, 0, 1, 0]

print("XOR | Prediction")
for x, y in zip(x1, x2):
    pred = int(forward(layer1, layer2, [x, y]) > 0.5)
    print(f"{x ^ y:3} | {pred:10}")

XOR | Prediction
  0 |          0
  1 |          1
  1 |          1
  0 |          0


Now we look at it in PyTorch:

In [None]:
class Model(torch.nn.Module):
    """Two-layer network: Linear(2, 2) -> ReLU -> Linear(2, 1) -> sigmoid."""

    def __init__(self):
        super().__init__()
        # hidden layer: 2 inputs -> 2 units
        self.fc1 = torch.nn.Linear(2, 2)
        # output layer: 2 hidden units -> 1 output
        self.fc2 = torch.nn.Linear(2, 1)

    def forward(self, x):
        """Map an input tensor through both layers and return the sigmoid output."""
        hidden = F.relu(self.fc1(x))
        logit = self.fc2(hidden)
        return F.sigmoid(logit)

In [None]:
# Instantiate the network with PyTorch's default parameter initialisation.
model = Model()

In [None]:
# Stochastic gradient descent over all model parameters: step size 0.1, momentum 0.9.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1, momentum=0.9)

In [None]:
# Mean-squared error between the prediction and the target.
# Alternative left disabled by the author (binary cross-entropy):
#loss_fn = torch.nn.BCELoss()
loss_fn = torch.nn.MSELoss()

In [None]:
# Number of single-example updates for the PyTorch model.
num_epochs = 10 ** 4

In [None]:
# Start the running average from zero here so this cell does not depend on a
# `running_loss` left over from an earlier cell.
running_loss = 0.0

for e in range(num_epochs):
    # forward step: sample one random XOR example
    x1 = random.randint(0, 1)
    x2 = random.randint(0, 1)
    actual = torch.tensor([x1 ^ x2], dtype=torch.float)
    x = torch.tensor([x1, x2], dtype=torch.float)

    optimizer.zero_grad()

    pred = model(x)

    loss = loss_fn(pred, actual)

    loss.backward()

    optimizer.step()

    # .item() extracts a plain Python float. Adding the tensor itself would
    # keep every step's autograd history alive inside running_loss.
    running_loss += loss.item()

    if e % (num_epochs // 10) == ((num_epochs // 10) - 1):
        # average over the reporting window, print it, then start a new window
        running_loss /= (num_epochs // 10)
        test_loss = running_loss
        print(f"Loss at epoch {e + 1}: {running_loss}")
        running_loss = 0.0

Loss at epoch 1000: 0.16700831055641174
Loss at epoch 2000: 0.17107121646404266
Loss at epoch 3000: 0.17033977806568146
Loss at epoch 4000: 0.17130155861377716
Loss at epoch 5000: 0.17885981500148773
Loss at epoch 6000: 0.17062394320964813
Loss at epoch 7000: 0.16937372088432312
Loss at epoch 8000: 0.17570529878139496
Loss at epoch 9000: 0.18374231457710266
Loss at epoch 10000: 0.1696285754442215


In [None]:
# Evaluate the trained PyTorch model on all four XOR inputs.
x1 = [1, 1, 0, 0]
x2 = [1, 0, 1, 0]

print("x | y | XOR | Prediction")
for x, y in zip(x1, x2):
    inputs = torch.tensor([x, y], dtype=torch.float)
    pred = model(inputs)
    print(f"{x} | {y} | {x ^ y:3} | {pred.item()}")

x | y | XOR | Prediction
1 | 1 |   0 | 0.4025195837020874
1 | 0 |   1 | 0.9999997615814209
0 | 1 |   1 | 0.4025195837020874
0 | 0 |   0 | 0.4025195837020874


In [None]:
# Learned parameters of the first (hidden) linear layer.
list(model.fc1.parameters())

[Parameter containing:
 tensor([[-0.2303,  0.0951],
         [ 3.5690, -3.5791]], requires_grad=True),
 Parameter containing:
 tensor([-0.5658, -0.0024], requires_grad=True)]

In [None]:
# Learned parameters of the second (output) linear layer.
list(model.fc2.parameters())

[Parameter containing:
 tensor([[0.3763, 4.4470]], requires_grad=True),
 Parameter containing:
 tensor([-1.7391], requires_grad=True)]

In [None]:
# Same evaluation, with the model output thresholded at 0.5 to give a hard 0/1 answer.
x1 = [1, 1, 0, 0]
x2 = [1, 0, 1, 0]

print("XOR | Prediction")
for x, y in zip(x1, x2):
    prob = model(torch.tensor([x, y], dtype=torch.float)).item()
    pred = 1 if prob > 0.5 else 0
    print(f"{x ^ y:3} | {pred:10}")

XOR | Prediction
  0 |          0
  1 |          1
  1 |          0
  0 |          0
