In [7]:
#!pip install torch
#!pip install Pillow
#!pip install ipyvolume

Collecting ipyvolume
  Downloading ipyvolume-0.5.2-py2.py3-none-any.whl (2.9 MB)
Collecting ipywebrtc
  Downloading ipywebrtc-0.6.0-py2.py3-none-any.whl (260 kB)
Collecting pythreejs>=1.0.0
  Downloading pythreejs-2.3.0-py2.py3-none-any.whl (3.4 MB)
Collecting traittypes
  Downloading traittypes-0.2.1-py2.py3-none-any.whl (8.6 kB)
Collecting ipydatawidgets>=1.1.1
  Downloading ipydatawidgets-4.2.0-py2.py3-none-any.whl (275 kB)
Installing collected packages: ipywebrtc, traittypes, ipydatawidgets, pythreejs, ipyvolume
Successfully installed ipydatawidgets-4.2.0 ipyvolume-0.5.2 ipywebrtc-0.6.0 pythreejs-2.3.0 traittypes-0.2.1


In [67]:
import copy
import numpy
import random
import torch
from torch import nn

In [10]:
%run camera.py
%run rotation.py

In [2]:
if not torch.cuda.is_available():
    print("OI!  CUDA ISN'T RUNNING FOR SOME REASON!")
    device = "cpu"
else:
    device = "cuda"

In [78]:
class CameraNet(nn.Module):
    def __init__(self):
        super(CameraNet, self).__init__()
        self.net = nn.Sequential(
            nn.Linear((2+2)*9, 512),  # NINE PAIRS OF x -> x' points!
            nn.LeakyReLU(),
            nn.Linear(512, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 3+3+5) # 3-translation, 3-rotation, focalx, focaly, prin-x, prin-y, skew.
        )
    
    def forward(self, x):
        # Expect X to be pair-wise x1, y1, x'1, y'1 world -> projection
        # We expect there to be NINE pairs of corresponding points.
        return self.net(x)
    
    def save_layers(self, filename: str):
        input_weight, input_bias = self.net[0].parameters()
        hidden_0_weight, hidden_0_bias = self.net[2].parameters()
        hidden_1_weight, hidden_1_bias = self.net[4].parameters()
        output_weight, output_bias = self.net[6].parameters()
        # Convert all to numpy matrices for storage.
        weights_and_biases = list()
        for mat in [input_weight, input_bias, hidden_0_weight, hidden_0_bias, hidden_1_weight, hidden_1_bias, output_weight, output_bias]:
            weights_and_biases.append(mat.detach().cpu().numpy())
        numpy.savez_compressed(
            filename,
            weights_and_biases
            #input_weight=input_weight,
            #input_bias=input_bias,
            #hidden_0_weight=hidden_0_weight,
            #hidden_0_bias=hidden_0_bias,
            #hidden_1_weight=hidden_1_weight,
            #hidden_1_bias=hidden_1_bias,
            #output_weight=output_weight,
            #output_bias=output_bias
        )
        return weights_and_biases

In [None]:
# TODO:
def custom_loss(output, target):
    # We want to put more emphasis on recovering the extrinsics than the intrinsics.
    delta = torch.abs(output - target)
    intrinsic_loss = delta
    y[1] = focal_length_y
    y[2] = skew
    y[3] = principal_x
    y[4] = principal_y
    y[5] = x_rotation
    y[6] = y_rotation
    y[7] = z_rotation
    y[8] = x_translation
    y[9] = y_translation
    y[10] = z_translation

In [85]:
model = CameraNet().to(device)
loss_function = nn.MSELoss()  # Makes the most sense for distance errors, imho.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

In [89]:
lattice = numpy.ones(shape=(9,4))  # This is actually 9x3, but the last is for homogeneous coordinates.
# These are not the points in 2d, these are the positions INSIDE the marker.
for i in range(0, 9):
    x = i%3
    y = i//3
    lattice[i, 0] = x
    lattice[i, 1] = y
    lattice[i, 2] = 0.0  # Z
    lattice[i, 3] = 1.0  # Homogeneous.

def build_example_pair():
    focal_length_x = random.uniform(0.01, 10.0)
    focal_length_y = focal_length_x * random.uniform(0.8, 1.2)
    skew = random.uniform(0, 0.1)
    principal_x, principal_y = random.choice([(1280, 720), (1920, 1080), (640, 480)])
    principal_x /= 2
    principal_y /= 2
    x_translation = random.uniform(-50, 50)
    y_translation = random.uniform(-50, 50)
    z_translation = random.uniform(-50, 50)
    x_rotation = random.uniform(-math.pi/2, math.pi/2)
    y_rotation = random.uniform(-math.pi/2, math.pi/2)
    z_rotation = random.uniform(-math.pi/2, math.pi/2)

    intrinsics = CameraIntrinsics(focal_length_x, focal_length_y, skew, principal_x, principal_y)
    extrinsics = CameraExtrinsics(x_rotation, y_rotation, z_rotation, x_translation, y_translation, z_translation)
    
    transformed = extrinsics.project_points(lattice, intrinsics, renormalize=True)
    
    x = numpy.zeros((36,), dtype=numpy.float32)
    y = numpy.zeros((11,), dtype=numpy.float32)
    
    # Set up projection tuples:
    # We randomize associations to avoid building a dependency on order.
    indices = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    random.shuffle(indices)
    for src_idx, dst_idx in enumerate(indices):
        # X, Y, X', Y'
        x[dst_idx*4] = lattice[src_idx, 0]
        x[dst_idx*4 + 1] = lattice[src_idx, 1]
        x[dst_idx*4 + 2] = transformed[src_idx, 0]
        x[dst_idx*4 + 3] = transformed[src_idx, 1]
    
    # Quantize X to integers to add noise.
    x = x.astype(numpy.uint8).astype(numpy.float32)

    # Set up camera predictions:
    y[0] = focal_length_x
    y[1] = focal_length_y
    y[2] = skew
    y[3] = principal_x
    y[4] = principal_y
    y[5] = x_rotation
    y[6] = y_rotation
    y[7] = z_rotation
    y[8] = x_translation
    y[9] = y_translation
    y[10] = z_translation
    
    return x, y

In [90]:
def train(model, loss_fn, optimizer, num_iterations=1000, batch_size=128):
    model.train()
    for i in range(num_iterations):
        #x, y = build_example_pair()
        x_batch = numpy.zeros((batch_size, 36), dtype=numpy.float32)
        y_batch = numpy.zeros((batch_size, 11), dtype=numpy.float32)
        for idx in range(batch_size):
            x, y = build_example_pair()
            x_batch[idx,:] = x[:]
            y_batch[idx,:] = y[:]
        x, y = torch.from_numpy(x_batch).to(device), torch.from_numpy(y_batch).to(device)

        # Compute prediction error
        pred = model(x)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    #print(f"loss: {loss.item():>7f}")
    return loss.item()

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [93]:
model = CameraNet().to(device)
loss_function = nn.MSELoss()  # Makes the most sense for distance errors, imho.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)

best_loss = 1e10
historical_losses = list()

for t in range(1000):
    loss = train(model, loss_function, optimizer)
    historical_losses.append(loss)
    if loss < best_loss:
        best_loss = loss
        model.save_layers(f"iter_{t}_loss_{int(loss)}")
    print(f"{t}: {loss:>3f}")
model.save_layers("final")

0: 913.085815
1: 602.272461
2: 1360.816772
3: 1234.010376
4: 518.816040
5: 710.422180
6: 466.003510
7: 1116.679565
8: 2510.878906
9: 1952.854858
10: 431.340118
11: 1487.891479
12: 453.565857
13: 1467.904297
14: 1735.390991
15: 771.750977
16: 1804.822510
17: 1233.697144
18: 1534.927734
19: 1410.172485
20: 2544.081055
21: 2031.572510
22: 449.475342
23: 618.223633
24: 645.784912
25: 1851.733032
26: 445.902802
27: 1223.978394
28: 861.059937
29: 2058.491455
30: 1025.413330
31: 1232.384521
32: 1405.503540
33: 2020.744873
34: 1023.044922
35: 931.816589
36: 651.464539
37: 1141.786133
38: 1029.051147
39: 968.842163
40: 479.841400
41: 919.856567
42: 1028.425659
43: 364.604706
44: 2337.956299
45: 857.801758
46: 739.404968
47: 1644.986694
48: 2287.594482
49: 1209.849854
50: 794.485657
51: 664.121765
52: 2258.435059
53: 945.732117
54: 1599.849243
55: 1862.251465
56: 1357.222656
57: 703.610168
58: 1092.016357
59: 1985.474609
60: 950.203125
61: 610.878113
62: 922.114197
63: 731.283691
64: 2008.521240

[array([[ 0.14657713, -0.09319929,  0.02602725, ...,  0.07426018,
          0.00515713, -0.13067436],
        [-0.06463923,  0.1043667 ,  0.03774692, ...,  0.13743764,
         -0.14129257,  0.14124131],
        [ 0.03492866,  0.00348825, -0.05725351, ..., -0.07634484,
          0.00847011,  0.1569798 ],
        ...,
        [-0.01741213,  0.00904447,  0.14456052, ..., -0.06157595,
         -0.12437493, -0.09032364],
        [ 0.17021634, -0.11387151, -0.0895602 , ..., -0.13045137,
          0.02394859, -0.06348746],
        [-0.0403447 ,  0.09049027,  0.16960174, ...,  0.09472974,
         -0.1879608 ,  0.09682266]], dtype=float32),
 array([-3.82913947e-02,  1.49014920e-01, -1.22164756e-01, -7.53714442e-02,
         9.04488713e-02,  1.48225963e-01,  5.55563085e-02, -1.49181813e-01,
         1.63180500e-01,  8.75422433e-02,  9.64149460e-02, -6.46412671e-02,
        -1.31765723e-01,  8.65104720e-02, -6.32609427e-02, -4.81717288e-02,
        -1.87043156e-02,  9.47600827e-02, -8.91657174e