In [7]:
#!pip install torch
#!pip install Pillow
#!pip install ipyvolume

Collecting ipyvolume
  Downloading ipyvolume-0.5.2-py2.py3-none-any.whl (2.9 MB)
Collecting ipywebrtc
  Downloading ipywebrtc-0.6.0-py2.py3-none-any.whl (260 kB)
Collecting pythreejs>=1.0.0
  Downloading pythreejs-2.3.0-py2.py3-none-any.whl (3.4 MB)
Collecting traittypes
  Downloading traittypes-0.2.1-py2.py3-none-any.whl (8.6 kB)
Collecting ipydatawidgets>=1.1.1
  Downloading ipydatawidgets-4.2.0-py2.py3-none-any.whl (275 kB)
Installing collected packages: ipywebrtc, traittypes, ipydatawidgets, pythreejs, ipyvolume
Successfully installed ipydatawidgets-4.2.0 ipyvolume-0.5.2 ipywebrtc-0.6.0 pythreejs-2.3.0 traittypes-0.2.1


In [67]:
import copy
import numpy
import random
import torch
from torch import nn

In [10]:
%run camera.py
%run rotation.py

In [2]:
if not torch.cuda.is_available():
    print("OI!  CUDA ISN'T RUNNING FOR SOME REASON!")
    device = "cpu"
else:
    device = "cuda"

In [78]:
class CameraNet(nn.Module):
    def __init__(self):
        super(CameraNet, self).__init__()
        self.net = nn.Sequential(
            nn.Linear((2+2)*9, 512),  # NINE PAIRS OF x -> x' points!
            nn.LeakyReLU(),
            nn.Linear(512, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 3+3+5) # 3-translation, 3-rotation, focalx, focaly, prin-x, prin-y, skew.
        )
    
    def forward(self, x):
        # Expect X to be pair-wise x1, y1, x'1, y'1 world -> projection
        # We expect there to be NINE pairs of corresponding points.
        return self.net(x)
    
    def save_layers(self, filename: str):
        input_weight, input_bias = self.net[0].parameters()
        hidden_0_weight, hidden_0_bias = self.net[2].parameters()
        hidden_1_weight, hidden_1_bias = self.net[4].parameters()
        output_weight, output_bias = self.net[6].parameters()
        # Convert all to numpy matrices for storage.
        weights_and_biases = list()
        for mat in [input_weight, input_bias, hidden_0_weight, hidden_0_bias, hidden_1_weight, hidden_1_bias, output_weight, output_bias]:
            weights_and_biases.append(mat.detach().cpu().numpy())
        numpy.savez_compressed(
            filename,
            weights_and_biases
            #input_weight=input_weight,
            #input_bias=input_bias,
            #hidden_0_weight=hidden_0_weight,
            #hidden_0_bias=hidden_0_bias,
            #hidden_1_weight=hidden_1_weight,
            #hidden_1_bias=hidden_1_bias,
            #output_weight=output_weight,
            #output_bias=output_bias
        )
        return weights_and_biases

In [85]:
model = CameraNet().to(device)
loss_function = nn.MSELoss()  # Makes the most sense for distance errors, imho.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

In [86]:
lattice = numpy.ones(shape=(9,4))  # This is actually 9x3, but the last is for homogeneous coordinates.
# These are not the points in 2d, these are the positions INSIDE the marker.
for i in range(0, 9):
    x = i%3
    y = i//3
    lattice[i, 0] = x
    lattice[i, 1] = y
    lattice[i, 2] = 0.0  # Z
    lattice[i, 3] = 1.0  # Homogeneous.

def build_example_pair():
    focal_length_x = random.uniform(0.01, 10.0)
    focal_length_y = focal_length_x * random.uniform(0.8, 1.2)
    skew = random.uniform(0, 0.1)
    principal_x, principal_y = random.choice([(1280, 720), (1920, 1080), (640, 480)])
    principal_x /= 2
    principal_y /= 2
    x_translation = random.uniform(-50, 50)
    y_translation = random.uniform(-50, 50)
    z_translation = random.uniform(-50, 50)
    x_rotation = random.uniform(-math.pi/2, math.pi/2)
    y_rotation = random.uniform(-math.pi/2, math.pi/2)
    z_rotation = random.uniform(-math.pi/2, math.pi/2)

    intrinsics = CameraIntrinsics(focal_length_x, focal_length_y, skew, principal_x, principal_y)
    extrinsics = CameraExtrinsics(x_rotation, y_rotation, z_rotation, x_translation, y_translation, z_translation)
    
    transformed = extrinsics.project_points(lattice, intrinsics, renormalize=True)
    
    x = numpy.zeros((36,), dtype=numpy.float32)
    y = numpy.zeros((11,), dtype=numpy.float32)
    
    # Set up projection tuples:
    for idx in range(9):
        # X, Y, X', Y'
        x[idx*4] = lattice[idx, 0]
        x[idx*4 + 1] = lattice[idx, 1]
        x[idx*4 + 2] = transformed[idx, 0]
        x[idx*4 + 3] = transformed[idx, 1]
    
    # Quantize X to integers to add noise.
    x = x.astype(numpy.uint8).astype(numpy.float32)

    # Set up camera predictions:
    y[0] = focal_length_x
    y[1] = focal_length_y
    y[2] = skew
    y[3] = principal_x
    y[4] = principal_y
    y[5] = x_rotation
    y[6] = y_rotation
    y[7] = z_rotation
    y[8] = x_translation
    y[9] = y_translation
    y[10] = z_translation
    
    return x, y

In [87]:
def train(model, loss_fn, optimizer, num_iterations=100, batch_size=32):
    model.train()
    for i in range(num_iterations):
        #x, y = build_example_pair()
        x_batch = numpy.zeros((batch_size, 36), dtype=numpy.float32)
        y_batch = numpy.zeros((batch_size, 11), dtype=numpy.float32)
        for idx in range(batch_size):
            x, y = build_example_pair()
            x_batch[idx,:] = x[:]
            y_batch[idx,:] = y[:]
        x, y = torch.from_numpy(x_batch).to(device), torch.from_numpy(y_batch).to(device)

        # Compute prediction error
        pred = model(x)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    #print(f"loss: {loss.item():>7f}")
    return loss.item()

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [88]:
model = CameraNet().to(device)
loss_function = nn.MSELoss()  # Makes the most sense for distance errors, imho.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-9)

best_loss = 1e10
historical_losses = list()

for t in range(1000):
    loss = train(model, loss_function, optimizer)
    historical_losses.append(loss)
    if loss < best_loss:
        best_loss = loss
        model.save_layers(f"iter_{t}_loss_{int(loss)}")
    if t%100 == 0:
        print(f"{t}: {loss:>3f}")
model.save_layers("final")

0: 51927.746094
100: 32960.132812
200: 10716.913086
300: 5048.697754
400: 3218.262939
500: 1707.892456
600: 1228.816040
700: 590.780090
800: 1348.567993
900: 836.955811


[array([[-0.00457262,  0.11078288,  0.03047206, ...,  0.15432335,
         -0.01277097,  0.09172449],
        [-0.02465741,  0.08140032, -0.11991473, ..., -0.15844125,
          0.05823422,  0.0514944 ],
        [ 0.08703952,  0.11497141,  0.15787224, ...,  0.06516586,
         -0.0882627 , -0.11792669],
        ...,
        [-0.14582965,  0.00396442, -0.09858604, ...,  0.12130584,
         -0.16073582, -0.08105814],
        [-0.12470108, -0.0416171 , -0.13449362, ..., -0.09015618,
         -0.0961545 , -0.16179651],
        [ 0.13368173, -0.11784005,  0.16386661, ...,  0.01633376,
         -0.03246503,  0.11596475]], dtype=float32),
 array([ 0.13286014, -0.00443654,  0.15505134,  0.05655713,  0.06233649,
         0.10812484, -0.08199704,  0.07525398,  0.09071808,  0.13037367,
        -0.00155896, -0.10574505,  0.05234814, -0.12441284, -0.08678089,
        -0.15455662,  0.04596509,  0.16246174, -0.09986725,  0.08356453,
         0.15762024, -0.06775191, -0.00663131,  0.07244875,  0.060