# **Second Model**
(Based on current understanding of task)

In [None]:
import torch, torchvision
from torch import nn
from torchvision import models

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Scratch VGG16 with the default pretrained weights, used by the cells below
# to inspect the architecture.
a=models.vgg16(weights=models.VGG16_Weights.DEFAULT)

In [None]:
# Show the layers of the convolutional feature extractor.
print(a.features)

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_si

In [None]:
# Show the stock classifier head (recorded output: three Linear layers, the
# last one producing 1000 outputs).
print(a.classifier)

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=4096, out_features=1000, bias=True)
)


In [None]:
# Show layer 30 of the feature extractor (recorded output: a 2x2 max-pool).
print(a.features[30])

MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)


In [None]:
# NOTE(review): the recorded output below shows the classifier already
# replaced by Identity, but no visible cell performs that replacement --
# presumably it was run in a cell that has since been removed; confirm.
print(a.classifier)

Sequential(
  (0): Identity()
)


In [None]:
# Push a dummy batch of two 3x224x224 images through the network to check the
# output size (recorded output: [2, 25088], i.e. 512*7*7 features per image).
b = torch.ones((2,3,224,224))
print(a(b).shape)

torch.Size([2, 25088])


## Dataset

In [None]:
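# TODO: build the Dataset/DataLoaders (`train_loader`, `val_loader`) used by the
# training cells below; each batch must unpack as
# (ground, aerial, segmented_ground, synthetic_aerial, segmented_aerial).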

## Helper Modules

In [None]:
class Identity(nn.Module):
    """Pass-through layer whose ``forward`` returns its input unchanged.

    It holds no parameters; it is used to stand in for a sub-module (such as
    a VGG classifier head) so that the preceding features come out untouched.
    """

    def forward(self, x):
        # No computation at all: hand back the very same object.
        return x

## Upper Branch

### simplified version

In [None]:
class GroundBranchSim(nn.Module):
  """Simplified ground branch: VGG16 logits -> small MLP -> 512-d embedding.

  Two pretrained VGG16 networks are created. ``vgg2`` keeps the stock RGB
  input and encodes the ground view; ``vgg1`` has its first convolution
  replaced by a freshly initialised 1-channel convolution and encodes the
  segmentation map. The 1000-way VGG outputs are used as features and, when
  ``use_seg`` is true, concatenated before the feed-forward head.

  Args:
    use_seg: If true, the segmentation map is encoded as well and the head
      takes 2000 inputs instead of 1000.
  """

  def __init__(self, use_seg=False):
    super().__init__()

    self.use_seg = use_seg

    # NOTE: vgg1 is built even when use_seg is False so that the module
    # layout (and therefore the state_dict keys) does not depend on the flag.
    self.vgg1 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)
    self.vgg2 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)

    # Freeze the pretrained convolutional features for fine-tuning. This is
    # done BEFORE swapping vgg1's first conv so the new layer stays trainable.
    self.vgg1.features.requires_grad_(False)
    self.vgg2.features.requires_grad_(False)

    # Segmentation maps are single-channel: replace the 3-channel input conv.
    self.vgg1.features[0] = nn.Conv2d(1, 64, kernel_size=3, padding=1)
    nn.init.kaiming_normal_(self.vgg1.features[0].weight)

    # The stock classifier is kept, so each VGG emits 1000 values. Replace
    # classifier[6] on both networks if a different width is wanted.
    vgg_out = 1000
    in_features = 2 * vgg_out if use_seg else vgg_out

    # Feed-forward head turning the VGG output into the embedding.
    # TODO: decide final embedding size.
    self.FNN = nn.Sequential(
      nn.Linear(in_features, 1024),
      nn.LayerNorm(1024),
      nn.ReLU(),

      nn.Linear(1024, 512)
    )

  def forward(self, ground_view, segmented_ground=None):
    """Embed a batch of ground views.

    Args:
      ground_view: Batch passed to ``vgg2`` (stock 3-channel VGG16 input).
      segmented_ground: Batch passed to ``vgg1`` (1-channel input). Only
        read when ``use_seg`` is true; may be omitted otherwise.

    Returns:
      The output of the feed-forward head (last layer has 512 outputs).

    Raises:
      ValueError: If ``use_seg`` is true and ``segmented_ground`` is None.
    """
    x = self.vgg2(ground_view)

    if self.use_seg:
      if segmented_ground is None:
        raise ValueError("segmented_ground is required when use_seg=True")
      # The segmentation encoder is vgg1 (the one with the 1-channel conv);
      # there is no vgg3 on this module.
      x_segmented = self.vgg1(segmented_ground)
      x = torch.cat((x, x_segmented), dim=-1)

    return self.FNN(x)

### amplified version

In [None]:
class GroundBranch(nn.Module):
  """Ground branch working on VGG16 feature maps instead of logits.

  Both VGG16 classifiers are replaced by an identity, so each network emits
  the flattened 25088-d (512*7*7) feature vector, which is reshaped back to
  ``(B, 512, 7, 7)``. With ``use_seg`` the segmentation features are turned
  into a per-location mask by a 1x1 convolution, multiplied into the ground
  features, and both maps are global-average-pooled and concatenated
  (B x 1024). Without it only the pooled ground features (B x 512) are used.

  Args:
    use_seg: If true, use the segmentation map to mask the ground features.
  """

  def __init__(self, use_seg=False):
    super().__init__()

    self.use_seg = use_seg

    # vgg1 encodes the 1-channel segmentation map, vgg2 the RGB ground view.
    self.vgg1 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)
    self.vgg2 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)

    # Freeze the pretrained convolutional features for fine-tuning. This is
    # done BEFORE swapping vgg1's first conv so the new layer stays trainable.
    self.vgg1.features.requires_grad_(False)
    self.vgg2.features.requires_grad_(False)

    # Segmentation maps are single-channel: replace the 3-channel input conv.
    self.vgg1.features[0] = nn.Conv2d(1, 64, kernel_size=3, padding=1)
    nn.init.kaiming_normal_(self.vgg1.features[0].weight)

    # Remove the fully connected layers so the VGGs return raw features.
    self.vgg1.classifier = nn.Sequential(Identity())
    self.vgg2.classifier = nn.Sequential(Identity())

    self.conv = nn.Conv2d(512,512,1)
    # NOTE(review): sigma is registered but never applied in forward();
    # presumably the mask was meant to be sigma(conv(x)) -- confirm before
    # enabling, since it changes the numerics.
    self.sigma = nn.Sigmoid()
    self.GAP = nn.AdaptiveAvgPool2d((1,1))

    # With segmentation the head sees pooled ground + pooled segmentation
    # features (2 * 512); without it only the pooled ground features (512).
    in_features = 1024 if use_seg else 512

    # Feed-forward head turning the pooled features into the embedding.
    # TODO: decide final embedding size.
    self.FNN = nn.Sequential(
      nn.Linear(in_features, 1024),
      nn.LayerNorm(1024),
      nn.ReLU(),

      nn.Linear(1024, 512)
    )

  def forward(self, ground_view, segmented_ground=None):
    """Embed a batch of ground views.

    Args:
      ground_view: Batch passed to ``vgg2`` (stock 3-channel VGG16 input).
      segmented_ground: Batch passed to ``vgg1`` (1-channel input). Only
        read when ``use_seg`` is true; may be omitted otherwise.

    Returns:
      The output of the feed-forward head (last layer has 512 outputs).

    Raises:
      ValueError: If ``use_seg`` is true and ``segmented_ground`` is None.
    """
    x_ground = self.vgg2(ground_view).view(-1,512,7,7)

    if self.use_seg:
      if segmented_ground is None:
        raise ValueError("segmented_ground is required when use_seg=True")
      # The segmentation encoder is vgg1; there is no vgg3 on this module.
      x_segmented = self.vgg1(segmented_ground).view(-1,512,7,7)

      mask = self.conv(x_segmented)
      # Out-of-place multiply: avoids mutating a view of the VGG output.
      x_ground = x_ground * mask

      x_ground = self.GAP(x_ground).flatten(1)        # B x 512
      # Pool the segmentation features too, so both tensors are 2-D and can
      # be concatenated.
      x_segmented = self.GAP(x_segmented).flatten(1)  # B x 512

      x = torch.cat((x_ground, x_segmented), dim=-1)  # B x 1024
    else:
      x = self.GAP(x_ground).flatten(1)               # B x 512

    return self.FNN(x)

## Lower Branch

### simplified version

In [None]:
class AerialBranchSim(nn.Module):
  """Simplified aerial branch: three VGG16 outputs -> MLP -> 512-d embedding.

  ``vgg1`` (first conv swapped for a 1-channel one) encodes the segmented
  aerial image; ``vgg2`` (stock input) encodes both the synthetic and the
  candidate aerial image. The three 1000-way outputs are concatenated
  (3000 values) and passed through the feed-forward head.
  """

  def __init__(self):
    super().__init__()

    self.vgg1 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)
    self.vgg2 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)

    # Freeze the pretrained convolutional layers of both backbones; the
    # replacement conv installed below is created afterwards and stays
    # trainable.
    for backbone in (self.vgg1, self.vgg2):
      for weight in backbone.features.parameters():
        weight.requires_grad = False

    # Single-channel input layer for vgg1, Kaiming-initialised.
    first_conv = nn.Conv2d(1, 64, kernel_size=3, padding=1)
    self.vgg1.features[0] = first_conv
    nn.init.kaiming_normal_(first_conv.weight)

    # The stock 1000-way classifier is kept on both networks; swap
    # classifier[6] if another output width is needed.

    # Feed-forward head: 3000 -> 2048 -> 1024 -> 512.
    # TODO: decide final embedding size.
    head = [
      nn.Linear(3000, 2048),
      nn.LayerNorm(2048),
      nn.ReLU(),
      nn.Linear(2048, 1024),
      nn.LayerNorm(1024),
      nn.ReLU(),
      nn.Linear(1024, 512),
    ]
    self.FNN = nn.Sequential(*head)

  def forward(self, ground_view, synthetic_aerial, segmented_aerial, candidate_aerial):
    """Embed an aerial triple; ``ground_view`` is accepted but not used."""
    seg_feat = self.vgg1(segmented_aerial)
    syn_feat = self.vgg2(synthetic_aerial)
    cand_feat = self.vgg2(candidate_aerial)

    # Concatenation order: synthetic, segmented, candidate.
    joined = torch.cat((syn_feat, seg_feat, cand_feat), dim=-1)
    return self.FNN(joined)

### amplified version

In [None]:
class AerialBranch(nn.Module):
  """Aerial branch working on VGG16 feature maps instead of logits.

  Both VGG16 classifiers are replaced by an identity, so each network emits
  a flattened feature vector that is reshaped to ``(B, 512, 7, 7)``. A 1x1
  convolution over the segmentation features produces a mask that is
  multiplied into the candidate and synthetic feature maps; all three maps
  are then global-average-pooled, concatenated (B x 1536) and passed through
  the feed-forward head.
  """

  def __init__(self):
    super().__init__()

    self.vgg1 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)
    self.vgg2 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)

    # Freeze the pretrained convolutional layers of both backbones; the
    # replacement conv installed below is created afterwards and stays
    # trainable.
    for backbone in (self.vgg1, self.vgg2):
      for weight in backbone.features.parameters():
        weight.requires_grad = False

    # Single-channel input layer for vgg1, Kaiming-initialised.
    first_conv = nn.Conv2d(1, 64, kernel_size=3, padding=1)
    self.vgg1.features[0] = first_conv
    nn.init.kaiming_normal_(first_conv.weight)

    # Drop the fully connected heads so the VGGs return raw features.
    self.vgg1.classifier = nn.Sequential(Identity())
    self.vgg2.classifier = nn.Sequential(Identity())

    self.conv = nn.Conv2d(512, 512, 1)
    # NOTE(review): sigma is registered but never applied in forward();
    # presumably the mask was meant to be sigma(conv(x)) -- confirm.
    self.sigma = nn.Sigmoid()
    self.GAP = nn.AdaptiveAvgPool2d((1, 1))

    # Feed-forward head: 1536 -> 1024 -> 512.
    # TODO: decide final embedding size.
    head = [
      nn.Linear(1536, 1024),
      nn.LayerNorm(1024),
      nn.ReLU(),
      nn.Linear(1024, 512),
    ]
    self.FNN = nn.Sequential(*head)

  def forward(self, ground_view, synthetic_aerial, segmented_aerial, candidate_aerial):
    """Embed an aerial triple; ``ground_view`` is accepted but not used."""
    map_shape = (-1, 512, 7, 7)
    x_segmented = self.vgg1(segmented_aerial).view(*map_shape)
    x_synthetic = self.vgg2(synthetic_aerial).view(*map_shape)
    x_candidate = self.vgg2(candidate_aerial).view(*map_shape)

    mask = self.conv(x_segmented)

    # In-place, exactly like the original `*=`.
    x_candidate.mul_(mask)
    x_synthetic.mul_(mask)

    # Global-average-pool each map and flatten it to B x 512, keeping the
    # concatenation order synthetic, segmented, candidate.
    pooled = []
    for feature_map in (x_synthetic, x_segmented, x_candidate):
      squeezed = self.GAP(feature_map)
      pooled.append(squeezed.view(squeezed.shape[0], -1))

    return self.FNN(torch.cat(pooled, dim=-1))

## Complete Network

In [None]:
class CompNet(nn.Module):
  """Two-branch network returning a (ground, aerial) pair of embeddings.

  Args:
    ground_branch: Module called as ``ground_branch(ground_view,
      segmented_ground)``. Defaults to a new ``GroundBranchSim()``.
    aerial_branch: Module called as ``aerial_branch(ground_view,
      synthetic_aerial, segmented_aerial, candidate_aerial)``. Defaults to
      a new ``AerialBranchSim()``.
  """

  def __init__(self, ground_branch=None, aerial_branch=None):
    super().__init__()
    # Build the defaults lazily: instantiating them in the signature would
    # load four VGG16 networks as soon as the class is defined and share the
    # very same branch objects between every CompNet instance.
    self.GB = GroundBranchSim() if ground_branch is None else ground_branch
    self.AB = AerialBranchSim() if aerial_branch is None else aerial_branch

  def forward(self, ground_view, segmented_ground, synthetic_aerial, segmented_aerial, candidate_aerial):
    """Run both branches and return ``(ground_embedding, aerial_embedding)``."""
    ground_embedding = self.GB(ground_view, segmented_ground)
    # The aerial branches take ground_view as their first positional
    # argument, so it must be forwarded here.
    aerial_embedding = self.AB(ground_view, synthetic_aerial, segmented_aerial, candidate_aerial)
    return ground_embedding, aerial_embedding

## Triplet Loss

In [None]:
class WeightedSoftMarginTripletLoss(nn.Module):
  """Soft-margin triplet loss ``mean(log(1 + exp(w * (d_pos - d_neg))))``.

  ``d_pos`` and ``d_neg`` are Euclidean distances along the last dimension.

  Args:
    margin: Multiplicative weight ``w`` applied to the distance difference.
      Despite its name it scales the difference; it is not an additive
      margin.
  """

  def __init__(self, margin=0.2):
    super().__init__()
    self.margin = margin

  def forward(self, anchor, positive, negatives):
    """Compute the loss.

    Args:
      anchor: Anchor embedding(s).
      positive: Embedding(s) matching the anchor.
      negatives: Non-matching embedding(s); must broadcast against
        ``anchor`` (e.g. a ``(N, D)`` stack against a ``(D,)`` anchor).

    Returns:
      A scalar tensor: the mean loss over all anchor/negative pairs.
    """
    d_pos = torch.norm(anchor - positive, dim=-1, keepdim=True)
    d_neg = torch.norm(anchor - negatives, dim=-1, keepdim=True)
    arg = self.margin * (d_pos - d_neg)

    # log(1 + exp(arg)) == logaddexp(0, arg). zeros_like keeps the zeros on
    # the same device and dtype as the inputs (a plain torch.zeros lives on
    # the CPU and breaks for CUDA tensors).
    return torch.logaddexp(torch.zeros_like(arg), arg).mean()


### Training

In [None]:
def train_epoch(model, dataloader, optimizer, criterion=None, scheduler=None, device=None):
    """Train ``model`` for one pass over ``dataloader``.

    For every sample ``i`` in a batch the ground embedding ``labels[i]`` is
    the anchor, ``predictions[i]`` the positive and every other row of
    ``predictions`` a negative. The per-anchor losses are summed,
    back-propagated, and the optimizer is stepped once per batch.

    Args:
        model: Called as ``model(ground, segmented_ground, synthetic_aerial,
            segmented_aerial, aerial)``; must return ``(labels, predictions)``.
        dataloader: Iterable of ``(ground, aerial, segmented_ground,
            synthetic_aerial, segmented_aerial)`` tensor batches.
        optimizer: Optimizer updating the model parameters.
        criterion: Callable ``criterion(anchor, positive, negatives)``.
            Defaults to a new ``WeightedSoftMarginTripletLoss()``.
        scheduler: Optional LR scheduler, stepped once at the end of the epoch.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if it has none).

    Returns:
        Mean per-sample loss averaged over the processed batches as a float;
        ``0.0`` if no batch was processed.
    """
    if criterion is None:
        criterion = WeightedSoftMarginTripletLoss()
    if device is None:
        # Follow the model so inputs and weights always share a device.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    tot_loss = 0.0
    num_batches = 0

    for batch in dataloader:
        ground, aerial, segmented_ground, synthetic_aerial, segmented_aerial = (
            t.to(device) for t in batch
        )

        # Forward pass
        optimizer.zero_grad()       # resets gradients from previous batch
        labels, predictions = model(ground, segmented_ground, synthetic_aerial, segmented_aerial, aerial)

        batch_size = len(labels)
        if batch_size < 2:
            # A lone sample has no negatives; the mean over an empty set is
            # NaN and would poison the weights, so skip the batch.
            continue

        batch_loss = 0.0
        for i in range(batch_size):
            # Every row except i is a negative for anchor i.
            negatives = torch.cat((predictions[:i], predictions[i + 1:]))
            batch_loss = batch_loss + criterion(labels[i], predictions[i], negatives)

        # Backward pass
        batch_loss.backward()       # computes gradients via backpropagation
        optimizer.step()            # updates weights using gradients

        tot_loss += batch_loss.item() / batch_size
        num_batches += 1

    if scheduler is not None:
        scheduler.step()            # adjusts learning rate after each epoch

    return tot_loss / num_batches if num_batches else 0.0

def evaluate(model, dataloader, device, criterion=None):
    """Compute the mean triplet loss of ``model`` over ``dataloader``.

    Runs in eval mode with gradients disabled. For every sample ``i`` in a
    batch the ground embedding ``labels[i]`` is the anchor, ``predictions[i]``
    the positive and every other row of ``predictions`` a negative.

    Args:
        model: Called as ``model(ground, segmented_ground, synthetic_aerial,
            segmented_aerial, aerial)``; must return ``(labels, predictions)``.
        dataloader: Iterable of ``(ground, aerial, segmented_ground,
            synthetic_aerial, segmented_aerial)`` tensor batches.
        device: Device the batches are moved to.
        criterion: Callable ``criterion(anchor, positive, negatives)``.
            Defaults to a new ``WeightedSoftMarginTripletLoss()``.

    Returns:
        Mean per-sample loss averaged over the processed batches as a float;
        ``0.0`` if no batch was processed.
    """
    if criterion is None:
        criterion = WeightedSoftMarginTripletLoss()

    model.eval()
    tot_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            ground, aerial, segmented_ground, synthetic_aerial, segmented_aerial = (
                t.to(device) for t in batch
            )

            # Forward pass
            labels, predictions = model(ground, segmented_ground, synthetic_aerial, segmented_aerial, aerial)

            batch_size = len(labels)
            if batch_size < 2:
                # A lone sample has no negatives; its loss would be NaN.
                continue

            batch_loss = 0.0
            for i in range(batch_size):
                # Every row except i is a negative for anchor i.
                negatives = torch.cat((predictions[:i], predictions[i + 1:]))
                batch_loss = batch_loss + criterion(labels[i], predictions[i], negatives)

            tot_loss += float(batch_loss) / batch_size
            num_batches += 1

    return tot_loss / num_batches if num_batches else 0.0

In [None]:
# Build the default network and move it to `device`: the training and
# evaluation loops send every batch there, so the weights must live there too.
model = CompNet().to(device)

In [None]:
# Main training
# NOTE(review): `optimizer`, `train_loader` and `val_loader` are not defined
# in the visible cells; they must exist before this cell is run.
num_epochs = 1
for epoch in range(num_epochs):
    # train_epoch's 4th positional parameter is `criterion`, so `device` must
    # not be passed positionally there; the default criterion is used.
    train_loss = train_epoch(model, train_loader, optimizer)
    val_loss = evaluate(model, val_loader, device)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"\tTrain Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    # Save a checkpoint every 5 epochs and always after the final epoch, so a
    # run shorter than 5 epochs still leaves its weights on disk.
    if (epoch+1) % 5 == 0 or epoch + 1 == num_epochs:
        torch.save(model.state_dict(), f"model_epoch_{epoch+1}.pth")