# Notebook Initialization

In [1]:
#@title
### Note: this stuff is also borrowed from Corner_detection.ipynb ###

#
# Import required libraries and helper functions
#

import os     # Used to sort files for file reading
from os import listdir
from os.path import isfile, join
import re     # Regex, used in the file reading

import cv2          # Open Computer Vision 2 - a must for any image manipulation
import imutils
import matplotlib.pyplot as plt

import numpy as np  # Other staples for working with images
import random
import copy
from skimage.util import random_noise

import torch
from torch import nn    # Pytorch
from torch import optim
from torch.utils.data import TensorDataset, DataLoader, sampler
from torchvision import transforms

import scipy.misc       # Don't remember what this is used for - may not be needed

import math

In [2]:
#@title
# Clone our labeled images stored in GitHub

!git clone https://github.com/Hunterdjensen/CS_6955_Project.git

fatal: destination path 'CS_6955_Project' already exists and is not an empty directory.


In [3]:
#@title
# Determine what type of device we are using

if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

print('using device:', device)

using device: cuda:0


# General Helper Functions
Borrowed from Corner_detection.ipynb, with a few minor edits
to match the segmentation task

In [4]:
#
# Helper functions for parsing the images
#

def line_to_points(line):
    points_ascii = line.split("\t")[2]  # The 3rd column contains points data
    points = eval(points_ascii)         # Convert the string into a list
    return points


def get_num_cards(line):
    return int(line.split("\t")[1])


# Function for displaying BGR images (opencv defaults to reading images in BGR format)
def displayRGB(image, points=None):
    #Compatibility with (C, W, H) data
    if image.shape[0] == 3:
        image = np.moveaxis(image, 0, -1)
    temp_img = image.copy()
    if points is not None:
        for j, frame in enumerate(points):
            for i, point in enumerate(frame):
                cv2.line(temp_img, tuple(point), tuple(points[j][i - 1]), (255, 0, 0), thickness=2)
    plt.axis("off")
    # plt.imshow(cv2.cvtColor(temp_img, cv2.COLOR_BGR2RGB))
    plt.imshow(temp_img)
    plt.axis('off')
    plt.show()


# So that we can take various sized rectangular photos and make them all square:
def expand_image(img, points, size=225):
    height, width, _ = img.shape
    points_oob = 0

    # Check how many points are out of bounds
    if points is not None:
      for card in points:
        for point in card:
          if (point[0] < 0) or (point[0] >= width) or (point[1] < 0) or (point[1] >= height):
            points_oob += 1
    
    # If picture is too big, scaled it down to the size
    if (height > width):
        if (height > size):
            img = imutils.resize(img, height=size)
    else:
        if (width > size):
            img = imutils.resize(img, width=size)

    # Regrab the new height and width
    height, width, _ = img.shape

    # If no points were passed in, assume the corners of the image
    if points is None:
        points = []
        points.append([])
        points[0].append((0, height))
        points[0].append((0, 0))
        points[0].append((width, 0))
        points[0].append((width, height))


    # Add border (if the image isn't already square)
    left = int((size - width) / 2)  # So if image has width of 125, then left is 100/2=50
    right = int(size - (left+width))  # Remainder
    top = int((size - height) / 2)
    bottom = int(size - (top+height))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, (0, 0, 0))

    # Update points
    for i, card in enumerate(points):      # The first dim of 'points' is each card
        for j, point in enumerate(card):   # Second dimension is a list of 4 tuples (each corner point)
            points[i][j] = (point[0]+left, point[1]+top)    # Shift it to match the new image

    return (img, points, points_oob)


def read_images(max_samples=50000, cards_per_image=1, show_examples=False, show_image=None, max_corners_oob=0):
    my_path = 'CS_6955_Project/Examples'
    onlyfiles = [f for f in listdir(my_path) if isfile(join(my_path, f))]
    onlyfiles = sorted(onlyfiles)
    os.chdir(my_path)   # cd into the directory 'my_path'

    label_filename = 'classification_results.txt'
    label_file = open(label_filename, "a")
    label_file.close()

    x = np.array([], dtype=int)   # Numpy array holding each example image
    y = np.array([], dtype=int)   # Array holding the labels  (only supports exactly one card/img)

    i = 0
    for filename in onlyfiles:
        # Search the text file for the labels corresponding to this image
        found = False
        with open(label_filename) as file:
            for line in file:
                if filename in line:
                    num_cards = get_num_cards(line)
                    points = line_to_points(line)
                    found = True
                    break
        if found and re.match(r".*.jpe", filename) and (num_cards == cards_per_image):
            # Read in next photo
            img = cv2.imread(filename)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img, points, points_oob = expand_image(img, points)
            if (points_oob > max_corners_oob):
                continue  # If too many corners are out-of-bounds, skip this one
            x = np.concatenate((x, img[None,:,:,:]), axis=0) if x.size else img[None,:,:,:]
            y = np.concatenate((y, np.array(points)[None,:,:,:]), axis=0) if y.size else np.array(points)[None,:,:,:]
            if (show_examples and (i < 2)) or ((show_image is not None) and (show_image in filename)):
                displayRGB(img, points)
            if (y.shape[0] >= max_samples):
                break   # Exit once you have enough samples
            i += 1

    if x.size:
        x = np.moveaxis(x, -1, 1)   # Move the depth to the second position
    #y = np.squeeze(y, axis=1) # Temporarily remove the 1th dimension (which contains how many cards/image) so it matches coords from DSNT
    print("X shape: " + str(x.shape))    # Dimensions are: [example, depth, height, width]
    print("Y shape: " + str(y.shape))    # Dimensions are: [example, cards in image, corners, coordinate]
    
    os.chdir('../..')
    return x, y


def shuffle_two_arrays(x, y):
    rand_idxs = np.random.permutation(y.shape[0])
    return x[rand_idxs], y[rand_idxs]
  

def split_dataset(x, y):
    x, y = shuffle_two_arrays(x, y)
    num_examples = x.shape[0]
    p8 = round(num_examples * 0.8)
    p9 = round(num_examples * 0.9)
    x_train = x[:p8, :, :, :]
    try:
      y_train = y[:p8, :, :, :]
    except:   # If y only has 3 dimensions (when only 1 point, not 4)
      y_train = y[:p8, :, :]
    x_val = x[p8:p9, :, :, :]
    try:
      y_val = y[p8:p9, :, :, :]
    except:
      y_val = y[p8:p9, :, :]
    x_test = x[p9:, :, :, :]
    try:
      y_test = y[p9:, :, :, :]
    except:
      y_test = y[p9:, :, :]
    return x_train, y_train, x_val, y_val, x_test, y_test

In [5]:
os.getcwd()

'C:\\Users\\sean\\Documents\\GitHub\\Portfolio\\Pokemon Card Detection'

In [6]:
# x, y = read_images(max_samples=50, show_examples=True, cards_per_image=4)
# displayRGB(x[0, :, :, :], y[0, :, :, :])

# X = (N, C=3, W=225, H=225)
# Y = (N, num_cards, num_points=4, point_dim=2)

In [7]:
def numpy_normalize_rgb(x, show_examples=False):
  '''
    Takes an input x of size (N, 3, H, W) and returns x_norm of same shape
    Each RGB value will have the dataset mean subtracted from it, and divided
    by the std.
  '''
  x_t = x.transpose((0, 2, 3, 1))
  x_norm = (x_t - x_t.mean(axis=(0, 1, 2), keepdims=True)) / x_t.std(axis=(0, 1 ,2), keepdims=True)
  x_norm = np.clip(x_norm, 0, 1)    # Images are now in the [-1,2.5] range, clip to [0,1]
  x_norm = (x_norm * 255).astype('uint8')   # And convert back to [0,255] ints
  if show_examples:
    plt.subplot(1,2,1), plt.imshow(x_t[0])
    plt.subplot(1,2,2), plt.imshow(x_norm[0])
    plt.show()
  x_norm = x_norm.transpose((0, 3, 1, 2))
  return x_norm

# My Helper Functions
Things I created while making the segmentation net

In [8]:
#Converts a set of points to a segmentation mask for a single image
def points_to_segmentation(image, points, show=False):
  # image.shape = (C=3, W=225, H=225)
  # points.shape = (num_cards, num_points=4, point_dim=2)
  # returns: segments = (W=225, H=225)
  #          mask is where segments[:, :] == 1
  #          0 otherwise

  (C, W, H) = image.shape
  image = np.moveaxis(image, 0, -1)
  segment_image = np.zeros_like(image)

  for n in range(points.shape[0]):
    cur_points = []
    cv2.fillConvexPoly(segment_image, points[n], (255, 0, 0))
  
  if(show == True):
    plt.imshow(image)
    plt.imshow(segment_image, alpha=0.5)
    plt.show()
  
  ret = np.zeros((W, H))
  ret[segment_image.sum(axis=2) != 0] = 1
  return ret

#out = points_to_segmentation(x[0], y[0], show=True)
#plt.imshow(out)

In [9]:
# Compiles the data into train, val, test sets suitable for the segmentation net
# returns pytorch tensors for (x_train, y_train, x_val, y_val, x_test, y_test)
# Note:  For memory concerns, data tensors are kept on CPU
#        Transfer batches to/from cuda when using them
def compile_data(device=torch.device('cpu'), dtype=torch.float32):    
  total_data = 0
  for c in range(1, 11): # Retrieve images with different amounts of cards
    (cur_x, cur_y) = read_images(max_samples=10000, show_examples=False, cards_per_image=c)
    if(len(cur_x) == 0):
      continue
    (N, C, W, H) = cur_x.shape
    total_data = total_data + N

    # Turn the point labels into segmentation labels
    cur_y_masks = np.zeros((N, W, H))
    for img in range(N):
      cur_y_masks[img, :, :] = points_to_segmentation(cur_x[img], cur_y[img])

    # Add data to the collection
    if c == 1:
      all_np_x = cur_x
      all_np_y = cur_y_masks
    else:
      all_np_x = np.concatenate((all_np_x, cur_x))
      all_np_y = np.concatenate((all_np_y, cur_y_masks))
    
  assert total_data == all_np_x.shape[0]

  # Process data for the net
  np_sets = split_dataset(numpy_normalize_rgb(all_np_x), all_np_y)
  tensors = []
  for arr in np_sets:
    tensors.append(torch.tensor(arr).to(device, dtype))

  tensors.insert(0, all_np_x)
  tensors.insert(1, all_np_y)

  return tuple(tensors)


# The result of this function, stored as global variables
(all_imgs, all_labels, x_train, y_train, x_val, y_val, x_test, y_test) = compile_data()

X shape: (2861, 3, 225, 225)
Y shape: (2861, 1, 4, 2)
X shape: (36, 3, 225, 225)
Y shape: (36, 2, 4, 2)
X shape: (17, 3, 225, 225)
Y shape: (17, 3, 4, 2)
X shape: (8, 3, 225, 225)
Y shape: (8, 4, 4, 2)
X shape: (4, 3, 225, 225)
Y shape: (4, 5, 4, 2)
X shape: (5, 3, 225, 225)
Y shape: (5, 6, 4, 2)
X shape: (0,)
Y shape: (0,)
X shape: (4, 3, 225, 225)
Y shape: (4, 8, 4, 2)
X shape: (12, 3, 225, 225)
Y shape: (12, 9, 4, 2)
X shape: (2, 3, 225, 225)
Y shape: (2, 10, 4, 2)


In [10]:
for m in (x_train, y_train, x_val, y_val, x_test, y_test):
  print(m.shape)

torch.Size([2359, 3, 225, 225])
torch.Size([2359, 225, 225])
torch.Size([295, 3, 225, 225])
torch.Size([295, 225, 225])
torch.Size([295, 3, 225, 225])
torch.Size([295, 225, 225])


# Our SegNet Structure
This neural net is based around SegNet


**Sources used:**
*  SegNet paper - https://arxiv.org/pdf/1511.00561.pdf
  * Key Ideas:
  * Encoder portion is essentially the convolutional part of VGG16, pretrained on ImageNet
  * Pooling is achieved by Max Sampling
  * Blocks in the Decoder correspond to blocks in the Encoder with the same W/H, and upsampling to these blocks uses the indices of the max sampling in the corresponding encoder block
  * The upsampling used is sparse, in that all other values are 0s.
*  VGGNet structure - https://neurohive.io/en/popular-networks/vgg16/
  * Used for the design of the VGG-like SegNets
  * Their implementations are available below, but because of memory issues they aren't tested
* The paper for an earlier version of SegNet, now termed "SegNet_Basic" - https://arxiv.org/pdf/1505.07293.pdf

In [11]:
# A convolutional layer, combined with a batch norm and a ReLU

class ConvBlock(nn.Module):
  def __init__(self, channels_list, kernel_size, dtype=torch.float32, device=torch.device('cuda:0')):
    # channels_list:   A list of in/out channels, where channels_list[n] is the
    #                  in_channel for the n'th conv layer, as well as the
    #                  out_channel for the n-1'th conv layer.
    #                  Channels_list[0] is the in_channels for the block,
    #                  and channels_list[-1] is the out_channels for the block.
    # kernel_size:     Size of the kernel in each convolutional layer.
    super().__init__()
    layers = []
    for i in range(1, len(channels_list)):
      #Layers in a VGG16 net
      layers.append(nn.Conv2d(channels_list[i-1], channels_list[i], kernel_size=kernel_size, padding=math.floor(kernel_size/2)).to(device, dtype))
      layers.append(nn.BatchNorm2d(channels_list[i]).to(device, dtype))
      layers.append(nn.ReLU().to(device, dtype))
    self.net = nn.Sequential(*layers).to(device, dtype)
  

  def forward(self, x):
    return self.net(x)

In [12]:
# A block of convolutions in the encoder
# Ends with a max pool layer

class DownBlock(nn.Module):
  def __init__(self, channels_list, kernel_size, pooling_factor, dtype=torch.float32, device=torch.device('cuda:0')):
    # channels_list:   A list of in/out channels, where channels_list[n] is the
    #                  in_channel for the n'th conv layer, as well as the
    #                  out_channel for the n-1'th conv layer.
    #                  Channels_list[0] is the in_channels for the block,
    #                  and channels_list[-1] is the out_channels for the block.
    # kernel_size:     Size of the kernel in each convolutional layer.
    # pooling_factor:  The kernel/stride of the maxpool.
    #                  Divides the W/H of the image by pooling_factor.
    super().__init__()
    self.ConvNet = ConvBlock(channels_list, kernel_size, dtype=dtype, device=device).to(device, dtype)
    #Downsampling via Maxpool
    self.MaxPool = nn.MaxPool2d(pooling_factor, pooling_factor, return_indices=True).to(device, dtype)
    

  def forward(self, x):
    # input:  x = (N, C, W, H)
    # output: x_out = (N, C_out, W_out, H_out)
    #         indices = (N, C, W, H)? shouldn't need to deal with size here
    #         (W_in, H_in)
    x = self.ConvNet(x)
    (N, C, W, H) = x.shape
    (x, indices) = self.MaxPool(x)
    # For memory concerncs, temporarily take the indices off the gpu
    return x, indices.to(torch.device('cpu')), W, H

In [13]:
# A block of convolutions in the decoder
# Starts with an up-sampling layer (MaxUnpool)

class UpBlock(nn.Module):
  def __init__(self, channels_list, kernel_size, unpooling_factor, dtype=torch.float32, device=torch.device('cuda:0')):
    # channels_list:     A list of in/out channels, where channels_list[n] is the
    #                    in_channel for the n'th conv layer, as well as the
    #                    out_channel for the n-1'th conv layer.
    #                    Channels_list[0] is the in_channels for the block,
    #                    and channels_list[-1] is the out_channels for the block.
    # kernel_size:       Size of the kernel in each convolutional layer.
    # unpooling_factor:  the kernel_size/stride of the unpooling layer
    super().__init__()
    self.unpool = nn.MaxUnpool2d(kernel_size=unpooling_factor, stride=unpooling_factor).to(device, dtype)
    self.ConvNet = ConvBlock(channels_list, kernel_size, dtype=dtype, device=device).to(device, dtype)
    

  def forward(self, x, unpool_params):
    (indices, W, H) = unpool_params
    # Put the indices back on the gpu
    x = self.unpool(x, indices.to(device), output_size=torch.Size([x.shape[0], x.shape[1], W, H]))
    return self.ConvNet(x)

In [14]:
# The combined network

class OurSegNet(nn.Module):
  def __init__(self, net_params, dtype=torch.float32, device=torch.device('cuda:0')):
    # net_params:  A dictionary of parameters for the blocks in the net
    super().__init__()

    self.device = device
    self.dtype = dtype
    self.DownBlocks = torch.nn.ModuleList()
    self.UpBlocks = torch.nn.ModuleList()

    for n in net_params['DownBlocks']:
      self.DownBlocks.append(DownBlock(n['channels_list'], n['kernel_size'], n['pooling_factor'], dtype=dtype, device=device))
    for n in net_params['UpBlocks']:
      self.UpBlocks.append(UpBlock(n['channels_list'], n['kernel_size'], n['unpooling_factor'], dtype=dtype, device=device))
    if len(self.DownBlocks) != len(self.UpBlocks):
      raise RuntimeError('Mismatched number of blocks')
  

  def forward(self, x):
    # x: input of shape (N, 3, W, H)
    #forward pass through downsampling blocks
    pooling_params = []
    for d in self.DownBlocks:
      (x, indices, W, H) = d(x)
      pooling_params.insert(0, (indices, W, H))
    
    #forward pass through upsampling blocks
    for i in range(len(pooling_params)):
      x = self.UpBlocks[i](x, pooling_params[i])

    return x


In [15]:
# A dictionary of different SegNet network parameters
networks = {
    'first_net': {           # My implementation, based off of SegNet but with fewer layers/weights
        'DownBlocks':[       # Achieved ~98 val accuracy in 20 epochs on the Adam optimizer, batch_size=100
                             # ~116,342 parameters
                {
                    'channels_list' : [3, 6, 10],  #Remider: there are 2 ConvLayers here
                    'kernel_size' : 7,             #A (3->6) layer and a (6->10) layer
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [10, 10, 15],  
                    'kernel_size' : 7,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [15, 15, 32],  
                    'kernel_size' : 5,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [32, 32, 64],  
                    'kernel_size' : 5,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64, 128],  
                    'kernel_size' : 3,
                    'pooling_factor' : 2
                }
        ],
        'UpBlocks':[
                    {
                    'channels_list' : [128, 64, 64],  
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 32, 32],  
                    'kernel_size' : 5,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [32, 15, 15],  
                    'kernel_size' : 5,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [15, 10, 10],  
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [10, 6, 2],  #The final channels_out represents the possible classes
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                }
        ]
    },


    'VGG16' : {           # A structure more closely resembling VGG16, which is what the original SegNet uses
        'DownBlocks': [   # Unfortunately I keep running out of memory before I can use it.
                {    
                    'channels_list' : [3, 64, 64],  #Remider: there are 2 ConvLayers here;
                    'kernel_size' : 3,              #A (3->64) layer and a (64->64) layer
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 128, 128],  
                    'kernel_size' : 3,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [128, 256, 256, 256],  
                    'kernel_size' : 3,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [256, 512, 512, 512],  
                    'kernel_size' : 3,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [512, 512, 512, 512],  
                    'kernel_size' : 3,
                    'pooling_factor' : 2
                }
        ],
        'UpBlocks':[
                    {
                    'channels_list' : [512, 512, 512, 512],  
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [512, 512, 512, 256],  
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [256, 256, 256, 128],  
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [128, 128, 64],  
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64, 2],  #The final channels_out represents the possible classes
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                }
        ]
    },



    'VGG11' : {           # Like VGG16, but 5 fewer Conv layers and fewer weights
        'DownBlocks': [   # Also had memory issues with this one
                {    
                    'channels_list' : [3, 64],  
                    'kernel_size' : 3,              
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 128],  
                    'kernel_size' : 3,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [128, 256, 256],  
                    'kernel_size' : 3,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [256, 512, 512],  
                    'kernel_size' : 3,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [512, 512, 512],  
                    'kernel_size' : 3,
                    'pooling_factor' : 2
                }
        ],
        'UpBlocks':[
                    {
                    'channels_list' : [512, 512, 512],  
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [512, 512, 256],  
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [256, 256, 128],  
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [128, 64],  
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 2],  #The final channels_out represents the possible classes
                    'kernel_size' : 3,
                    'unpooling_factor' : 2
                }
        ]
    },



    'SG_Basic' : {        # Based on an earler version of SegNet, called SegNet_Basic
        'DownBlocks': [   # The original SGB doesn't use ReLU's, unlike this one
                {         # Achieved 97.8% val acc in 20 epochs using SGD, batch_size=100
                    'channels_list' : [3, 64],  
                    'kernel_size' : 7,              
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64],  
                    'kernel_size' : 7,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64],  
                    'kernel_size' : 7,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64],  
                    'kernel_size' : 7,
                    'pooling_factor' : 2
                }
        ],
        'UpBlocks':[
                {
                    'channels_list' : [64, 64],  
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64],  
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64],  
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 2],  #The final channels_out represents the possible classes
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                }
        ]
    },



     'SGBv2' : {          # Like SegNet_Basic, but with more parameters and layers
        'DownBlocks': [   
                          # ~461,440 parameters
                {    
                    'channels_list' : [3, 64],  
                    'kernel_size' : 7,              
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64],  
                    'kernel_size' : 7,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64],  
                    'kernel_size' : 7,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64, 64],  
                    'kernel_size' : 7,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64, 64],  
                    'kernel_size' : 7,
                    'pooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64, 64],  
                    'kernel_size' : 7,
                    'pooling_factor' : 2
                }
                
        ],
        'UpBlocks':[
                {
                    'channels_list' : [64, 64, 64],  
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64, 64],  
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64, 64],  
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64],  
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 64],  
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                },
                {
                    'channels_list' : [64, 2],  #The final channels_out represents the possible classes
                    'kernel_size' : 7,
                    'unpooling_factor' : 2
                }
        ]
    }

}

# Training the Net
The training net is stored in net_to_train

In [16]:
#Borrowed from homework2 (modified)
# Checks the accuracy of the model on either the validation set or the training set
# accuracy is the percentage of pixels over all samples that were classified correctly
def check_accuracy(model, mode='val'):
    if mode == 'val':
        print('Checking accuracy on validation set')
        x = x_val
        y = y_val
    elif mode == 'test':
        print('Checking accuracy on test set')   
        x = x_test
        y = y_test
    else:
      raise AttributeError('Bad accuracy check mode')

    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        x = x.to(torch.device('cuda:0'))
        y = y.to(torch.device('cuda:0'))

        scores = model(x)
        _, preds = scores.max(1)

        total = 1
        for s in preds.shape:
          total = total * s

        correct_preds = torch.zeros_like(preds)
        correct_preds[preds==y] = 1
        num_correct = correct_preds.sum()

        acc = float(num_correct)/total

        print('Got %d / %d correct (%.2f)' % (num_correct, total, 100 * acc))
        loss = torch.nn.functional.cross_entropy(scores, y.long())

        x = x.to(torch.device('cpu'))
        y = y.to(torch.device('cpu'))
        return loss

In [17]:
#Borrowed from homework2 (modified)
# Trains the given model with the given optimizer
def train_model(model, optimizer, max_batch_size=100, epochs=1, print_every=10):
    """  
    Inputs:
    - model:           A PyTorch Module giving the model to train.
    - optimizer:       An Optimizer object we will use to train the model
    - max_batch_size:  Maximum minibatch size
    - epochs:          (Optional) A Python integer giving the number of epochs to train for
    - print_every:     Number of GD steps between each printed update
    
    Returns: Nothing, but prints model accuracies during training.
    """
    total_data = x_train.shape[0]
    iter = 0
    train_losses = []
    val_losses = []
    for e in range(epochs):
        batch_index = 0
        #shuffle data before starting each epoch 
        (x_shuffled, y_shuffled) = shuffle_two_arrays(x_train, y_train)

        while batch_index < total_data:
            batch_end = batch_index + max_batch_size
            if batch_end > total_data:
              batch_end = total_data
            
            model.train()  # put model to training mode
            x = x_shuffled[batch_index:batch_end, :, :, :]
            y = y_shuffled[batch_index:batch_end, :, :]

            # Put data onto the cuda
            x = x.to(torch.device('cuda:0'))
            y = y.to(torch.device('cuda:0'))

            # Run model and calculate loss
            scores = model(x)
            loss = torch.nn.functional.cross_entropy(scores, y.long())
            train_losses.append(loss.item())

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each  parameter of the model.
            loss.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()

            if iter % print_every == 0:
                print('Epoch %d, GD Iteration %d, loss = %.4f' % (e, iter, loss.item()))
                val_loss = check_accuracy(model, mode='val')
                val_losses.append(val_loss)
                print()

            # Take the data off the cuda
            x = x.to(torch.device('cpu'))
            y = y.to(torch.device('cpu'))

            batch_index = batch_end
            iter = iter + 1
    
    plt.plot(range(iter), train_losses, label='training loss')
    plt.plot(np.linspace(0, iter, len(val_losses)), val_losses, label='validation loss')
    plt.legend()

In [18]:
# a dictionary of optimizer params
optim_params = {
    'Adam' : {
        'optimizer' : optim.Adam,
        'params' : (1e-3,(0.9,0.999))
    },

    'SGD' : {  # Optimizer used to train the original SegNet
        'optimizer' : optim.SGD,
        'params' : (0.1, 0.9)
    }
}

In [19]:
'''

   Train the net here

'''
net_to_train = OurSegNet(networks['SGBv2'])  #Pick the net to train from the networks dictionary
tp = optim_params['SGD']                     #Pick the optimizer to use from the optim_params dictionary

train_model(net_to_train, tp['optimizer'](net_to_train.parameters(), *tp['params']), epochs=20, max_batch_size=32, print_every=50)

Epoch 0, GD Iteration 0, loss = 0.7662
Checking accuracy on validation set


RuntimeError: CUDA out of memory. Tried to allocate 3.56 GiB (GPU 0; 6.00 GiB total capacity; 3.86 GiB already allocated; 533.62 MiB free; 3.98 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Finding the Bounding Box
These methods are used to take in a single segmentation mask and return one or more bounding boxes that represent the masks.

Created by Hunter Jensen, with minor edits to fit this code.

In [None]:
def displayBGR(image):
    plt.axis("off")
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.show()


def displayGRAY(image):
    plt.axis("off")
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_GRAY2RGB))
    plt.show()

In [None]:
# This function will create a line that starts at (x1, y1) and pass through (x2,y2), continuing to
# the boundaries of the image, which are defined by 0 and maxX and maxY.  Returns an array of points
# (pixels) on that line.
def get_line_long(x1, y1, x2, y2, maxX, maxY):
    minX = 0
    minY = 0

    points = []
    issteep = abs(y2 - y1) > abs(x2 - x1)
    if issteep:
        x1, y1 = y1, x1
        x2, y2 = y2, x2
    rev = False
    if x1 > x2:
        x1 = -x1    # Invert x and y so line is always going in positive direction
        x2 = -x2
        y1 = -y1
        y2 = -y2
        rev = True
    deltax = x2 - x1
    deltay = abs(y2 - y1)
    error = int(deltax / 2)
    y = y1
    ystep = None
    if y1 < y2:
        ystep = 1
    else:
        ystep = -1

    complete = False
    x = x1  # Start at x1
    while not complete:
        # if not rev:
        if issteep:
            if not rev:
                points.append((y, x))
                if (x <= minY) or (x >= maxY) or (y <= minX) or (y >= maxX):
                    complete = True
            elif rev:
                points.append((-y, -x))
                if (x >= -minY) or (x <= -maxY) or (y >= -minX) or (y <= -maxX):
                    complete = True
        else:
            if not rev:
                points.append((x, y))
                if (x <= minX) or (x >= maxX) or (y <= minY) or (y >= maxY):
                    complete = True
            elif rev:
                points.append((-x, -y))
                if (x >= -minX) or (x <= -maxX) or (y >= -minY) or (y <= -maxY):
                    complete = True
        error -= deltay
        if error < 0:
            y += ystep
            error += deltax
        x += 1  # Increment x

    return points

In [None]:
def get_line(x1, y1, x2, y2):
    points = []
    issteep = abs(y2 - y1) > abs(x2 - x1)
    if issteep:
        x1, y1 = y1, x1
        x2, y2 = y2, x2
    rev = False
    if x1 > x2:
        x1, x2 = x2, x1
        y1, y2 = y2, y1
        rev = True
    deltax = x2 - x1
    deltay = abs(y2 - y1)
    error = int(deltax / 2)
    y = y1
    ystep = None
    if y1 < y2:
        ystep = 1
    else:
        ystep = -1
    for x in range(x1, x2 + 1):
        if issteep:
            points.append((y, x))
        else:
            points.append((x, y))
        error -= deltay
        if error < 0:
            y += ystep
            error += deltax
    # Reverse the list if the coordinates were reversed
    if rev:
        points.reverse()
    return points

In [None]:
# (X[i], Y[i]) are coordinates of i'th point.
def polygonArea(X, Y):
    n = len(X)
    # Initialize area
    area = 0.0

    # Calculate value of shoelace formula
    j = n - 1
    for i in range(0, n):
        area += (X[j] + X[i]) * (Y[j] - Y[i])
        j = i  # j is previous vertex to i

    # Return absolute value
    return abs(area / 2.0)



# Just calls polygonArea
def area_of_box(box):
    return polygonArea((box[0][0], box[1][0], box[2][0], box[3][0]), (box[0][1], box[1][1], box[2][1], box[3][1]))



# Like area_of_box but so it can easily be called to check a new line
# Side is the side you're working on, point1 and point2 are the new points you're testing,
# box is the original box that you're replacing one edge of
def area_of_new_box(side, point1, point2, box):
    # IMPORTANT: point2 must follow point1 clockwise
    if side == 'left':
        return polygonArea((point1[0], point2[0], box[2][0], box[3][0]), (point1[1], point2[1], box[2][1], box[3][1]))
    elif side == 'top':
        return polygonArea((box[0][0], point1[0], point2[0], box[3][0]), (box[0][1], point1[1], point2[1], box[3][1]))
    elif side == 'right':
        return polygonArea((box[0][0], box[1][0], point1[0], point2[0]), (box[0][1], box[1][1], point1[1], point2[1]))
    elif side == 'bottom':
        return polygonArea((point2[0], box[1][0], box[2][0], point1[0]), (point2[1], box[1][1], box[2][1], point1[1]))
    else:
        exit('Error: bad side in area_of_new_box: ' + str(side))

In [None]:
# Image passed in must be GRAY, not BGR
# Returns false if no conflicts
def check_box_for_conflicts(box, img):
    left_side = get_line(box[0][0], box[0][1], box[1][0], box[1][1])
    top_side = get_line(box[1][0], box[1][1], box[2][0], box[2][1])
    right_side = get_line(box[2][0], box[2][1], box[3][0], box[3][1])
    bottom_side = get_line(box[3][0], box[3][1], box[0][0], box[0][1])
    all_sides = left_side + top_side + right_side + bottom_side
    for pix in all_sides:
        if img[pix[1]][pix[0]] != 0:
            # print("Problem: " + str(pix[1]) + "," + str(pix[0]]))
            return True
    return False



# Like check_box_for_conflicts, but checks one line (pass in two points)
# Image passed in must be GRAY, not BGR
# Returns false if no conflicts
def check_line_for_conflicts(point1, point2, img):
    line = get_line(point1[0], point1[1], point2[0], point2[1])
    for pix in line:
        if img[pix[1]][pix[0]] != 0:
            # print("Problem: " + str(pix[1]) + "," + str(pix[0]]))
            return True
    return False

In [None]:
# First pushes the edge out until it doesn't conflict, then rotates right and then left to find minimum area
def rotateCaliperLine(side, top_line, bottom_line, starting_top_point, starting_bottom_point, box, img):
    min_area = 999999999  # This will contain the minimum area of the box
    min_points = None  # This will be an array of points that give the min_area

    # Display the box before rotation (optional)
    # img_bgr = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    # cv2.drawContours(img_bgr, [box], 0, (0, 0, 255), 1)
    # displayBGR(img_bgr)

    # Check starting point
    complete = False
    while not complete:
        # If there are conflicts
        try:
            if check_line_for_conflicts(top_line[starting_top_point], bottom_line[starting_bottom_point], img):
                # If so, push back and repeat
                starting_top_point += 1
                starting_bottom_point += 1
                # If can't push back further, you'll error out in the except block
            # Once you're not conflicting get your starting min_area
            else:
                point1 = top_line[starting_top_point]
                point2 = bottom_line[starting_bottom_point]
                min_area = area_of_new_box(side, point1, point2, box)
                min_points = [point1, point2]
                complete = True
        except IndexError:
            exit("ERROR: for side " + str(
                side) + " we couldn't find a starting line that didn't conflict and was in bounds")
    # print('* Starting point: ' + str(top_line[starting_top_point]) + " " + str(bottom_line[starting_bottom_point]) + " *")

    # Display output lines! (Optional)
    # img_bgr = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    # cv2.line(img_bgr, min_points[0], min_points[1], (0, 255, 0), 1)
    # displayBGR(img_bgr)

    # Then rotate right (by decrementing the bottom)
    #    /------/ | ->
    #   /------/  |
    #  /------/   |
    # /______/    | <-
    top_point = starting_top_point
    bottom_point = starting_bottom_point
    complete = False
    while not complete:
        try:
            bottom_point -= 1  # Decrement the bottom point
            # Shift right until you don't conflict
            while check_line_for_conflicts(top_line[top_point], bottom_line[bottom_point], img):
                top_point += 1
                bottom_point += 1
            # Now that you don't conflict, see if it's a new minimum area!
            point1 = top_line[top_point]
            point2 = bottom_line[bottom_point]
            if area_of_new_box(side, point1, point2, box) <= min_area:  # FIXME: <= or < ?
                min_area = area_of_new_box(side, point1, point2, box)
                min_points = [point1, point2]
                # print("New min points: " + str(min_points))
        except IndexError:
            # Go until your top_point goes out of bounds
            complete = True

    # Go back to start and rotate left
    top_point = starting_top_point
    bottom_point = starting_bottom_point
    complete = False
    while not complete:
        try:
            top_point -= 1  # Decrement the *TOP* point
            # Shift right until you don't conflict
            while check_line_for_conflicts(top_line[top_point], bottom_line[bottom_point], img):
                top_point += 1
                bottom_point += 1
            # Now that you don't conflict, see if it's a new minimum area!
            point1 = top_line[top_point]
            point2 = bottom_line[bottom_point]
            if area_of_new_box(side, point1, point2, box) <= min_area:  # FIXME: <= or < ?
                min_area = area_of_new_box(side, point1, point2, box)
                min_points = [point1, point2]
                # print("New min points: " + str(min_points))
        except IndexError:
            # Go until your top_point goes out of bounds
            complete = True

    # Display output lines! (Optional)
    # img_bgr = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    # cv2.line(img_bgr, min_points[0], min_points[1], (0, 255, 0), 1)
    # displayBGR(img_bgr)

    return min_points

In [None]:
# Takes in which side of box to work with.  It rotates that side as far as it can to both degrees,
# optimizing for minimum box area
def rotateCaliper(side, box, img):
    top_line = None  # Will be an array of points
    bottom_line = None  # Will be an array of points
    relative_top_idx = None
    relative_bottom_idx = None
    rel_top_far_idx = None  # The index of far top corner
    rel_bot_far_idx = None  # Index of far bottom corner

    if side == 'right':

        relative_top_idx = 2
        relative_bottom_idx = 3
        rel_top_far_idx = 1
        rel_bot_far_idx = 0
    elif side == 'bottom':
        relative_top_idx = 3
        relative_bottom_idx = 0
        rel_top_far_idx = 2
        rel_bot_far_idx = 1
    elif side == 'left':
        relative_top_idx = 0
        relative_bottom_idx = 1
        rel_top_far_idx = 3
        rel_bot_far_idx = 2
    elif side == 'top':
        relative_top_idx = 1
        relative_bottom_idx = 2
        rel_top_far_idx = 0
        rel_bot_far_idx = 3
    else:
        exit("In function rotateCaliper, the input side is not recognized: " + str(side))

    # Use get_line_long to get top and bottom line:
    top_line = get_line_long(box[rel_top_far_idx][0], box[rel_top_far_idx][1], box[relative_top_idx][0], box[relative_top_idx][1], img.shape[1]-1, img.shape[0]-1)
    bottom_line = get_line_long(box[rel_bot_far_idx][0], box[rel_bot_far_idx][1], box[relative_bottom_idx][0], box[relative_bottom_idx][1], img.shape[1]-1, img.shape[0]-1)

    # FIXME: Remove this try block later once you know it's always true
    try:  # Ensure that the arrays are in the right direction
        top_closePoint = top_line.index((box[relative_top_idx][0], box[relative_top_idx][1]))
        top_farPoint = top_line.index((box[rel_top_far_idx][0], box[rel_top_far_idx][1]))
        bot_closePoint = bottom_line.index((box[relative_bottom_idx][0], box[relative_bottom_idx][1]))
        bot_farPoint = bottom_line.index((box[rel_bot_far_idx][0], box[rel_bot_far_idx][1]))
        if top_closePoint < top_farPoint:
            top_line.reverse()  # Reverse the array so that incrementing the index will go away from our close point
            print("REVERSING TOP ARRAY")
        if bot_closePoint < bot_farPoint:
            bottom_line.reverse()  # Do the same for bottom array as well
            print("REVERSING BOTTOM ARRAY")
    except ValueError:
        # This always fails when one of the points is outside - of the box, FIXME later?
        # exit("ERROR: Point not found on either top or bottom line...")
        print("ERROR: Point not found within bounds")
        return

    starting_top_point = top_line.index((box[relative_top_idx][0], box[relative_top_idx][1]))
    starting_bottom_point = bottom_line.index((box[relative_bottom_idx][0], box[relative_bottom_idx][1]))

    min_points = rotateCaliperLine(side, top_line, bottom_line, starting_top_point, starting_bottom_point, box, img)

    # Update box with new minimum area points
    box[relative_top_idx][0] = min_points[0][0]
    box[relative_top_idx][1] = min_points[0][1]
    box[relative_bottom_idx][0] = min_points[1][0]
    box[relative_bottom_idx][1] = min_points[1][1]

In [None]:
# Will minimize box around a non-black object in img
# Works by attempting to rotate each edge for minimal area
def geometricMinimizeQuad(box, img):
    loops = 3
    min_box = np.copy(box)
    for i in range(loops):
        rotateCaliper('right', min_box, img)
        rotateCaliper('bottom', min_box, img)
        rotateCaliper('left', min_box, img)
        rotateCaliper('top', min_box, img)
    return min_box

In [None]:
# img must be GRAY, not BGR
# This implements an algorithm to shrink the bounding box around a non-black object in img
# Currently calls geometricMinimizeQuad, originally it called recursive_min_box but it wasn't
# very efficient so that method is not recommended.
def minEnclosingQuad(cnt, img, show_before=False, show_after=False):
    # Fill in contour of picture with white
    cv2.fillPoly(img, pts=[cnt], color=(255, 255, 255))

    # Get coordinates
    rect = cv2.minAreaRect(cnt)
    box = cv2.boxPoints(rect)
    box = np.int0(box)

    if show_before:
        # Display the box (optional)
        img_bgr = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        cv2.drawContours(img_bgr, [box], 0, (0, 0, 255), 1)
        displayBGR(img_bgr)

    minBox = geometricMinimizeQuad(box, img)
    # print("done!")

    if show_after:
        # Display the box (optional)
        img_bgr = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        cv2.drawContours(img_bgr, [minBox], 0, (0, 0, 255), 1)
        displayBGR(img_bgr)

    return minBox

In [None]:
def getMinQuad(img, max_quads=10, value=210, show_before=False, show_after=False):
    height, width, _ = img.shape
    border = round(max(height, width)/2)

    hsv_image = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    hsv_lower = np.array([0, 0, value], dtype="uint8")
    hsv_upper = np.array([255, 255, 255], dtype="uint8")
    # find the colors within the specified boundaries and apply the mask
    mask = cv2.inRange(hsv_image, hsv_lower, hsv_upper)
    if show_before:
        plt.imshow(mask)
        plt.show()
    # Make image larger, add in border around each side
    mask = cv2.copyMakeBorder(mask, border, border, border, border, cv2.BORDER_CONSTANT, (0,0,0))

    contours = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    contours = imutils.grab_contours(contours)

    # Adjust num_quads so that it isn't larger than the number of contors
    if max_quads > len(contours):
      num_quads = len(contours)
    else:
      num_quads = max_quads
    
    contour = sorted(contours, key=cv2.contourArea, reverse=True)[:num_quads]  # Only take the biggest contour
    min_quads = np.zeros((num_quads, 4, 2), dtype=int)

    for i, cnt in enumerate(contour):
        min_quad = minEnclosingQuad(cnt, mask, show_before=show_before, show_after=show_after)
        min_quad = min_quad - border
        min_quads[i] = min_quad

    if show_after:
        cv2.drawContours(img, [min_quads[0]], 0, (0, 0, 255), 2)    # Showing more than one throws errors because contours are technically lists of arrays, not a 3d numpy array
        displayBGR(img)

    return min_quads    # Shape [num_quads, 4, 2]


# Example call: 
# filename = "003526.jpe"
# image = cv2.imread(filename)
# mq = getMinQuad(image, 3)
# print(mq)   # Should be shape [3, 4, 2] since we asked for 3 cards in the previous line

# Displaying the results

In [None]:
# Takes in a mask (W, H) or (1, W, H) and returns an image (W, H, 3) 
# If mask is a pytorch tensor, call mask.to(torch.device('cpu')) first
def mask_to_rgb(mask):
  W, H = mask.shape[-2:]
  img = np.zeros((W, H, 3), dtype=np.uint8)
  img[:, :, 0] = mask
  img[img != 0] = 255
  return img

In [None]:
# Run a single image through the net and display the results
# Inputs:
# img         - an ndarray (C, W, H) representing a single image from all_imgs
# label       - the corresponding label (W, H), from all_labels
# model       - the model to run the image on
# showCorrect - If true, also shows the correct segmentation (separately)
# show_img    - If true, layers the segmentation on top of the original img
#               If false, just shows the mask by itself
def display_example_segment(img, label, model, showCorrect=False, show_img=True):
  net_input = numpy_normalize_rgb(img.reshape(1, *img.shape))
  net_output = model(torch.tensor(net_input).to(torch.device('cuda:0'), torch.float32))
  img = np.moveaxis(img, 0, -1)

  if(showCorrect):
    print('Correct segmentation:')
    correct_mask = mask_to_rgb(label)
    if show_img:
      plt.imshow(img)
    plt.imshow(correct_mask, alpha=0.75)
    plt.show()

  print('SegNet segmentation:')
  _, preds = net_output.max(1)
  net_mask = mask_to_rgb(preds.to(torch.device('cpu')))
  if show_img:
    plt.imshow(img)
  plt.imshow(net_mask, alpha=0.75)
  plt.axis('off')
  plt.show()


In [None]:
# Takes in an img from all_imgs, runs it through the net,
# and displays the resulting bounding box
def display_example_bound(img, model, max_boxes=1):
  net_input = numpy_normalize_rgb(img.reshape(1, *img.shape))
  net_output = model(torch.tensor(net_input).to(torch.device('cuda:0'), torch.float32))
  img = np.moveaxis(img, 0, -1)
  _, preds = net_output.max(1)
  quads = getMinQuad(mask_to_rgb(preds.to(torch.device('cpu'))), max_quads=max_boxes) # shape (num_cards, 4, 2)
  print(str(max_boxes), 'Approximate bounding boxes:')
  displayRGB(img, quads)


In [None]:
'''

   Visualize some example segmentation and bounding box output here
  
   Toggle show_img if you want to see the segmentation layered on top
   of the original image


'''


# displays some of the images and their results
# index is the image's index in all_imgs, which is compiled in the compile_data function
total = len(all_imgs)
for i in range(5):
  index = round(random.uniform(0, total))
  # index = total - i - 30
  print('\nindex =',index)
  display_example_segment(all_imgs[index], all_labels[index], net_to_train, show_img=False) # Shows segmentation results
  display_example_bound(all_imgs[index], net_to_train, max_boxes=1)                         # Shows the approximte bounding boxes of the results

In [None]:
def check_single_accuracy(pred, correct, max_dist=2):
  """
  Compute the accuracy one classification.  Pred and correct are numpy arrays
  of shape [1, 4, 2].  Returns 1 if it was correct (worst corner within max_dist)
  and 0 otherwise
  """
  diff = correct - pred
  diff = np.all(np.all(diff<=max_dist, axis=2),axis=1)
  num_correct = np.sum(diff)
  return num_correct

In [None]:
x_full, y_full = read_images(max_samples = 200, cards_per_image=1)

In [None]:
def reorder_coords(coords):
  '''
  Takes in the coordinates for 1 cards and rolls it so that they are in the right
  order (The minimum point, aka top left, should be at index 1)
  '''
  min_matrix = np.sum(coords, axis=2)
  roll_num = -(np.argmin(min_matrix) - 1)    # The minimum should be in position 1 (since order is bottom left, top left, top right, bottom right)
  new_coords = np.roll(coords, roll_num, axis=1)
  return new_coords

In [None]:
def check_box_accuracy(x, y, max_dist=2):
  '''
  Checks the accuracy of the estimated bounding boxes
  '''
  num_correct = 0
  num_samples = x.shape[0]
  for i in range(num_samples):
    if (i%50 == 0) and (i != 0):
      print(i)
    img = x[i]
    coords = y[i]
    net_input = numpy_normalize_rgb(img.reshape(1, *img.shape))
    net_output = net_to_train(torch.tensor(net_input).to(torch.device('cuda:0'), torch.float32))
    _, preds = net_output.max(1)
    quads = getMinQuad(mask_to_rgb(preds.to(torch.device('cpu'))), max_quads=1) # shape (num_cards, 4, 2)
    correct = check_single_accuracy(reorder_coords(quads), coords, max_dist=max_dist)
    num_correct += correct
  acc = num_correct/num_samples
  print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
  return acc

In [None]:
'''

Resulting bounding box accuracy shown here,
for different pixel distances.

'''
acc = check_box_accuracy(x_full, y_full, max_dist=5)
acc = check_box_accuracy(x_full, y_full, max_dist=4)
acc = check_box_accuracy(x_full, y_full, max_dist=3)
acc = check_box_accuracy(x_full, y_full, max_dist=2)
acc = check_box_accuracy(x_full, y_full, max_dist=1)
acc = check_box_accuracy(x_full, y_full, max_dist=0)