In [7]:
import torch
import urllib
from PIL import Image
import torchvision
from torchvision import transforms
import torch
import torch.nn as nn
from torchvision.models import resnet50, squeezenet1_1

In [8]:
import requests
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

In [9]:
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x24425a090b8>

In [10]:
model = squeezenet1_1(pretrained=True)
# model.eval()

## Object Detection with SqueezeNet backbone
Adding object detection layers to squeezenet backbone for quick object detection

#### Multibox:
- Consists of two components:
    - Coordinates of a box that may or may not contain an object
    - scores for various predicted classes ~ classification task
    
#### Single Shot Detector (SSD):
- Base Convolutions: creates lower-level feature maps
- Auxiliary Convolutions: Higher level feature maps
- Prediction Convolutions: predicting the bounding boxes and classes



### Considerations

- Input size is 64x64

All feature maps are going to have priors/anchor boxes that are going to be tuned by the model based on the groundtruth labels.
A prior with a scale s, then the area is going to be of a square with side s. thelarger featur maps have smaller priors so they are great for detecting smaller objects.


Priors/Anchor boxes have two aspects:
- Scale: which is dependent on the image size. For example, for a prior with a scale of 0.1 means that it is 10% of the image dimensions. The largest feature maps would have anchors/priors that are 10% of the map size. After that, the anchors would have sales ranging from 20% to 90% of the image dimensions. So, after the scale is defined, we have the aspect ratio which is if the ratio is 1:1, we have a square anchor. If the ratio is 2:1, the width would be 2x the scale and the height would be the same. 

So, calculating the width and height from scale and aspect ratios is:
<!-- Code Block -->
```
w * h = s^2 
w/h = a
```
So, we get width by solving for w using the two equations and we get:
<!-- Code Block -->
```
w = s * sqrt(a)
h = s / sqrt(a)
```

Since the anchors are going to be predicted at each point on each feature map, we are going to have 5 x 5 points for a feature map so 25points, and on each point we are going to have 5 anchor boxes predicted so we would have 75 anchor boxes and for each anchor box we are going to predict two things:
1 - coordinates i.e. .[c_x, c_y, w, h]
2. Class scores: n_class scores where n_class is the total number of object types including the background class.
tidbit: I believe we can lower the number of classes i.e. if 10 classes exist but we only want the classes with highest probabilities of maybe 3, we can lower the number of boxes as well. 


THEN!
We are going to have 2 convolution layers i.e.:
1. To assess and calculate the 4 numbers for the bounding box that was predicted by the prior. It would have 3x3 kernel, stirde and padding of 1 and 4 filters where each filter takes care of the 1 bounding box.
2. The second convolution layer would have 3x3 kernel with a number of filters equal to n_classes where each filter has weight for each class and assesses whether the object belonging to that class exists in the box.  So each point in the 5x5 output map would have 24 channels where the first 4 channels have 1 value each that represents [c_x, c_y, w, h] of the first prior box and so on for the 6 total prior boxes. Hence, the number of channels being 4x6 = 24 channels.


# Loss Function
The priors/anchor boxes are compared with the ground truth boxes. Based on the number of feature maps that we are going to generate the anchor boxes for, we could have thousands of priors. We compare those with the ground truth labels by using intersection over union. If the number is higher than 0.5 i.e. the intersection of the boxes are is larger than the union area by a ratio of 0.5, we have a positive prior so we keep those,all the other we reject. 
Now start the regression task i.e. the positive priors now have a ground truth box that they can compare and we are trying to regress towward that box so that the loss is the lowest and the positive prior is exactly as the ground truth box. 

So, we compare the coordinates of the positive box with the coordinates of the ground truth box and calculate the loss that way. 
We calculate a localization loss which is the average smooth l1 loss between the encoded offsets of the positively matched localization boxes and their ground truth labels.


![Localizatiob Loss](localization_loss.PNG)

# Classification
The second step obviously is to give the correct label to the predicted prior box:
As indicated already, we would get a feature map that would have 5x 5xn_classes number of channels where each channel corresponds to score for each class. 

Each predictions, whether +ve or -ve is going to have a label associated with it. Considering there are only a few objects in the image we are going to have a lot more box with ground truth of background. But if we only think about the positive labels and reject all the rest, the model is going to try to predict labels in every single prior. So we take care of that by keeping some of the negative labels such that negative labels are 3 times the number of positive labels. And we only choose the ones that the model was most wrong about i.e. they did not overlap the ground truth object boxes at alll!
We find the hard negatives by cross entropy loss for each negatively matched prediction and choose te top hard negative losses.

Then, the confidence score is just the sum of Cross Entropy loss among the positive and hard negative matches:


![Classification Loss](classification_loss.png)

# Total Loss:

loss = Confidence_loss + alpha * localization loss

* We could just set alpha to 1 and so, loss would be the confidence loss added to localization loss

# Processing Predictions:

We have two tenors, one contains the bounding boxes and one contains the classes:

So, we decode the offsets from the first tensor for bounding boxes
Then, we extract the scores for the non-background class from each of the priors ( 8732 boxes)
Eliminate boxes that do not meet the criteria for the threshold of the score
The remaining boxes are candidates for the particular class of object.

Then, we perform non-maximum suppression. We line up the candidates i.e. boxes for each class in terms of how likely they are i.e. the probability. We only take the boxes with the maximum score.




In [None]:
for params in model.parameters():
    print(params)

In [3]:
url, filename = ("https://github.com/pytorch/hub/raw/master/dog.jpg", "dog.jpg")

try: urllib.URLopener().retrieve(url, filename)
except: urllib.request.urlretrieve(url, filename)

In [4]:
input_image = Image.open(filename)


In [6]:
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.483, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)

if torch.cuda.is_available():
    input_batch = input_batch.to('cuda')
    model.to('cuda')
    
with torch.no_grad():
    output = model(input_batch)

# print(output[0])

output_tensor = (torch.nn.functional.softmax(output[0], dim=0))

In [10]:
len(output_tensor)

1000

In [11]:
output_tensor.min()

tensor(5.0700e-16)

In [20]:
(output_tensor.max().data)

tensor(0.9039)

In [15]:
type(output_tensor)

torch.Tensor

In [9]:
output_tensor[500:560]

tensor([2.0729e-14, 1.9949e-10, 7.2010e-12, 7.1363e-14, 1.7973e-13, 7.0274e-14,
        1.5274e-14, 2.5089e-15, 1.0006e-09, 1.0136e-14, 8.9804e-16, 1.6068e-13,
        1.2220e-15, 3.0954e-13, 8.5692e-14, 6.1191e-11, 1.8913e-11, 3.7359e-15,
        7.6298e-14, 3.6036e-11, 1.2279e-11, 1.7311e-13, 6.2765e-12, 8.4972e-12,
        4.2945e-13, 1.2898e-12, 1.0878e-13, 2.8083e-13, 1.1896e-14, 2.9645e-11,
        4.7876e-14, 5.7840e-14, 1.4019e-13, 2.3822e-14, 4.5971e-09, 8.9212e-15,
        1.0162e-15, 1.3309e-08, 2.6088e-14, 1.8716e-07, 6.9155e-16, 7.7608e-14,
        3.5605e-14, 1.0392e-12, 1.0367e-13, 9.4117e-15, 1.2659e-13, 2.1207e-15,
        2.5874e-14, 1.0264e-13, 2.3916e-15, 1.5477e-12, 7.4955e-09, 7.7580e-13,
        1.6265e-12, 5.0764e-12, 3.0446e-15, 2.3276e-13, 9.0040e-13, 6.7769e-14])

In [26]:
import numpy as np

In [27]:
n_array = output_tensor.numpy()

In [32]:
output_tensor[65]

tensor(7.3286e-15)

In [30]:
n_array[70]

1.2789478e-15

# SqueezeNet + DETR

In [108]:
from torchvision.models import squeezenet1_1

In [109]:
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f908cda4e80>

In [117]:
class DETR(nn.Module):
    
    def __init__(self, num_classes, hidden_dim=256, nheads=8, num_encoder_layers=6,
                num_decoder_layers=6):
        
        super().__init__()
        # get the list of squeezenet conv layers minus the last two i.e. avgpool & FC w/ classes
        # self.backbone = nn.Sequential(*list(squeezenet1_1(pretrained=True).children())[:-2])
        self.backbone = squeezenet1_1()
        del self.backbone.classifier
        # in channels, out channels, kernel size, 2048 is from the last layer of SqueezeNett
        self.conv = nn.Conv2d(2048, hidden_dim, 1)
        # no. of heads in multiheadattention model
        # sub-encoder-layers in the encoder
        # sub-decoder-layers in the decoder
        self.transformer = nn.Transformer(hidden_dim, nheads,
                                          num_encoder_layers, num_decoder_layers) 
        # two linear layers 1 for class and 1 for the box
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)   # num_classes + no class so 1 for that
        self.linear_bbox = nn.Linear(hidden_dim,4)
        
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        
    def forward(self, inputs):
        x = self.backbone.conv1(inputs)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)
        
        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)
        
        # convert from 2048 to 256 feature planes for the transformer
        h = self.conv(x)
        
        # Construct Positional Encodings
        # H is the number of rows
        # W is the number of channels of the output tensor
        H, W = h.shape[-2:]
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)
        # propagate through the transformer
        h = self.transformer(pos + 0.1 * h.flatten(2).permute(2, 0, 1),
                             self.query_pos.unsqueeze(1)).transpose(0, 1)
        # finally project transformer outputs to class labels and bounding boxes
        return {'pred_logits': self.linear_class(h),
                'pred_boxes': self.linear_bbox(h).sigmoid()}
    

In [118]:
detr = DETR(num_classes=91)

In [107]:
detr = DETR(num_classes=91)
state_dict = torch.hub.load_state_dict_from_url(
            url='https://dl.fbaipublicfiles.com/detr/detr_demo-da2a99e9.pth',
            map_location='cpu',check_hash = True)

In [77]:
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f908ce20e50>

In [93]:
detr.load_state_dict(state_dict)
detr.eval();

In [101]:
# COCO classes
classes = [
    'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
    'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
    'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush'
]

# colors for visualization
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]

In [95]:
# standard PyTorch mean-std input image normalization

transform = transforms.Compose([
    transforms.Resize(800),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=1)

def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return b

In [96]:
def detect(im, model, transform):
    # normalize the image
    img = transform(im).unsqueeze(0)
    # propagate through the model
    outputs = model(img)
    
    # keep only the predictions with confidence > 0.7 
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.7
    
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
    return probas[keep], bboxes_scaled


In [97]:
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
im = Image.open(requests.get(url, stream=True).raw)
scores, boxes = detect(im, detr, transform)

In [147]:
# scores

In [99]:
boxes

tensor([[1.4237e+01, 5.2648e+01, 3.0968e+02, 4.7448e+02],
        [4.0579e+01, 7.2045e+01, 1.7505e+02, 1.2057e+02],
        [3.4342e+02, 2.1900e+01, 6.3954e+02, 3.7115e+02],
        [4.2231e-01, 1.1729e+00, 6.4028e+02, 4.7372e+02],
        [3.3136e+02, 7.4511e+01, 3.6977e+02, 1.9231e+02]])

In [16]:
transformer_model.conv

Conv2d(2048, 3, kernel_size=(1, 1), stride=(1, 1))

In [17]:
nn.Sequential(*list(squeezenet1_1(pretrained=True).children())[:-2])

Sequential()

In [18]:
model = torch.hub.load('facebookresearch/detr', 'detr_resnet50', pretrained=True)

Downloading: "https://github.com/facebookresearch/detr/archive/master.zip" to /home/sunshine/.cache/torch/hub/master.zip
Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /home/sunshine/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth
100.0%
Downloading: "https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth" to /home/sunshine/.cache/torch/hub/checkpoints/detr-r50-e632da11.pth
100.0%


In [23]:
model = torchvision.models.squeezenet1_1(pretrained=True)

Downloading: "https://download.pytorch.org/models/squeezenet1_1-f364aa15.pth" to /home/sunshine/.cache/torch/hub/checkpoints/squeezenet1_1-f364aa15.pth
100.0%


In [28]:
model=DETR(num_classes=91, hidden_dim=256, nheads=8, num_encoder_layers=6, num_decoder_layers=6)


# Squeezenet Weights


In [104]:
model = torch.hub.load('pytorch/vision', 'squeezenet1_0', pretrained=True)

Downloading: "https://github.com/pytorch/vision/archive/master.zip" to /home/sunshine/.cache/torch/hub/master.zip


model.eval()

# SqueezeNet backbone + DETR


In [153]:
class DETR(nn.Module):
    
    def __init__(self, num_classes, hidden_dim=256, nheads=8, num_encoder_layers=6,
                num_decoder_layers=6):
        
        super().__init__()
        # get the list of squeezenet conv layers minus the last two i.e. avgpool & FC w/ classes
        # self.backbone = nn.Sequential(*list(squeezenet1_1(pretrained=True).children())[:-2])
#        self.backbone = nn.Sequential(*list(squeezenet1_1(pretrained=True).children())[:-4])
#        self.backbone = torchvision.models.squeezenet1_1(pretrained=True)
#       del self.backbone.classifier
        # in channels, out channels, kernel size, 2048 is from the last layer of SqueezeNett
        self.conv = nn.Conv2d(512, hidden_dim, 1)
        # no. of heads in multiheadattention model
        # sub-encoder-layers in the encoder
        # sub-decoder-layers in the decoder
        self.transformer = nn.Transformer(hidden_dim, nheads,
                                          num_encoder_layers, num_decoder_layers) 
        # two linear layers 1 for class and 1 for the box
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)   # num_classes + no class so 1 for that
        self.linear_bbox = nn.Linear(hidden_dim,4)
        
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        
    def forward(self, inputs):
        x = self.backbone.conv1(inputs)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)
        
        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)
        
        # convert from 2048 to 256 feature planes for the transformer
        h = self.conv(x)
        
        # Construct Positional Encodings
        # H is the number of rows
        # W is the number of channels of the output tensor
        H, W = h.shape[-2:]
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)
        # propagate through the transformer
        h = self.transformer(pos + 0.1 * h.flatten(2).permute(2, 0, 1),
                             self.query_pos.unsqueeze(1)).transpose(0, 1)
        # finally project transformer outputs to class labels and bounding boxes
        return {'pred_logits': self.linear_class(h),
                'pred_boxes': self.linear_bbox(h).sigmoid()}

In [154]:
detr = DETR(num_classes=91)

# detr.eval()

# NOTE
NOTE:   DETR class does not have Squeezenet1_1 as the model
instead, we define the DETR class like normal layers and forward method
AFTER THAT, we put the model in the model_train function and append the DETR class so that we only train THE NEW layers on imagenet or coco for object detection

# Update

Get the loss function for the DETR model
Define it if need be
The probs and boxes are defined for the output but need to do loss function. 


In [None]:
def train_model(model, loss_function, optimizer, data_loader):
    """ Define the training function for the model"""
    model.train()
    
    # set the loss and accuracy:
    current_acc = 0
    current_loss = 0
    
    # iterate over the dataset
    for i, (inputs, labels) in enumerate(data_loader):
        # send them to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        # zero the paraeter gradients
        optimizer.zero_grad()
        
        with torch.set_grad_enabled(True):
            # forward pass
            ouputs = model(inputs)
            probs = outputs['pred_logits'].softmax(-1)[0, :, :-1]
            keep = probs.max(-1).values > 0.7                      # only keep probs > 0.7
            
            

In [None]:
def model_train(epochs=5):
    """ Model training """
    
    model = torchvision.models.squeezenet1_1(pretrained=True)
    
    for param in model.parameters():
        param.requires_grad = False 
        
    detr = DETR(num_classes=91)
    model.classifier = detr
    
    # transfer model to GPU or CPU
    device = torch.device("cuda:0" if torch.cuda.is_avilable() else "cpu")
    model = model.to(device)
    
    loss_function = nn.CrossEntropyLoss()
    
    optimizer = optim.Adam(model.fc.parameters())
    
    test_accc = []
    
    for epoch in range(epochs):
        
    return model.eval()
    

In [155]:
def model_train(epochs=5):
    """ Model training """
    
    model = torchvision.models.squeezenet1_1(pretrained=True)
    
    for param in model.parameters():
        param.requires_grad = False 
        
    detr = DETR(num_classes=91)
    model.classifier = detr
    
    # transfer model to GPU or CPU
    device = torch.device("cuda:0" if torch.cuda.is_avilable() else "cpu")
    model = model.to(device)
    
    loss_function = nn.CrossEntropyLoss()
    
    optimizer = optim.Adam(model.fc.parameters())
    
    test_accc = []
    
    for epoch in range(epochs):
        
    return model.eval()
    

In [156]:
model_train(5)

SqueezeNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (3): Fire(
      (squeeze): Conv2d(64, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace=True)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace=True)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace=True)
    )
    (4): Fire(
      (squeeze): Conv2d(128, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace=True)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace=True)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace=True)
    )
    (5): MaxPool2d

In [126]:
torch.set_grad_enabled(False);

In [137]:
model = torch.hub.load('pytorch/vision:v0.6.0', 'squeezenet1_1', pretrained=True)



Using cache found in /home/sunshine/.cache/torch/hub/pytorch_vision_v0.6.0


In [None]:
def train_model(epochs=10):
    model = torchvision.models.squeezenet1_1(pretrained=True)
    
    for param in model.parameters():
        param.requires_grad = False
        
    # plus in the DETR model layers
    
    
    

In [157]:
import cv2