# ResNet w/ RPN Heads for multi-tasking

In [1]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import transforms

In [2]:
# Load torchvision's SqueezeNet 1.1 with pretrained weights; used below for inspecting layers.
model_test = torchvision.models.squeezenet1_1(pretrained=True)

In [4]:
# model_test.eval()

The Fire module already concatenates the outputs of its 1x1 and 3x3 expand layers in its forward function, so if we call the Fire module (or SqueezeNet) directly we should be able to reuse that output.
In the GitHub repo, they note that the final conv layer is initialized differently from the rest, so I still have to figure out how they do it.

In [None]:
# Experiment: concatenate the first output filter of the last Fire module's two expand convs.
# NOTE(review): the 1x1 and 3x3 kernels differ in their spatial dimensions, so a cat along
# dim 1 presumably raises a size-mismatch error -- confirm before relying on this cell.
torch.cat([model_test.features[12].expand1x1.weight[0],
model_test.features[12].expand3x3.weight[0]], 1)

In [15]:
# Keep a handle on the weight tensor of the 1x1 expand conv in features[12].
final_layer = model_test.features[12].expand1x1.weight

In [45]:
import torch
import torch.nn as nn
import torch.nn.init as init
from torch.utils.model_zoo import load_url as load_state_dict_from_url

# Public names exported by this module-style cell.
__all__ = ['SqueezeNet', 'squeezenet1_0', 'squeezenet1_1']

# Download locations of the pretrained checkpoints, keyed by 'squeezenet' + version
# (the key format `_squeezenet` builds when looking up weights).
model_urls = {
    'squeezenet1_0': 'https://download.pytorch.org/models/squeezenet1_0-a815701f.pth',
    'squeezenet1_1': 'https://download.pytorch.org/models/squeezenet1_1-f364aa15.pth',
}


class Fire(nn.Module):
    """SqueezeNet "Fire" block: one squeeze conv feeding two parallel expand convs.

    A 1x1 "squeeze" convolution maps ``inplanes`` channels to
    ``squeeze_planes``. Its ReLU output is fed to a 1x1 expand convolution and
    a 3x3 (padding 1) expand convolution; their ReLU outputs are concatenated
    along the channel dimension, giving
    ``expand1x1_planes + expand3x3_planes`` output channels.

    Args:
        inplanes: Number of channels of the input feature map.
        squeeze_planes: Number of channels produced by the squeeze conv.
        expand1x1_planes: Number of channels produced by the 1x1 expand conv.
        expand3x3_planes: Number of channels produced by the 3x3 expand conv.
    """

    def __init__(self, inplanes, squeeze_planes,
                 expand1x1_planes, expand3x3_planes):
        super(Fire, self).__init__()
        self.inplanes = inplanes

        # Squeeze stage (channel bottleneck).
        self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1)
        self.squeeze_activation = nn.ReLU(inplace=True)

        # Expand stage, 1x1 branch.
        self.expand1x1 = nn.Conv2d(
            squeeze_planes, expand1x1_planes, kernel_size=1)
        self.expand1x1_activation = nn.ReLU(inplace=True)

        # Expand stage, 3x3 branch; padding=1 keeps the spatial size equal to
        # the 1x1 branch so the two outputs can be concatenated.
        self.expand3x3 = nn.Conv2d(
            squeeze_planes, expand3x3_planes, kernel_size=3, padding=1)
        self.expand3x3_activation = nn.ReLU(inplace=True)

    def forward(self, x):
        """Run the block and return both expand branches stacked on dim 1."""
        squeezed = self.squeeze_activation(self.squeeze(x))
        branch_1x1 = self.expand1x1_activation(self.expand1x1(squeezed))
        branch_3x3 = self.expand3x3_activation(self.expand3x3(squeezed))
        return torch.cat([branch_1x1, branch_3x3], 1)


class SqueezeNet(nn.Module):
    """SqueezeNet classifier built from ``Fire`` blocks (versions 1.0 and 1.1).

    ``features`` is the convolutional backbone; both versions end in a
    ``Fire`` block with 256 + 256 = 512 output channels. ``classifier`` applies
    dropout, a 1x1 convolution to ``num_classes`` channels, ReLU and global
    average pooling.

    Args:
        version: ``'1_0'`` or ``'1_1'``; selects the backbone layout.
        num_classes: Number of output channels of the final 1x1 convolution.

    Raises:
        ValueError: If ``version`` is neither ``'1_0'`` nor ``'1_1'``.
    """

    def __init__(self, version='1_0', num_classes=1000):
        super(SqueezeNet, self).__init__()
        self.num_classes = num_classes
        if version == '1_0':
            self.features = nn.Sequential(
                nn.Conv2d(3, 96, kernel_size=7, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(96, 16, 64, 64),
                Fire(128, 16, 64, 64),
                Fire(128, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 32, 128, 128),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(512, 64, 256, 256),
            )
        elif version == '1_1':
            self.features = nn.Sequential(
                nn.Conv2d(3, 64, kernel_size=3, stride=2),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(64, 16, 64, 64),
                Fire(128, 16, 64, 64),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(128, 32, 128, 128),
                Fire(256, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                Fire(512, 64, 256, 256),
            )
        else:
            # FIXME: Is this needed? SqueezeNet should only be called from the
            # FIXME: squeezenet1_x() functions
            # FIXME: This checking is not done for the other models
            # The trailing space after the colon matters: the two literals are
            # joined by implicit concatenation.
            raise ValueError("Unsupported SqueezeNet version {version}: "
                             "1_0 or 1_1 expected".format(version=version))

        # Final convolution is initialized differently from the rest
        final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1)
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            final_conv,
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((1, 1))
        )

        # Weight init: small normal for the final conv, Kaiming-uniform for
        # every other conv; all conv biases start at zero.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                if m is final_conv:
                    init.normal_(m.weight, mean=0.0, std=0.01)
                else:
                    init.kaiming_uniform_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)
        # TODO: would probably have to do something here (author's note on
        # where extra heads might be hooked in).

    def forward(self, x):
        """Return the classifier output flattened from dim 1 onwards."""
        x = self.features(x)
        x = self.classifier(x)
        return torch.flatten(x, 1)


def _squeezenet(version, pretrained, progress, **kwargs):
    """Construct a ``SqueezeNet`` and optionally load pretrained weights.

    Args:
        version: Version string passed to ``SqueezeNet``; also appended to
            ``'squeezenet'`` to form the ``model_urls`` lookup key.
        pretrained: When truthy, download the matching state dict and load it.
        progress: Forwarded to the state-dict download helper.
        **kwargs: Extra keyword arguments forwarded to ``SqueezeNet``.

    Returns:
        The constructed (and possibly weight-loaded) model.
    """
    net = SqueezeNet(version, **kwargs)
    if not pretrained:
        return net

    weights_url = model_urls['squeezenet' + version]
    weights = load_state_dict_from_url(weights_url, progress=progress)
    net.load_state_dict(weights)
    return net


def squeezenet1_0(pretrained=False, progress=True, **kwargs):
    r"""Build the SqueezeNet 1.0 architecture described in the
    `"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB
    model size" <https://arxiv.org/abs/1602.07360>`_ paper.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: Forwarded unchanged to the model constructor.
    """
    model = _squeezenet('1_0', pretrained, progress, **kwargs)
    return model


def squeezenet1_1(pretrained=False, progress=True, **kwargs):
    r"""Build the SqueezeNet 1.1 model from the `official SqueezeNet repo
    <https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1>`_.

    SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters
    than SqueezeNet 1.0, without sacrificing accuracy.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: Forwarded unchanged to the model constructor.
    """
    model = _squeezenet('1_1', pretrained, progress, **kwargs)
    return model


In [46]:
# Build SqueezeNet 1.1 through the local factory function and load its pretrained weights.
model = squeezenet1_1(pretrained=True)

In [47]:
# Switch to evaluation mode; as the cell's last expression this also echoes the module layout.
model.eval()

SqueezeNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2))
    (1): ReLU(inplace)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (3): Fire(
      (squeeze): Conv2d(64, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace)
    )
    (4): Fire(
      (squeeze): Conv2d(128, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace)
    )
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0

# Print len() of every parameter tensor, i.e. the size of its first dimension.
for param in model.parameters():
    print(len(param))

# Note
Work out which SqueezeNet feature maps should be fed into the conv layers below.

In [77]:
# Stand-alone 3x3, stride-2 conv (256 -> 512 channels) with the same settings as
# AuxiliaryConvolutions.conv8_2, created here for experimentation.
conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

In [69]:
# Don't necessarily need them 
class AuxiliaryConvolutions(nn.Module):
    """
    Additional convolutions to produce higher-level feature maps.

    Four stages, each a 1x1 conv followed by a 3x3 conv with ReLU after every
    convolution. The output of each stage's 3x3 conv is returned so that
    prediction heads can be attached at several scales.

    Args:
        in_channels: Number of channels of the feature map fed to ``forward``.
            Defaults to 1024 (the value originally hard-coded in ``conv8_1``).
    """
    def __init__(self, in_channels=1024):
        super(AuxiliaryConvolutions, self).__init__()
        self.conv8_1 = nn.Conv2d(in_channels, 256, kernel_size=1, padding=0)
        self.conv8_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)

        self.conv9_1 = nn.Conv2d(512, 128, kernel_size=1, padding=0)
        self.conv9_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)

        self.conv10_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv10_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)

        self.conv11_1 = nn.Conv2d(256, 128, kernel_size=1, padding=0)
        self.conv11_2 = nn.Conv2d(128, 256, kernel_size=3, padding=0)

        # initialize the new parameters
        self.init_conv2d()

    def init_conv2d(self):
        """
        Initialize convolution parameters: Xavier-uniform weights, zero biases.
        """
        for c in self.children():
            if isinstance(c, nn.Conv2d):
                nn.init.xavier_uniform_(c.weight)
                nn.init.constant_(c.bias, 0.)

    def forward(self, conv7_feats):
        """
        Forward the incoming feature map through the auxiliary conv layers.

        Args:
            conv7_feats: Feature map with ``in_channels`` channels, shape
                (N, in_channels, H, W).

        Returns:
            Tuple ``(conv8_2_feats, conv9_2_feats, conv10_2_feats,
            conv11_2_feats)`` with 512, 256, 256 and 256 channels respectively.
        """
        # Use the activations passed in by the caller (previously a global
        # weight tensor was convolved here by mistake).
        x = F.relu(self.conv8_1(conv7_feats))
        x = F.relu(self.conv8_2(x))
        conv8_2_feats = x

        x = F.relu(self.conv9_1(x))
        x = F.relu(self.conv9_2(x))
        conv9_2_feats = x

        x = F.relu(self.conv10_1(x))
        x = F.relu(self.conv10_2(x))
        conv10_2_feats = x

        x = F.relu(self.conv11_1(x))
        conv11_2_feats = F.relu(self.conv11_2(x))

        return conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats
    
    

In [70]:
# Instantiate the auxiliary convolutions to inspect their layers.
model_aux = AuxiliaryConvolutions()

In [76]:
# Echo the conv8_2 layer's configuration.
model_aux.conv8_2

Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))

# Important Part: Prediction Convolutions

Convolutions for predicting class scores and bounding boxes

In [79]:
class PredictionConvolutions(nn.Module):
    """
    Convolutions to predict the class scores and bounding boxes using lower and higher-level feature maps.

    Every feature map gets two 3x3 heads: a localization head emitting 4 values
    per prior box and a classification head emitting ``n_classes`` scores per
    prior box. The per-map predictions are flattened to one row per prior and
    concatenated in the order conv8_2, conv9_2, conv10_2, conv11_2.
    """
    def __init__(self, n_classes):
        """
        n_classes: Number of different types of objects
        """
        super(PredictionConvolutions, self).__init__()
        self.n_classes = n_classes

        # Number of prior boxes per feature-map location, per layer.
        # conv8_2 and conv9_2 get 2 extra boxes (3:1 and 1:3 aspect ratios).
        n_boxes = {'conv8_2': 6,
                   'conv9_2': 6,
                   'conv10_2': 4,
                   'conv11_2': 4}

        # Localization heads: 4 offsets for each prior box at each location.
        # Input channels follow the conv layer definitions (512 for conv8_2,
        # 256 for the rest).
        self.loc_conv8_2 = nn.Conv2d(512, n_boxes['conv8_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv9_2 = nn.Conv2d(256, n_boxes['conv9_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv10_2 = nn.Conv2d(256, n_boxes['conv10_2'] * 4, kernel_size=3, padding=1)
        self.loc_conv11_2 = nn.Conv2d(256, n_boxes['conv11_2'] * 4, kernel_size=3, padding=1)

        # Classification heads: n_classes scores for each prior box at each location.
        self.cl_conv8_2 = nn.Conv2d(512, n_boxes['conv8_2'] * n_classes, kernel_size=3, padding=1)
        self.cl_conv9_2 = nn.Conv2d(256, n_boxes['conv9_2'] * n_classes, kernel_size=3, padding=1)
        self.cl_conv10_2 = nn.Conv2d(256, n_boxes['conv10_2'] * n_classes, kernel_size=3, padding=1)
        self.cl_conv11_2 = nn.Conv2d(256, n_boxes['conv11_2'] * n_classes, kernel_size=3, padding=1)

        # initialize the weights
        self.init_conv2d()

    def init_conv2d(self):
        """
        Initialize every conv head: Xavier-uniform weights, zero biases.
        """
        for c in self.children():
            if isinstance(c, nn.Conv2d):
                nn.init.xavier_uniform_(c.weight)
                nn.init.constant_(c.bias, 0.)

    @staticmethod
    def _flatten_head_output(head_out, last_dim):
        """Reshape a head output (N, C, H, W) to (N, H * W * C / last_dim, last_dim)."""
        batch_size = head_out.size(0)
        # Move channels last so each location's boxes are adjacent, then make
        # the tensor contiguous because .view() requires contiguous memory.
        head_out = head_out.permute(0, 2, 3, 1).contiguous()
        return head_out.view(batch_size, -1, last_dim)

    def forward(self, conv8_2, conv9_2, conv10_2, conv11_2):
        """
        Run the localization and classification heads on the four feature maps
        produced by the auxiliary convolutions.

        Args:
            conv8_2: Feature map with 512 channels, shape (N, 512, H, W).
            conv9_2, conv10_2, conv11_2: Feature maps with 256 channels each.

        Returns:
            ``(locs, class_scores)`` of shape (N, n_priors, 4) and
            (N, n_priors, n_classes), where n_priors is the total number of
            prior boxes over all four maps.
        """
        feature_maps = (conv8_2, conv9_2, conv10_2, conv11_2)
        # Each feature map must go through its OWN head.
        loc_heads = (self.loc_conv8_2, self.loc_conv9_2,
                     self.loc_conv10_2, self.loc_conv11_2)
        cl_heads = (self.cl_conv8_2, self.cl_conv9_2,
                    self.cl_conv10_2, self.cl_conv11_2)

        loc_parts = [self._flatten_head_output(head(feats), 4)
                     for head, feats in zip(loc_heads, feature_maps)]
        cl_parts = [self._flatten_head_output(head(feats), self.n_classes)
                    for head, feats in zip(cl_heads, feature_maps)]

        # Concatenate along the prior-box dimension (dim=1), not the batch dimension.
        locs = torch.cat(loc_parts, dim=1)
        class_scores = torch.cat(cl_parts, dim=1)

        return locs, class_scores
        

### Now comes the fun part where we try to get the network put together

In [85]:
class Squeeze_detect(nn.Module):
    """
    squeeze_detect network that encapsulates the base Squeezenet, auxiliary and prediction convs
    """
    def __init__(self, n_classes):
        """
        n_classes: Number of different types of objects
        """
        super(Squeeze_detect, self).__init__()
        self.n_classes = n_classes

        self.base = SqueezeNet()
        self.aux_convs = AuxiliaryConvolutions()
        self.pred_convs = PredictionConvolutions(n_classes)

        # Learnable per-channel rescale factors (512 channels), initialised to 20.
        # Currently not applied in forward().
        self.rescale_factors = nn.Parameter(torch.FloatTensor(1, 512, 1, 1))
        nn.init.constant_(self.rescale_factors, 20)

        # prior boxes
        self.prior_cxcy = self.create_prior_boxes()

    def forward(self, image):
        """
        Forward propagation.

        Args:
            image: Batch of input images, passed to the SqueezeNet feature extractor.

        Returns:
            ``(locs, class_scores)`` as produced by the prediction convolutions.
        """
        # Use the backbone's convolutional feature map, not its classifier output.
        # NOTE(review): SqueezeNet.features ends in a Fire block with 512 output
        # channels, while AuxiliaryConvolutions.conv8_1 is declared with 1024
        # input channels -- reconcile the two before running end to end.
        conv7_feats = self.base.features(image)

        # higher level feature map generators
        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats = self.aux_convs(conv7_feats)

        # run prediction convolutions
        locs, class_scores = self.pred_convs(conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats)

        return locs, class_scores

    def create_prior_boxes(self):
        """
        Create the prior (default) boxes for the four prediction feature maps.

        For every location of every feature map, one box is generated per
        aspect ratio, plus one extra square box for ratio 1 whose scale is the
        geometric mean of this map's scale and the next map's scale (1.0 for
        the last map). With the dimensions below this yields
        10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 790 priors.

        Returns:
            Float tensor of shape (790, 4) holding (cx, cy, w, h) clamped to
            [0, 1]. It is created on the CPU; move it with ``.to(...)`` if the
            model lives on another device.
        """
        from math import sqrt

        # Spatial size assumed for each feature map.
        fmap_dims = {'conv8_2': 10,
                     'conv9_2': 5,
                     'conv10_2': 3,
                     'conv11_2': 1}

        obj_scales = {'conv8_2': 0.375,
                      'conv9_2': 0.55,
                      'conv10_2': 0.725,
                      'conv11_2': 0.9}

        aspect_ratios = {'conv8_2': [1., 2., 3., 0.5, .333],
                         'conv9_2': [1., 2., 3., 0.5, .333],
                         'conv10_2': [1., 2., 0.5],
                         'conv11_2': [1., 2., 0.5]}

        fmaps = list(fmap_dims.keys())

        prior_boxes = []

        for k, fmap in enumerate(fmaps):
            dim = fmap_dims[fmap]
            scale = obj_scales[fmap]

            # Scale of the extra ratio-1 box: geometric mean with the next
            # map's scale; the last map has no successor, so use 1.
            if k + 1 < len(fmaps):
                additional_scale = sqrt(scale * obj_scales[fmaps[k + 1]])
            else:
                additional_scale = 1.

            for i in range(dim):
                for j in range(dim):
                    cx = (j + 0.5) / dim
                    cy = (i + 0.5) / dim

                    for ratio in aspect_ratios[fmap]:
                        prior_boxes.append([cx, cy, scale * sqrt(ratio), scale / sqrt(ratio)])

                        # The extra box is added for EVERY feature map, so the
                        # per-location box counts match the prediction heads
                        # (6, 6, 4, 4).
                        if ratio == 1.:
                            prior_boxes.append([cx, cy, additional_scale, additional_scale])

        prior_boxes = torch.FloatTensor(prior_boxes)
        prior_boxes.clamp_(0, 1)

        return prior_boxes
    
    
        
        
        
        
        
        
        
        
        
        

In [80]:
        # Scratch copy of the feature-map sizes used in create_prior_boxes (layer name -> map side length).
        fmap_dims = {'conv8_2': 10,
                     'conv9_2':5,
                     'conv10_2':3,
                     'conv11_2':1
                    }

In [81]:
    # Layer names in dict insertion order.
    fmaps = list(fmap_dims.keys())

In [83]:
# Check which index enumerate() pairs with each feature-map name.
for k, fmap in enumerate(fmaps):
    print(k)
    print(fmap)

0
conv8_2
1
conv9_2
2
conv10_2
3
conv11_2


In [62]:
# Instantiate the prediction heads with n_classes=10.
cl = PredictionConvolutions(10)

If we put a ReLU between the conv layers as a layer of its own, we use the nn.Module version and write:
nn.ReLU(inplace=True)

But if we apply it inside the forward function, we call the functional form directly, e.g. F.relu(x).

In [13]:
# Load the SqueezeNet 1.0 checkpoint from a local file into `model`.
model.load_state_dict(torch.load('squeezenet1_0-a815701f.pth'))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [15]:
# Switch to evaluation mode; as the cell's last expression this also echoes the module layout.
model.eval()

SqueezeNet(
  (features): Sequential(
    (0): Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2))
    (1): ReLU(inplace)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (3): Fire(
      (squeeze): Conv2d(96, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace)
    )
    (4): Fire(
      (squeeze): Conv2d(128, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace)
    )
    (5): Fire(
      (squeeze): Conv2d(128, 32, kerne

# DETR model head

After passing the image through the SqueezeNet CNN backbone, we pass the resulting feature maps to a transformer encoder-decoder, which outputs a set of box predictions:

1. CNN Backbone:
    - Input: Image
    - Output: Feature maps with multiple channels that are flattened to be passed into the encoder
2. Transformer encoder-decoder:
    - Encoder takes in the flattened layer and outputs the same size shape sequence.
    - Decoder takes in the output from the encoder but also takes in Object Queries
    #### Object Queries
        - Big N, represents the output we want i.e. if 4 tuples for 4 objects and their bounding boxes, we get that.
        - Each object query vector would ask the image a different question, e.g. what is in your right box, left box, lower-left box, lower-right box.
3. Feed Forward Network (FFN):


    
