## **Imports**

In [None]:
# Installs
!pip install torch
!pip install argparse

In [None]:
# Mounting google drive to load the datasets and the reid folders present in QAConv
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/NN/QAConv # If reproducing, change this to your own path

In [None]:
# Imports
import copy
import torch
import os
import sys
import argparse
import os.path as osp
import sys
import string
import time
import json
from torch import nn
from torch import Tensor
from torch.nn.init import xavier_uniform_
from torch.nn.modules import Module
from torch.nn.modules.container import ModuleList
from torch import einsum
from google.colab import drive
from __future__ import absolute_import
from typing import Optional, Any
from torch.nn import Module, ModuleList
import torchvision
from torch.nn.modules import TransformerEncoderLayer
from __future__ import print_function, absolute_import
from torch.backends import cudnn
import numpy as np
import scipy.io as sio
from torch.utils.data import DataLoader
from torchvision.transforms import InterpolationMode
from torch.optim.lr_scheduler import StepLR
from reid import datasets
from reid.trainers import Trainer
from reid.evaluators import Evaluator
from reid.utils.data import transforms as T
from reid.utils.data.preprocessor import Preprocessor
from reid.utils.logging import Logger
from reid.utils.serialization import load_checkpoint, save_checkpoint
from reid.utils.data.graph_sampler import GraphSampler
from reid.loss.pairwise_matching_loss import PairwiseMatchingLoss

## **TransformerDecoderLayer**
source : https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html#TransformerDecoder

In [None]:
class TransformerDecoderLayer(Module):
    """Decoder layer that scores every (query, gallery) pair of feature maps.

    The layer projects both feature maps with a shared linear layer, computes
    the dot product between every pair of spatial locations, gates the result
    with a learnable ``seq_len x seq_len`` prior (through a sigmoid), max-pools
    along both location axes and feeds the pooled vector to two MLP heads that
    produce one score per pair.

    Args:
        seq_len: number of spatial locations (h * w) of the incoming feature maps.
        d_model: channel dimension of the incoming feature maps.
        dim_feedforward: hidden width of the first MLP head.
    """
    __constants__ = ['batch_first', 'norm_first']

    def __init__(self, seq_len, d_model, dim_feedforward):
        super(TransformerDecoderLayer, self).__init__()
        self.seq_len = seq_len  # seq_len = h * w
        self.d_model = d_model  # number of channels

        # Learnable prior score embedding of size (hw, hw). It is created before
        # the other sub-modules so the random-number consumption order is unchanged.
        prior_score_weight = torch.randn(self.seq_len, self.seq_len)
        self.learnable_prior_score_weight = nn.Parameter(prior_score_weight)

        # Shared projection applied to both the query and the gallery features.
        self.fc1 = torch.nn.Linear(d_model, d_model, bias=True)
        # Normalises the concatenated max-pooled scores (two vectors of length hw).
        self.bn1 = torch.nn.BatchNorm1d(2 * self.seq_len)
        # Two MLP heads: (2 * hw -> dim_feedforward) followed by (dim_feedforward -> 1).
        self.mlphead1 = torch.nn.Sequential(
            torch.nn.Linear(2 * self.seq_len, dim_feedforward),
            torch.nn.BatchNorm1d(dim_feedforward),
            torch.nn.ReLU(),
        )
        self.mlphead2 = torch.nn.Sequential(
            torch.nn.Linear(dim_feedforward, 1),
            torch.nn.BatchNorm1d(1),
        )

    def forward(self, tgt: Tensor, memory: Tensor, prev_score: Tensor) -> Tensor:
        """Return the accumulated matching score for every query/gallery pair.

        Args:
            tgt: query features of shape (q, h, w, d).
            memory: gallery features of shape (k, h, w, d).
            prev_score: score carried over from the previous decoder; it is added
                to the (q, k) result and is not modified.

        Returns:
            Tensor of shape (q, k).
        """
        q, _, _, d = tgt.size()
        k, _, _, d = memory.size()

        # Flatten the spatial grid: (q, h, w, d) -> (q, hw, d). reshape() is used
        # instead of view() so that non-contiguous inputs (e.g. permuted backbone
        # outputs) are accepted; it only copies when a view is impossible.
        tgt = tgt.reshape(q, -1, d)
        memory = memory.reshape(k, -1, d)

        # Shared fully connected projection of both inputs.
        query = self.fc1(tgt)
        gallery = self.fc1(memory)

        # Pairwise dot products: entry [q, k, s, t] compares gallery location s
        # of image k with query location t of image q.
        mat_mul = einsum('q t d, k s d -> q k s t', query, gallery)

        # Gate the raw similarities with the sigmoid of the learnable prior
        # (broadcast over the q and k axes).
        prior_score_sig = self.learnable_prior_score_weight.sigmoid()
        final_score = mat_mul * prior_score_sig

        # (q, k, s, t) -> (q * k, hw, hw)
        final_score = final_score.reshape(q * k, self.seq_len, self.seq_len)

        # Global max pooling in both directions, concatenated to (q * k, 2 * hw):
        # best gallery match per query location and best query match per gallery location.
        final_score = torch.cat((final_score.max(dim=1)[0], final_score.max(dim=2)[0]), dim=-1)

        final_score = self.bn1(final_score)
        final_score = self.mlphead1(final_score)
        final_score = self.mlphead2(final_score)

        # (q * k, 1) -> (q, k), the layout the pairwise matching loss is fed with.
        final_score = final_score.reshape(q, k)
        # Out-of-place add: avoids mutating a tensor that autograd may still reference.
        return final_score + prev_score
    

## **TransMatcher class**

Similar to Transformer class implementation of pytorch
- Has encoder from pytorch (Transformer Encoder Layer)
- Our custom Decoder (Transformer Decoder Layer)

In [None]:
class TransMatcher(Module):
    """Two shared encoder layers plus three pairwise decoder layers.

    The raw input features are scored by the first decoder, the output of the
    first encoder by the second decoder and the output of the second encoder
    by the third decoder; each decoder adds its score to the previous one.

    Args:
        seq_len: number of spatial locations (h * w) of the input feature maps.
        d_model: channel dimension of the input feature maps.
        dim_feedforward: feed-forward width used by the encoder and decoder layers.
    """

    def __init__(self, seq_len, d_model: int = 512, dim_feedforward: int = 2048):
        super(TransMatcher, self).__init__()
        self.seq_len = seq_len
        nhead = 1  # single attention head in the encoders
        dropout = 0.
        # N - 1 encoders and N decoders with N = 3: the first decoder consumes
        # the un-encoded input features directly.
        self.encoder_layer1 = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.encoder_layer2 = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.decoder_layer1 = TransformerDecoderLayer(self.seq_len, d_model, dim_feedforward)
        self.decoder_layer2 = TransformerDecoderLayer(self.seq_len, d_model, dim_feedforward)
        self.decoder_layer3 = TransformerDecoderLayer(self.seq_len, d_model, dim_feedforward)
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-uniform initialise every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)

    def forward(self, query_en: Tensor, gallery_en: Tensor) -> Tensor:
        """Score every query image against every gallery image.

        Args:
            query_en: query feature maps of shape (q, h, w, d).
            gallery_en: gallery feature maps of shape (k, h, w, d).

        Returns:
            Tensor of shape (q, k) holding the score accumulated over the three decoders.
        """
        q, h, w, d = query_en.size()
        k, h, w, d = gallery_en.size()

        # Encoder input layout is (hw, batch, d). reshape() also accepts
        # non-contiguous inputs such as permuted backbone outputs.
        query_en_re = query_en.reshape(q, -1, d).permute(1, 0, 2)
        gallery_en_re = gallery_en.reshape(k, -1, d).permute(1, 0, 2)

        # Encoder 1 (weights shared between query and gallery)
        query_out_1 = self.encoder_layer1(query_en_re)
        gallery_out_1 = self.encoder_layer1(gallery_en_re)

        # Encoder 2
        query_out_2 = self.encoder_layer2(query_out_1)
        gallery_out_2 = self.encoder_layer2(gallery_out_1)

        # Decoder 1 works on the raw features; the initial score is zero and is
        # allocated on the input's device instead of unconditionally on CUDA.
        score_0 = torch.zeros(q, k, device=query_en.device)
        score_1 = self.decoder_layer1(query_en, gallery_en, score_0)

        # Decoder 2: (hw, batch, d) -> (batch, h, w, d)
        query_out_re1 = query_out_1.permute(1, 0, 2).reshape(q, h, w, -1)
        gallery_out_re1 = gallery_out_1.permute(1, 0, 2).reshape(k, h, w, -1)
        score_2 = self.decoder_layer2(query_out_re1, gallery_out_re1, score_1)

        # Decoder 3
        query_out_re2 = query_out_2.permute(1, 0, 2).reshape(q, h, w, -1)
        gallery_out_re2 = gallery_out_2.permute(1, 0, 2).reshape(k, h, w, -1)
        score_out = self.decoder_layer3(query_out_re2, gallery_out_re2, score_2)
        return score_out

## **Resnet50-ibn-b Class**
This class contains the resnet50-ibn-b from https://github.com/XingangPan/IBN-Net and the 3x3 neck convolution as explained in the paper
- The query and gallery images are passed through the resnet50-ibn-b layer 3 (output dimensions 1024) and the corresponding feature embeddings are passed through an additional 3x3 neck convolution layer (output dimensions - 512) before being passed to the first encoder and the first decoder


In [None]:
class ResnetConv(Module):
    """Wrap a backbone and append a 3x3 neck convolution (1024 -> 512 channels).

    The wrapped model's child modules are applied in registration order up to
    and including the one named ``layer3``; later children are never called.
    """

    def __init__(self, model):
        super(ResnetConv, self).__init__()
        # 3x3 neck convolution; padding=1 keeps the spatial size unchanged.
        self.neck_conv = nn.Conv2d(1024, 512, kernel_size=3, padding=1)
        self.model = model

    def forward(self, inputs):
        """Return neck features in channel-last layout (batch, h, w, 512)."""
        features = inputs
        for layer_name, layer in self.model._modules.items():
            features = layer(features)
            if layer_name == 'layer3':
                # Stop once the layer3 embeddings are available.
                break
        # (batch, d, h, w) -> (batch, h, w, d), the layout the matcher expects.
        return self.neck_conv(features).permute(0, 2, 3, 1)

## **Functions to get training data and test data**
These function definitions are reused from the paper implementation

In [None]:
def get_data(dataname, data_dir, model, matcher, save_path, args):
    """Create the dataset and its train / query / gallery data loaders.

    Args:
        dataname: dataset name passed to ``datasets.create``; also the sub-folder of ``data_dir``.
        data_dir: directory that contains the dataset folders.
        model, matcher, save_path: forwarded unchanged to ``GraphSampler``.
        args: namespace providing height, width, min_size, max_size, batch_size,
            workers, num_instance, test_gal_batch, test_prob_batch,
            test_fea_batch and gs_verbose.

    Returns:
        Tuple ``(dataset, num_classes, train_loader, query_loader, gallery_loader)``.
    """
    dataset = datasets.create(dataname, osp.join(data_dir, dataname), combine_all=False)
    image_size = (args.height, args.width)

    # Training-time augmentation pipeline.
    augmentations = [
        T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
        T.Pad(10),
        T.RandomCrop(image_size),
        T.RandomHorizontalFlip(0.5),
        T.RandomRotation(5),
        T.ColorJitter(brightness=(0.5, 2.0), contrast=(0.5, 2.0), saturation=(0.5, 2.0), hue=(-0.1, 0.1)),
        T.RandomOcclusion(args.min_size, args.max_size),
        T.ToTensor(),
    ]
    train_transformer = T.Compose(augmentations)

    # Deterministic pipeline: resize and tensor conversion only.
    test_transformer = T.Compose([
        T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
    ])

    # The sampler receives the test-time transform together with the model and matcher.
    train_path = osp.join(dataset.images_dir, dataset.train_path)
    graph_sampler = GraphSampler(
        dataset.train, train_path, test_transformer, model, matcher,
        args.batch_size, args.num_instance,
        args.test_gal_batch, args.test_prob_batch, save_path, args.gs_verbose)
    train_set = Preprocessor(dataset.train, root=train_path, transform=train_transformer)
    train_loader = DataLoader(
        train_set,
        batch_size=args.batch_size,
        num_workers=args.workers,
        sampler=graph_sampler,
        pin_memory=True)

    query_set = Preprocessor(
        dataset.query,
        root=osp.join(dataset.images_dir, dataset.query_path),
        transform=test_transformer)
    query_loader = DataLoader(
        query_set,
        batch_size=args.test_fea_batch,
        num_workers=args.workers,
        shuffle=False,
        pin_memory=True)

    gallery_set = Preprocessor(
        dataset.gallery,
        root=osp.join(dataset.images_dir, dataset.gallery_path),
        transform=test_transformer)
    gallery_loader = DataLoader(
        gallery_set,
        batch_size=args.test_fea_batch,
        num_workers=args.workers,
        shuffle=False,
        pin_memory=True)

    return dataset, dataset.num_train_ids, train_loader, query_loader, gallery_loader


In [None]:
def get_test_data(dataname, data_dir, height, width, workers=8, test_batch=64):
    """Create a test dataset together with its query and gallery loaders.

    Args:
        dataname: dataset name passed to ``datasets.create``; also the sub-folder of ``data_dir``.
        data_dir: directory that contains the dataset folders.
        height, width: size every image is resized to.
        workers: number of DataLoader worker processes.
        test_batch: batch size of both loaders.

    Returns:
        Tuple ``(dataset, query_loader, gallery_loader)``.
    """
    root = osp.join(data_dir, dataname)
    dataset = datasets.create(dataname, root, combine_all=False)

    # Use the InterpolationMode enum (as get_data does) rather than the legacy
    # integer code 3, which denotes the same bicubic filter.
    test_transformer = T.Compose([
        T.Resize((height, width), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
    ])

    def make_loader(samples, sub_path):
        # Query and gallery loaders differ only in their sample list and image folder.
        return DataLoader(
            Preprocessor(samples,
                         root=osp.join(dataset.images_dir, sub_path), transform=test_transformer),
            batch_size=test_batch, num_workers=workers,
            shuffle=False, pin_memory=True)

    query_loader = make_loader(dataset.query, dataset.query_path)
    gallery_loader = make_loader(dataset.gallery, dataset.gallery_path)

    return dataset, query_loader, gallery_loader

## **Training Loop**
We have reused parts of this code from QAConv-GS and modified it to run our chosen experiment from the list of experiments provided in the paper


In [None]:
def train(args):
    """Train the backbone and the TransMatcher and checkpoint after every epoch.

    Args:
        args: namespace that must provide ``dataset``, ``exp_dir``, ``method`` and
            ``sub_method`` (they define the output directory and are logged) plus
            everything ``get_data`` reads. Optional attributes, with the previously
            hard-coded values as fall-backs: ``data_dir`` (``<cwd>/data``),
            ``epochs`` (4), ``step_size`` (2) and ``clip_value`` (4).

    Side effects:
        Writes ``checkpoint_<epoch>.pth.tar`` and appends to ``results.json`` in
        ``<exp_dir>/<Dataset>/<method>/<sub_method>``. Requires CUDA.
    """
    # --- Run configuration -------------------------------------------------
    # Paths and names come from ``args`` so that the checkpoints end up where
    # eval_loop looks for them and where the JSON log says they are.
    working_dir = os.path.dirname(os.path.abspath("OurTransformerDecoder.ipynb"))
    train_dataset = args.dataset
    data_dir = getattr(args, 'data_dir', None) or osp.join(working_dir, 'data')
    output_dir = osp.join(args.exp_dir, string.capwords(train_dataset), args.method, args.sub_method)
    num_epochs = getattr(args, 'epochs', 4)
    decay_epoch = getattr(args, 'step_size', 2)  # epoch at which the learning rates are decayed once
    clip_value = getattr(args, 'clip_value', 4)
    backbone_lr = 0.0005  # learning rate of the backbone parameters
    matcher_lr = 0.005    # learning rate of the matcher parameters

    cudnn.deterministic = False
    cudnn.benchmark = True

    # Arguments for the TransMatcher class. seq_len is h * w of the backbone
    # feature map (presumably 24 x 8 for 384 x 128 inputs — TODO confirm against the backbone stride).
    seq_len = 24 * 8
    d_model = 512
    dim_feedforward = 2048

    # --- Model, loss and optimizer ----------------------------------------
    matcher = TransMatcher(seq_len, d_model, dim_feedforward).cuda()
    resnet50_ibn = torch.hub.load('XingangPan/IBN-Net', 'resnet50_ibn_b', pretrained=True)
    backbone = ResnetConv(resnet50_ibn).cuda()
    # Pairwise matching loss reused from QAConv-GS; it is given the matcher.
    criterion = PairwiseMatchingLoss(matcher).cuda()

    # Two parameter groups with different learning rates.
    base_param_ids = set(map(id, backbone.parameters()))
    new_params = [p for p in matcher.parameters() if id(p) not in base_param_ids]
    param_groups = [
        {'params': backbone.parameters(), 'lr': backbone_lr},
        {'params': new_params, 'lr': matcher_lr}]
    optimizer = torch.optim.SGD(param_groups, lr=matcher_lr, momentum=0.9)

    start_epoch = 0  # no resume-from-checkpoint support

    model_bb = nn.DataParallel(backbone).cuda()
    save_path = None

    # --- Data ---------------------------------------------------------------
    _, _, train_loader, _, _ = get_data(train_dataset, data_dir, model_bb, matcher, save_path, args)

    # --- Training -----------------------------------------------------------
    trainer = Trainer(model_bb, criterion, clip_value)
    os.makedirs(output_dir, exist_ok=True)  # results.json is written here
    json_file = osp.join(output_dir, 'results.json')
    t0 = time.time()

    loss = acc = None
    train_time = 0.0
    epoch1 = start_epoch
    for epoch in range(start_epoch, num_epochs):
        # Decay every learning rate by 0.1 once, when ``decay_epoch`` is reached.
        if epoch == decay_epoch:
            print('Decay the learning rate by a factor of 0.1.')
            for group in optimizer.param_groups:
                group['lr'] *= 0.1

        # The trainer is handed the optimizer and is expected to perform the
        # per-batch updates itself. No additional optimizer.step() is issued
        # here: it would re-apply whatever gradients the last batch left behind.
        loss, acc = trainer.train(epoch, train_loader, optimizer)
        print(loss)
        lrs = [group['lr'] for group in optimizer.param_groups]
        train_time = time.time() - t0
        epoch1 = epoch + 1

        print(
            '* Finished epoch %d at lr=[%g, %g]. Loss: %.3f. Acc: %.2f%%. Training time: %.0f seconds.\n'
            % (epoch1, lrs[0], lrs[1], loss, acc * 100, train_time))

        save_checkpoint({
            'model': model_bb.module.state_dict(),
            'criterion': criterion.state_dict(),
            'optim': optimizer.state_dict(),
            'epoch': epoch1,
        }, fpath=osp.join(output_dir, 'checkpoint_' + str(epoch1) + '.pth.tar'))

        # Append the run description and this epoch's metrics as two JSON lines.
        arg_dict = {'train_dataset': args.dataset, 'exp_dir': args.exp_dir, 'method': args.method, 'sub_method': args.sub_method}
        train_dict = {'train_dataset': args.dataset, 'loss': loss, 'acc': acc, 'epochs': epoch1, 'train_time': train_time}
        with open(json_file, 'a') as f:
            json.dump(arg_dict, f)
            f.write('\n')
            json.dump(train_dict, f)
            f.write('\n')

    if epoch1 == start_epoch:
        # Nothing was trained, so there are no metrics to summarise.
        print('No training epochs were run.')
        return

    print('Finished training at epoch %d, loss = %.3f, acc = %.2f%%.\n'
              % (epoch1, loss, acc * 100))
    print("Total training time: %.3f sec. Average training time per epoch: %.3f sec." % (
            train_time, train_time / (epoch1 - start_epoch)))
 




## **Testing loop**

Certain parts reused from QAConv-GS and modified

In [None]:
# eval

def eval_loop(args):
  """Evaluate the trained model on every dataset listed in ``args.testset``.

  Args:
      args: namespace providing ``exp_dir``, ``dataset``, ``method``, ``sub_method``
          (they locate the checkpoint), ``testset`` (comma-separated names),
          ``data_dir``, ``height``, ``width``, ``workers``, ``test_fea_batch``,
          ``test_gal_batch``, ``test_prob_batch``, ``tau``, ``sigma``, ``K`` and
          ``alpha``. If ``epochs`` is present, ``checkpoint_<epochs>.pth.tar`` is
          loaded; otherwise ``checkpoint_15.pth.tar``.

  Side effects:
      Appends one JSON line per evaluated dataset to ``eval.json`` in the
      experiment output directory. Requires CUDA.
  """
  cudnn.deterministic = False
  cudnn.benchmark = True
  print('Evaluate the learned model:')

  exp_database_dir = osp.join(args.exp_dir, string.capwords(args.dataset))
  output_dir = osp.join(exp_database_dir, args.method, args.sub_method)

  # Arguments for the TransMatcher class
  seq_len = 24 * 8
  d_model = 512
  dim_feedforward = 2048

  # Load the checkpoint of the last training epoch. The number follows
  # ``args.epochs`` so it matches what a run of that length actually writes.
  final_epoch = getattr(args, 'epochs', 15)
  checkpoint = load_checkpoint(osp.join(output_dir, 'checkpoint_%d.pth.tar' % final_epoch))

  matcher = TransMatcher(seq_len, d_model, dim_feedforward).cuda()
  resnet50_ibn = torch.hub.load('XingangPan/IBN-Net', 'resnet50_ibn_b', pretrained=True)
  backbone = ResnetConv(resnet50_ibn).cuda()
  criterion = PairwiseMatchingLoss(matcher).cuda()

  # Restore the trained weights. The criterion was built around ``matcher``, so
  # the matcher weights are presumably restored through the criterion state
  # (assumes PairwiseMatchingLoss registers the matcher as a sub-module — TODO confirm).
  # No optimizer is rebuilt: evaluation never updates parameters.
  backbone.load_state_dict(checkpoint['model'])
  criterion.load_state_dict(checkpoint['criterion'])

  model_bb = nn.DataParallel(backbone).cuda()
  evaluator = Evaluator(model_bb)
  json_file = osp.join(output_dir, 'eval.json')

  for test_name in args.testset.strip().split(','):
      test_name = test_name.strip()
      if not test_name:
          continue
      if test_name not in datasets.names():
          print('Unknown dataset: %s.' % test_name)
          continue

      # Everything below runs once per known test set (it used to sit after
      # the loop, so only the last name was ever evaluated).
      t1 = time.time()
      testset, test_query_loader, test_gallery_loader = \
          get_test_data(test_name, args.data_dir, args.height, args.width, args.workers, args.test_fea_batch)

      # TLift is not used, so the time information is switched off.
      testset.has_time_info = False

      # Only rank-1 and mAP are reported; the remaining returned values
      # (re-ranking / TLift results and distance matrices) are ignored.
      test_rank1, test_mAP, *_ = evaluator.evaluate(
          matcher, testset, test_query_loader, test_gallery_loader,
          args.test_gal_batch, args.test_prob_batch,
          args.tau, args.sigma, args.K, args.alpha)

      test_time = time.time() - t1

      test_dict = {'test_dataset': test_name, 'rank1': test_rank1, 'mAP': test_mAP, 'test_time': test_time}
      print('  %s: rank1=%.1f, mAP=%.1f.\n' % (test_name, test_rank1 * 100, test_mAP * 100))

      with open(json_file, 'a') as f:
          json.dump(test_dict, f)
          f.write('\n')

## **Main function to call train and eval loop**

In [None]:
%%capture

import os
import sys

if __name__ == '__main__':
  # The run configuration is written out here instead of being parsed from a
  # command line. epochs=4 is the randperson setting (the notebook notes 15
  # for the market dataset); per the original notebook's note the remaining
  # values follow the paper.
  working_dir = os.path.dirname(os.path.abspath("OurTransformerDecoder.ipynb"))
  run_config = dict(
      # datasets and input geometry
      dataset='randperson', testset='cuhk03_np_detected',
      batch_size=64, workers=8, height=384, width=128,
      # augmentation and optimisation
      min_size=0, max_size=0.8, lr=0.005, epochs=4, step_size=2, clip_value=4,
      # evaluation parameters forwarded to the evaluator
      tau=100, sigma=200, K=10, alpha=0.2,
      # sampling and batching
      num_instance=4, evaluate=False,
      test_fea_batch=256, test_gal_batch=128, test_prob_batch=128,
      # locations and experiment naming
      data_dir=osp.join(working_dir, 'data'),
      exp_dir=osp.join(working_dir, 'Exp'),
      method='TransMatcher', sub_method='res50-ibnb-layer3',
      arch='resnet50', resume='',
      gs_save=False, combine_all=False, gs_verbose=False,
  )
  args = argparse.Namespace(**run_config)

  # Train first, then evaluate the resulting checkpoint.
  train(args)
  eval_loop(args)