##**Imports**

In [None]:
# Installs
#!pip install torch
#!pip install argparse

#mounting google drive to load the datasets and the reid folders present in QAConv
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/NN/QAConv

# Imports
import copy
import torch
import os
import sys
import argparse
import os.path as osp
import sys
import string
import time
import json
from torch import nn
from torch import Tensor
from torch.nn.init import xavier_uniform_
from torch.nn.modules import Module
from torch.nn.modules.container import ModuleList
from torch import einsum
from google.colab import drive
from __future__ import absolute_import
from typing import Optional, Any



from torch.nn import Module, ModuleList
import torchvision
from torch.nn.modules import TransformerEncoderLayer
from __future__ import print_function, absolute_import
from torch.backends import cudnn
import numpy as np
import scipy.io as sio


from torch.utils.data import DataLoader
from torchvision.transforms import InterpolationMode
from torch.optim.lr_scheduler import StepLR

#sys.path.append('QAConv')
from reid import datasets
from reid.trainers import Trainer
from reid.evaluators import Evaluator
from reid.utils.data import transforms as T
from reid.utils.data.preprocessor import Preprocessor
from reid.utils.logging import Logger
from reid.utils.serialization import load_checkpoint, save_checkpoint

from reid.utils.data.graph_sampler import GraphSampler
from reid.loss.pairwise_matching_loss import PairwiseMatchingLoss

Mounted at /content/drive/
/content/drive/MyDrive/NN/QAConv


##**TransformerDecoderLayer**
source : https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html#TransformerDecoder

In [None]:
class TransformerDecoderLayer(Module):
    """One TransMatcher decoder layer: scores every (tgt, memory) image pair.

    Both inputs go through the same linear projection (``fc1``). Every pair of
    spatial positions is compared with a dot product and the result is gated
    by the sigmoid of a learnable ``seq_len x seq_len`` prior score. A global
    max-pool along both position axes, BatchNorm and an MLP head then reduce
    each image pair to a single scalar.

    Args:
        seq_len: number of spatial positions (``h * w``) of a feature map.
        d_model: channel width of the incoming features (paper default 512).
        dim_feedforward: hidden width of the MLP head (paper default 2048).

    NOTE(review): the paper asks for a symmetric similarity. ``fc1`` is shared
    between both inputs, but the MLP head consumes the two max-pooled halves
    in a fixed order, so swapping ``tgt`` and ``memory`` does not give the
    transposed score (the smoke-test cell of this notebook shows this).
    """

    # The previous ``__constants__ = ['batch_first', 'norm_first']`` named
    # attributes this class never defines, so it has been dropped.

    # using default values for d_model (512) and dim_feedforward (2048) as mentioned in the paper
    def __init__(self, seq_len, d_model: int = 512, dim_feedforward: int = 2048):
        super(TransformerDecoderLayer, self).__init__()
        self.seq_len = seq_len
        self.d_model = d_model

        # The prior score embedding is a learnable (hw x hw) weight. It is
        # stored as (1, 1, hw, hw) so it broadcasts over the (q, k) pair axes.
        prior_score_weight = torch.randn(self.seq_len, self.seq_len)
        prior_score_weight = prior_score_weight.view(1, 1, self.seq_len, self.seq_len)
        self.learnable_prior_score_weight = nn.Parameter(prior_score_weight)

        # Layers as in figure 1 of the paper.
        self.fc1 = torch.nn.Linear(d_model, d_model, bias=True)
        self.bn1 = torch.nn.BatchNorm1d(2 * self.seq_len)
        self.mlphead = torch.nn.Sequential(
            torch.nn.Linear(2 * self.seq_len, dim_feedforward),
            torch.nn.BatchNorm1d(dim_feedforward),
            torch.nn.ReLU(),
            torch.nn.Linear(dim_feedforward, 1))
        self.bn3 = torch.nn.BatchNorm1d(1)
        # Not used by forward(); kept so the attribute remains available.
        self.relu = torch.nn.ReLU()

    def _check_features(self, name: str, features: Tensor) -> None:
        """Raise ``ValueError`` unless ``features`` is ``(n, h, w, d_model)`` with ``h * w == seq_len``."""
        if features.dim() != 4:
            raise ValueError(
                "%s must be 4-D (n, h, w, d), got shape %s" % (name, tuple(features.shape)))
        _, h, w, d = features.size()
        if h * w != self.seq_len or d != self.d_model:
            raise ValueError(
                "%s has shape %s but this layer expects h*w == %d and d == %d"
                % (name, tuple(features.shape), self.seq_len, self.d_model))

    def forward(self, tgt: Tensor, memory: Tensor) -> Tensor:
        """Score every image in ``tgt`` against every image in ``memory``.

        Args:
            tgt: features of shape ``(q, h, w, d_model)``.
            memory: features of shape ``(k, h, w, d_model)``.

        Returns:
            Tensor of shape ``(q, k)`` with one score per image pair.

        Raises:
            ValueError: if either input is not 4-D, or its ``h * w`` / ``d``
                do not match ``seq_len`` / ``d_model``.
        """
        # Fail early with a readable message instead of a matmul/reshape error.
        self._check_features("tgt", tgt)
        self._check_features("memory", memory)
        q, d = tgt.size(0), tgt.size(-1)
        k = memory.size(0)

        # (n, h, w, d) -> (n, h*w, d). reshape (not view) so that channel
        # slices produced by Tensor.chunk are accepted as well.
        tgt = tgt.reshape(q, -1, d)
        memory = memory.reshape(k, -1, d)

        # Shared projection for both inputs.
        query = self.fc1(tgt)
        gallery = self.fc1(memory)

        # Dot product between every query position t and gallery position s.
        mat_mul = einsum('qtd,ksd->qkst', query, gallery)

        # Gate the similarities with the sigmoid of the learnable prior score.
        final_score = mat_mul * self.learnable_prior_score_weight.sigmoid()

        # (q, k, s, t) -> (q*k, seq_len, seq_len)
        final_score = final_score.reshape(q * k, self.seq_len, self.seq_len)

        # Global max pooling along each position axis, concatenated.
        final_score = torch.cat((final_score.max(dim=1)[0], final_score.max(dim=2)[0]), dim=-1)

        final_score = self.bn1(final_score)
        final_score = self.mlphead(final_score)
        final_score = self.bn3(final_score)
        return final_score.view(q, k)
    

##**Transformer Decoder**
TransformerDecoder is a stack of N decoder layers

In [None]:
class TransformerDecoder1(Module):
    """Stack of decoder layers whose scores are summed (variant without chunking).

    Adapted from PyTorch's ``TransformerDecoder``. Each layer is a deep copy of
    ``decoder_layer`` and therefore has its own parameters.

    Args:
        decoder_layer: module called as ``decoder_layer(tgt, memory)``.
        num_layers: number of independent copies to stack.
        norm: optional module applied to the accumulated output.
    """

    __constants__ = ['norm']

    def __init__(self, decoder_layer, num_layers, norm=None):
        # Fixed: this used to call super(TransformerDecoder, self), i.e. a
        # different class, which fails when the object is constructed.
        super(TransformerDecoder1, self).__init__()
        self.layers = ModuleList([copy.deepcopy(decoder_layer) for _ in range(num_layers)])
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, tgt: Tensor, memory: Tensor) -> Tensor:
        """Return the sum of ``layer(tgt, memory)`` over all layers.

        Every layer receives the original ``tgt``. Previously the output of
        the first layer was fed to the next layer as ``tgt``, which cannot
        work for layers that map features to a differently shaped score.
        With no layers, ``tgt`` is returned (after ``norm`` if given).
        """
        output = tgt
        for i, mod in enumerate(self.layers):
            layer_score = mod(tgt, memory)
            # The first layer initialises the running score, the rest add to it.
            output = layer_score if i == 0 else output + layer_score

        if self.norm is not None:
            output = self.norm(output)
        return output

In [None]:
class TransformerDecoder(torch.nn.Module):
    """Stack of decoder layers, each fed its own channel slice of the features.

    The last dimension of ``tgt`` and ``memory`` is split into
    ``num_of_layers`` chunks; chunk ``i`` goes to layer ``i`` and the
    per-layer scores are summed.

    Args:
        decoder: module called as ``decoder(tgt_chunk, memory_chunk)``; it is
            deep-copied once per layer so the layers do not share weights.
        num_of_layers: number of decoder layers (and of feature chunks).
        norm: optional module applied to the summed score, which is reshaped
            to a single column for it.

    Raises:
        ValueError: if ``num_of_layers`` is smaller than 1.
    """

    # Declared at class level as in PyTorch's implementation; it used to be a
    # local variable inside __init__, where it had no effect.
    __constants__ = ['norm']

    def __init__(self, decoder, num_of_layers, norm=None):
        super(TransformerDecoder, self).__init__()
        if num_of_layers < 1:
            raise ValueError("num_of_layers must be >= 1, got %r" % (num_of_layers,))
        # creating decoder stacks
        self.num_of_layers = num_of_layers
        self.layers = ModuleList([copy.deepcopy(decoder) for _ in range(num_of_layers)])
        self.norm = norm

    def forward(self, tgt: Tensor, memory: Tensor) -> Tensor:
        """Sum the per-layer scores of ``tgt`` against ``memory``.

        Args:
            tgt: tensor whose last dimension holds ``num_of_layers`` feature
                groups side by side.
            memory: tensor laid out the same way.

        Returns:
            The summed score of all layers, passed through ``norm`` if set.

        Raises:
            ValueError: if the last dimension of either input cannot be split
                into ``num_of_layers`` chunks.
        """
        tgt_chunks = tgt.chunk(self.num_of_layers, dim=-1)
        memory_chunks = memory.chunk(self.num_of_layers, dim=-1)
        # Tensor.chunk silently returns fewer pieces when the dimension is too
        # small; catch that here rather than failing with an IndexError later.
        if len(tgt_chunks) != self.num_of_layers or len(memory_chunks) != self.num_of_layers:
            raise ValueError(
                "cannot split last dims %d (tgt) / %d (memory) into %d chunks"
                % (tgt.size(-1), memory.size(-1), self.num_of_layers))

        # As in the paper, the score of layer n is added to the score of layer n-1.
        score = None
        for layer, tgt_part, memory_part in zip(self.layers, tgt_chunks, memory_chunks):
            layer_score = layer(tgt_part, memory_part)
            score = layer_score if score is None else score + layer_score

        if self.norm is not None:
            q, k = score.size()
            # The norm is applied on a single column, so flatten the score
            # matrix first and restore its shape afterwards.
            score = self.norm(score.reshape(-1, 1)).reshape(q, k)
        return score


##**TransformerEncoder**

A stack of Transformer encoder layers.

In [None]:
# Similar to TransformerEncoder in official implementation
# try to remove clones and replace with 2 encoder layers - encoderlayer1 and encoderlayer2, encoderlayer2 ouput passed to decoder 1 2 3
class TransformerEncoder(Module):
    """Stack of ``num_layers`` deep copies of ``encoder_layer`` applied in sequence.

    Args:
        encoder_layer: module called as ``encoder_layer(src, mask, key_mask)``.
        num_layers: number of copies to stack.
        norm: optional module applied to the final output.
        enable_nested_tensor: stored on the instance only; not used here.
        mask_check: stored on the instance only; not used here.
    """

    __constants__ = ['norm']

    def __init__(self, encoder_layer, num_layers, norm=None, enable_nested_tensor=False, mask_check=False):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.enable_nested_tensor = enable_nested_tensor
        self.mask_check = mask_check

    # ``key_mask`` used to have no default while following a defaulted
    # parameter, which is a SyntaxError. It now defaults to None so the
    # encoder can also be called with ``src`` only.
    def forward(self, src: Tensor, mask: Optional[Tensor] = None, key_mask: Optional[Tensor] = None,
                is_causal: Optional[bool] = None) -> Tensor:
        """Pass ``src`` through every layer in order and return the last output.

        Args:
            src: input to the first layer.
            mask: forwarded to every layer as its second positional argument.
            key_mask: forwarded to every layer as its third positional argument.
            is_causal: accepted for signature compatibility; not used.
        """
        output = src
        for mod in self.layers:
            output = mod(output, mask, key_mask)
        if self.norm is not None:
            output = self.norm(output)
        return output
  
# function definition from original Transformer Decoder implementation
def _get_clones(module, N):
    return ModuleList([copy.deepcopy(module) for i in range(N)])

##**TransMatcher class - Similar to Transformer class**
This class will instantiate decoder, encoder and resnet50 as first encoder

Reference - https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html#TransformerEncoder

In [None]:
# ResNet-50 acts as the first encoder, as explained in the paper.
# Channel width of the feature map produced by each ResNet-50 stage.
feature_dims = {'layer2': 512, 'layer3': 1024, 'layer4': 2048}


class TransMatcher(Module):
    """End-to-end matcher: IBN-ResNet backbone, neck conv, encoders and decoders.

    The backbone output (after the 3x3 neck) is the first feature set. Each
    Transformer encoder layer adds one more. The feature sets are concatenated
    along the channel axis so that the decoder, which splits that axis into
    one chunk per decoder layer, pairs decoder layer ``i`` with feature set
    ``i``.

    Args:
        seq_len: number of spatial positions ``h * w`` of the final feature map.
        d_model: channel width used by the encoders and decoders.
        nhead: number of attention heads in each encoder layer.
        num_encoder_layers: number of Transformer encoder layers.
        num_decoder_layers: number of decoder layers; must be
            ``num_encoder_layers + 1`` (backbone features + one per encoder).
        dim_feedforward: hidden width of the encoder and decoder MLPs.
        final_layer: backbone stage to stop at; a key of ``feature_dims``.
        neck: output channels of the neck convolution; must equal ``d_model``.
        dropout: dropout probability of the encoder layers.

    Raises:
        ValueError: if ``final_layer`` is unknown, ``neck != d_model`` or the
            encoder/decoder layer counts are inconsistent.
    """

    def __init__(self, seq_len, d_model: int = 512, nhead: int = 1, num_encoder_layers: int = 2, num_decoder_layers: int = 3, dim_feedforward: int = 2048,
                 final_layer='layer3', neck: int = 512, dropout: float = 0.):
        super(TransMatcher, self).__init__()
        if final_layer not in feature_dims:
            raise ValueError("final_layer must be one of %s, got %r" % (sorted(feature_dims), final_layer))
        if neck != d_model:
            raise ValueError("neck (%d) must equal d_model (%d): the encoders run on the neck output" % (neck, d_model))
        if num_decoder_layers != num_encoder_layers + 1:
            raise ValueError(
                "num_decoder_layers (%d) must be num_encoder_layers + 1 (%d)"
                % (num_decoder_layers, num_encoder_layers + 1))

        self.memory = None
        self.seq_len = seq_len
        self.d_model = d_model
        self.final_layer = final_layer
        self.nhead = nhead

        # resnet50 with ibn as explained in the paper - https://github.com/XingangPan/IBN-Net
        self.backbone = torch.hub.load('XingangPan/IBN-Net', 'resnet50_ibn_b', pretrained=True)

        # 3x3 neck convolution appended to the backbone. Its input width
        # follows the selected backbone stage (was a hard-coded 1024 read
        # through the misspelled name ``layer3_few``).
        self.neck_conv = nn.Conv2d(feature_dims[final_layer], neck, kernel_size=3, padding=1)

        # encoders
        encoder_layer = TransformerEncoderLayer(self.d_model, self.nhead, dim_feedforward, dropout)
        encoder_norm = None  # None in author's implementation
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        # decoders
        self.decoder_layer = TransformerDecoderLayer(seq_len, d_model, dim_feedforward)
        decoder_norm = None  # the authors use a batch norm here; left for later experiments
        self.decoder = TransformerDecoder(self.decoder_layer, num_decoder_layers, decoder_norm)

        # Must run after the decoders exist. It used to run right after the
        # backbone was loaded and therefore only re-initialised the backbone.
        self.reset_parameters()

    # present in original implementation
    def reset_parameters(self):
        """Xavier-initialise the weight matrices of the decoder modules.

        The pretrained backbone is deliberately skipped: iterating over
        ``self.parameters()`` here would overwrite the weights just loaded
        from the hub.
        """
        for module in (self.decoder_layer, self.decoder):
            for p in module.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

    def _encode(self, images: Tensor) -> Tensor:
        """Return the concatenated feature sets of ``images``.

        The result is channel-last, ``(b, h, w, (num_encoder_layers + 1) * c)``:
        the neck output followed by the output of each encoder layer.
        """
        feat = images
        # Run the backbone stage by stage and stop after ``final_layer``.
        for name, module in self.backbone._modules.items():
            feat = module(feat)
            if name == self.final_layer:
                break

        feat = self.neck_conv(feat)
        b, c, h, w = feat.size()

        per_layer = [feat.permute(0, 2, 3, 1)]  # [b, h, w, c]
        # The encoder layers are built with the default batch_first=False and
        # therefore take [hw, b, c].
        seq = feat.flatten(2).permute(2, 0, 1)
        # Walk the encoder's layers directly because every intermediate
        # output is needed, not only the last one.
        for layer in self.encoder.layers:
            seq = layer(seq)
            per_layer.append(seq.permute(1, 0, 2).reshape(b, h, w, c))
        return torch.cat(per_layer, dim=-1)

    def forward(self, src: Tensor, tgt: Tensor) -> Tensor:
        """Score every query image in ``src`` against every gallery image in ``tgt``.

        Args:
            src: batch of query images for the backbone.
            tgt: batch of gallery images for the backbone.

        Returns:
            The decoder's score matrix with one row per query image and one
            column per gallery image.
        """
        # Query and gallery go through backbone, neck and encoders independently.
        out_query = self._encode(src)
        out_gallery = self._encode(tgt)
        return self.decoder(out_query, out_gallery)



##**TransMatcher class to call the decoder - This is based on paper DONT USE THIS**

Similar to Transformer class in original pytorch implementation of Transformers

In [None]:
# TransMatcher class to create the decoder
class TransMatcher(Module):
    """Decoder-only matcher: scores stored features against new features.

    Call ``make_kernel`` with one set of features, then call the module with
    another set to obtain the score matrix between the two.

    Args:
        seq_len: number of spatial positions ``h * w`` of a feature map.
        d_model: channel width handled by each decoder layer.
        num_decoder_layers: number of decoder layers (= feature chunks).
        dim_feedforward: hidden width of the decoder MLP head.
        final_layer, neck, nhead, dropout: accepted but not used by this class.
    """

    def __init__(self, seq_len, d_model=512, num_decoder_layers=3, dim_feedforward=2048, final_layer='layer3', neck=512, nhead=1, dropout: float = 0.):
        super(TransMatcher, self).__init__()
        self.decoder_layer = TransformerDecoderLayer(seq_len, d_model, dim_feedforward)
        decoder_norm = nn.BatchNorm1d(1)
        self.decoder = TransformerDecoder(self.decoder_layer, num_decoder_layers, decoder_norm)
        self.memory = None
        self.seq_len = seq_len
        self.d_model = d_model
        self.reset_parameters()
        # NOTE(review): this backbone is never used by forward(); it is kept
        # only so the ``base`` attribute stays available. Consider dropping it.
        self.base = torch.hub.load('XingangPan/IBN-Net', 'resnet50_ibn_b', pretrained=True)

    # present in original implementation
    def reset_parameters(self):
        """Xavier-initialise the weight matrices of the decoder modules.

        Only the decoder is touched. Looping over ``self.parameters()`` (as
        before) also overwrote the pretrained weights of ``self.base``.
        """
        for module in (self.decoder_layer, self.decoder):
            for p in module.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

    # present in QA Conv - just mapping memory to gallery
    def make_kernel(self, features):
        """Store ``features`` as the memory that later inputs are scored against."""
        self.memory = features

    def forward(self, features):
        """Return the decoder score of the stored memory against ``features``.

        The stored memory is passed as the decoder's ``tgt`` and ``features``
        as its ``memory``, so rows of the result index the stored memory.

        Raises:
            RuntimeError: if ``make_kernel`` has not been called yet.
        """
        if self.memory is None:
            raise RuntimeError("make_kernel() must be called before forward()")
        return self.decoder(self.memory, features)


##**Main function to test**

In [None]:
# This is just a test function to see if the above implementation works and it does
if __name__ == "__main__":
    # Smoke test: score gallery-vs-probe and probe-vs-gallery, time both runs,
    # then compare the first result with the transpose of the second.
    import time

    model = TransMatcher(24*8, 512, 3).eval()
    gallery = torch.rand((32, 24, 8, 512*3))
    probe = torch.rand((16, 24, 8, 512*3))

    results = []
    for kernel_features, input_features in ((gallery, probe), (probe, gallery)):
        start = time.time()
        model.make_kernel(kernel_features)
        scores = model(input_features)
        print(scores.size())
        end = time.time()
        print('Time: %.3f seconds.' % (end - start))
        results.append(scores)

    out, out2 = results
    out2 = out2.t()
    # Exact equality, then mean absolute difference, of the two directions.
    print((out2 == out).all())
    print((out2 - out).abs().mean())
    for block in (out, out2):
        print(block[:4, :4])

torch.Size([32, 16])
Time: 2.366 seconds.
torch.Size([16, 32])
Time: 2.945 seconds.
tensor(False)
tensor(3.0053, grad_fn=<MeanBackward0>)
tensor([[55.0166, 59.3318, 57.9900, 53.6307],
        [56.9376, 61.9847, 58.2995, 57.2540],
        [56.7064, 62.1524, 58.6920, 57.0373],
        [54.7663, 57.7237, 55.7484, 54.7164]], grad_fn=<SliceBackward0>)
tensor([[56.8486, 58.4968, 58.7244, 52.1372],
        [61.8187, 65.9041, 61.4339, 59.3565],
        [53.7679, 58.8465, 56.2316, 51.6803],
        [52.6982, 55.1715, 55.3523, 49.5685]], grad_fn=<SliceBackward0>)


##**Transformer Encoder & ResNet50-IBN class**

We have reused parts of the code from the authors' implementation and modified it to use only ResNet50 with IBN, since that is the experiment we have chosen to reproduce.

In [None]:
# Proper Implementation

# Transformer Encoder
# From pytorch official documentation
# https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html
# https://pytorch.org/tutorials/beginner/transformer_tutorial.html

class TransformerEncoder(torch.nn.Module):
    """Stack of encoder layers that returns the output of EVERY layer.

    The per-layer outputs are concatenated along the last dimension, so for
    ``num_layer`` layers of width ``d`` the result has width ``num_layer * d``.

    Args:
        encoder_layer: layer to replicate; called as
            ``layer(src, src_mask=..., src_key_padding_mask=...)``.
        num_layer: number of independent deep copies to stack.
        norm: optional module applied to each layer output before concatenation.
    """

    def __init__(self, encoder_layer, num_layer, norm=None):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layer)])
        self.num_layer = num_layer
        self.norm = norm

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None, key_mask: Optional[Tensor] = None) -> Tensor:
        """Run ``src`` through the stack and concatenate all layer outputs.

        Args:
            src: input sequence for the first layer.
            src_mask: optional attention mask, passed to every layer.
            key_mask: optional per-batch key padding mask, passed to every
                layer as ``src_key_padding_mask``.

        Returns:
            The outputs of all layers concatenated along the last dimension.
        """
        out = src
        out_list = []
        for layer in self.layers:
            # TransformerEncoderLayer names this argument
            # ``src_key_padding_mask``; the former ``key_mask=`` keyword
            # raised a TypeError.
            out = layer(out, src_mask=src_mask, src_key_padding_mask=key_mask)
            out_list.append(out)

        # Normalise each collected output. The earlier loop assigned into
        # ``out`` (the last output) instead of into the list.
        if self.norm is not None:
            out_list = [self.norm(layer_out) for layer_out in out_list]

        # concatenate the outputs into 1 tensor
        return torch.cat(out_list, dim=-1)

In [None]:
# ResNet-50 acts as the first encoder, as explained in the paper.
# Channel width of the feature map produced by each ResNet-50 stage.
feature_dims = {'layer2': 512, 'layer3': 1024, 'layer4': 2048}


class ResnetIBN(nn.Module):
    """IBN-ResNet-50 backbone with an optional neck conv and Transformer encoders.

    Args:
        final_layer: backbone stage to stop at; a key of ``feature_dims``.
        neck: output channels of the 3x3 neck convolution; ``<= 0`` disables it.
        nhead: number of attention heads in each encoder layer.
        num_encoder_layers: number of Transformer encoder layers; 0 disables them.
        dim_feedforward: hidden width of the encoder feed-forward block.
        dropout: dropout probability of the encoder layers.
        pretrained: whether to load the pretrained backbone weights.

    Attributes:
        num_features: channel width of one feature set (``neck`` when the neck
            is enabled, otherwise the width of ``final_layer``).
    """

    def __init__(self, final_layer='layer3', neck=512, nhead=1, num_encoder_layers=2, dim_feedforward=2048, dropout=0., pretrained=True):
        super(ResnetIBN, self).__init__()
        self.final_layer = final_layer
        self.neck = neck
        self.pretrained = pretrained
        # resnet50 with ibn as explained in the paper - https://github.com/XingangPan/IBN-Net
        # The ``pretrained`` argument is now honoured (it was hard-coded to True).
        self.base = torch.hub.load('XingangPan/IBN-Net', 'resnet50_ibn_b', pretrained=pretrained)

        # Width of the selected backbone stage. It is also the embedding
        # width when no neck is used (``embeddings`` was unbound in that case).
        embeddings = feature_dims[final_layer]

        # The neck refines the backbone features and reduces their width. The
        # paper appends a 3x3 convolution to the backbone for this.
        if neck > 0:
            self.neck_conv = nn.Conv2d(embeddings, neck, kernel_size=3, padding=1)
            embeddings = neck

        # Stack of encoders (2 by default). The backbone counts as the first encoder.
        self.encoder = None
        if num_encoder_layers > 0:
            encoder_layer = TransformerEncoderLayer(embeddings, nhead, dim_feedforward, dropout)
            encoder_norm = None
            self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
        self.num_features = embeddings

    def forward(self, inputs):
        """Return channel-last features for ``inputs``.

        Returns:
            Tensor ``[b, h, w, c]`` holding the (neck) backbone features. When
            encoders are present, whatever ``self.encoder`` returns is
            reshaped to ``[b, h, w, -1]`` and appended along the last axis.
        """
        # Run the backbone stage by stage and stop after ``final_layer``.
        x = inputs
        for name, module in self.base._modules.items():
            x = module(x)
            if name == self.final_layer:
                break

        if self.neck > 0:
            x = self.neck_conv(x)

        out = x.permute(0, 2, 3, 1)  # [b, h, w, c]
        if self.encoder is not None:
            b, c, h, w = x.size()
            y = x.view(b, c, -1).permute(2, 0, 1)  # [hw, b, c]
            # No masks are used; they are passed explicitly because the
            # encoder's forward may declare them as required positionals.
            y = self.encoder(y, None, None)
            y = y.permute(1, 0, 2).reshape(b, h, w, -1)  # [b, h, w, c]
            out = torch.cat((out, y), dim=-1)

        return out

##**Function to get training data and test data**
These function definitions are reused from the paper implementation
WE CAN USE THIS

In [None]:
def get_data(dataname, data_dir, model, matcher, save_path, args):
    """Create the dataset and its train, query and gallery data loaders.

    Args:
        dataname: dataset name understood by ``datasets.create``.
        data_dir: directory that contains one sub-directory per dataset.
        model, matcher, save_path: handed to the ``GraphSampler`` that drives
            the training loader.
        args: namespace providing ``combine_all``, ``height``, ``width``,
            ``min_size``, ``max_size``, ``batch_size``, ``workers``,
            ``num_instance``, ``test_gal_batch``, ``test_prob_batch``,
            ``gs_verbose`` and ``test_fea_batch``.

    Returns:
        ``(dataset, num_classes, train_loader, query_loader, gallery_loader)``.
    """
    dataset = datasets.create(dataname, osp.join(data_dir, dataname), combine_all=args.combine_all)
    num_classes = dataset.num_train_ids
    image_size = (args.height, args.width)

    # Training pipeline: resize, geometric and colour augmentation, occlusion.
    augmentations = [
        T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
        T.Pad(10),
        T.RandomCrop(image_size),
        T.RandomHorizontalFlip(0.5),
        T.RandomRotation(5),
        T.ColorJitter(brightness=(0.5, 2.0), contrast=(0.5, 2.0), saturation=(0.5, 2.0), hue=(-0.1, 0.1)),
        T.RandomOcclusion(args.min_size, args.max_size),
        T.ToTensor(),
    ]
    train_transformer = T.Compose(augmentations)

    # Evaluation pipeline: resize only.
    test_transformer = T.Compose([
        T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
    ])

    train_path = osp.join(dataset.images_dir, dataset.train_path)
    train_set = Preprocessor(dataset.train, root=train_path, transform=train_transformer)
    # The sampler receives the un-augmented transform together with the model and matcher.
    graph_sampler = GraphSampler(
        dataset.train, train_path, test_transformer, model, matcher,
        args.batch_size, args.num_instance, args.test_gal_batch,
        args.test_prob_batch, save_path, args.gs_verbose)
    train_loader = DataLoader(
        train_set,
        batch_size=args.batch_size,
        num_workers=args.workers,
        sampler=graph_sampler,
        pin_memory=True)

    def _eval_loader(samples, sub_path):
        # Deterministic, un-augmented loader shared by query and gallery.
        eval_set = Preprocessor(
            samples, root=osp.join(dataset.images_dir, sub_path), transform=test_transformer)
        return DataLoader(
            eval_set,
            batch_size=args.test_fea_batch,
            num_workers=args.workers,
            shuffle=False,
            pin_memory=True)

    query_loader = _eval_loader(dataset.query, dataset.query_path)
    gallery_loader = _eval_loader(dataset.gallery, dataset.gallery_path)

    return dataset, num_classes, train_loader, query_loader, gallery_loader


In [None]:
def get_test_data(dataname, data_dir, height, width, workers=8, test_batch=64):
    """Create a test dataset with its query and gallery data loaders.

    Args:
        dataname: dataset name understood by ``datasets.create``.
        data_dir: directory that contains one sub-directory per dataset.
        height: image height after resizing.
        width: image width after resizing.
        workers: number of data-loading worker processes.
        test_batch: batch size of both loaders.

    Returns:
        ``(dataset, query_loader, gallery_loader)``.
    """
    root = osp.join(data_dir, dataname)

    dataset = datasets.create(dataname, root, combine_all=False)

    test_transformer = T.Compose([
        # Same bicubic resize as get_data. The enum replaces the integer
        # code 3, which torchvision has deprecated.
        T.Resize((height, width), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
    ])

    query_loader = DataLoader(
        Preprocessor(dataset.query,
                     root=osp.join(dataset.images_dir, dataset.query_path), transform=test_transformer),
        batch_size=test_batch, num_workers=workers,
        shuffle=False, pin_memory=True)

    gallery_loader = DataLoader(
        Preprocessor(dataset.gallery,
                     root=osp.join(dataset.images_dir, dataset.gallery_path), transform=test_transformer),
        batch_size=test_batch, num_workers=workers,
        shuffle=False, pin_memory=True)

    return dataset, query_loader, gallery_loader

##**Main Function - ALREADY MODIFIED BASED ON OUR EXPERIMENT _ CAN USE THIS**
We have reused parts of this code from the authors' implementation and modified it to run our chosen experiment from the list of experiments provided in the paper.

NOTE - We need to modify it a bit more after adding the encoder and the transformer.

In [None]:
def main(args):
    """Train (unless ``args.evaluate``) and evaluate the matcher.

    Builds the ``ResnetIBN`` feature extractor and the ``TransMatcher`` head,
    optionally resumes from a checkpoint, trains for ``args.epochs`` epochs
    with SGD and a step LR schedule, saves a checkpoint after every epoch and
    finally evaluates on every dataset listed in ``args.testset``. Results
    are appended to ``results.json`` in the experiment output directory.

    Args:
        args: namespace with the experiment configuration (dataset names,
            image size, model and optimiser settings, batch sizes, paths).
    """
    cudnn.deterministic = False
    cudnn.benchmark = True

    exp_database_dir = osp.join(args.exp_dir, string.capwords(args.dataset))
    output_dir = osp.join(exp_database_dir, args.method, args.sub_method)
    log_file = osp.join(output_dir, 'log.txt')
    # Redirect print to both console and log file
    sys.stdout = Logger(log_file)

    # Base network: resnet50 with ibn plus the encoder layers. The paper uses
    # only N-1 encoders (2 here) because the backbone itself is the first
    # encoder. ``model`` and ``num_features`` were used below without ever
    # being defined.
    model = ResnetIBN(final_layer=args.final_layer, neck=args.neck, nhead=args.nhead,
                      num_encoder_layers=args.num_trans_layers - 1,
                      dim_feedforward=args.dim_feedforward,
                      dropout=getattr(args, 'dropout', 0.)).cuda()
    num_features = model.num_features

    # Down-sampling factor of each backbone stage, as in the authors' code.
    feamap_factor = {'layer2': 8, 'layer3': 16, 'layer4': 32}
    hei = args.height // feamap_factor[args.final_layer]
    wid = args.width // feamap_factor[args.final_layer]
    # change the arguments to transmatcher after final implementation
    matcher = TransMatcher(hei * wid, num_features, args.num_trans_layers, args.dim_feedforward).cuda()

    # Criterion - the authors' own loss function, taken from QAConv-GS as explained in the paper
    criterion = PairwiseMatchingLoss(matcher).cuda()

    # Optimizer - https://pytorch.org/docs/stable/optim.html
    base_param_ids = set(map(id, model.base.parameters()))
    new_params = [p for p in model.parameters() if
                  id(p) not in base_param_ids]
    param_groups = [
        {'params': model.base.parameters(), 'lr': 0.1 * args.lr},  # the paper specifies lr 0.0005 for the backbone
        {'params': new_params, 'lr': args.lr},
        {'params': matcher.parameters(), 'lr': args.lr}]

    optimizer = torch.optim.SGD(param_groups, lr=args.lr, momentum=0.9)

    # Load from checkpoint
    start_epoch = 0

    if args.resume or args.evaluate:
        print('Loading checkpoint...')
        if args.resume and (args.resume != 'ori'):
            checkpoint = load_checkpoint(args.resume)
        else:
            checkpoint = load_checkpoint(osp.join(output_dir, 'checkpoint.pth.tar'))
        model.load_state_dict(checkpoint['model'])
        criterion.load_state_dict(checkpoint['criterion'])
        optimizer.load_state_dict(checkpoint['optim'])
        start_epoch = checkpoint['epoch']

        print("=> Start epoch {} ".format(start_epoch))

    model = nn.DataParallel(model).cuda()

    # Create data loaders
    save_path = None
    if args.gs_save:
        save_path = output_dir
    dataset, num_classes, train_loader, _, _ = get_data(args.dataset, args.data_dir, model, matcher, save_path, args)

    # Decay LR by a factor of 0.1 every step_size epochs
    lr_scheduler = StepLR(optimizer, step_size=args.step_size, gamma=0.1, last_epoch=start_epoch-1)

    # Defaults for the summary below, so that a run whose start_epoch already
    # reaches args.epochs does not fail on unbound names.
    loss = acc = float('nan')
    train_time = 0.0
    epoch1 = start_epoch

    if not args.evaluate:
        # Trainer
        trainer = Trainer(model, criterion, args.clip_value)
        t0 = time.time()

        # Start training
        for epoch in range(start_epoch, args.epochs):
            loss, acc = trainer.train(epoch, train_loader, optimizer)

            # Read the learning rates before the scheduler updates them.
            lr = list(map(lambda group: group['lr'], optimizer.param_groups))
            lr_scheduler.step()
            train_time = time.time() - t0
            epoch1 = epoch + 1

            print(
                '* Finished epoch %d at lr=[%g, %g, %g]. Loss: %.3f. Acc: %.2f%%. Training time: %.0f seconds.                  \n'
                % (epoch1, lr[0], lr[1], lr[2], loss, acc * 100, train_time))

            # model is wrapped in DataParallel, so save the inner module's weights.
            save_checkpoint({
                'model': model.module.state_dict(),
                'criterion': criterion.state_dict(),
                'optim': optimizer.state_dict(),
                'epoch': epoch1,
            }, fpath=osp.join(output_dir, 'checkpoint.pth.tar'))

    json_file = osp.join(output_dir, 'results.json')

    if not args.evaluate:
        arg_dict = {'train_dataset': args.dataset, 'exp_dir': args.exp_dir, 'method': args.method, 'sub_method': args.sub_method}
        with open(json_file, 'a') as f:
            json.dump(arg_dict, f)
            f.write('\n')
        train_dict = {'train_dataset': args.dataset, 'loss': loss, 'acc': acc, 'epochs': epoch1, 'train_time': train_time}
        with open(json_file, 'a') as f:
            json.dump(train_dict, f)
            f.write('\n')

    # Final test
    print('Evaluate the learned model:')
    t0 = time.time()

    # Evaluator
    evaluator = Evaluator(model)

    test_names = args.testset.strip().split(',')
    for test_name in test_names:
        if test_name not in datasets.names():
            print('Unknown dataset: %s.' % test_name)
            continue

        t1 = time.time()
        testset, test_query_loader, test_gallery_loader = \
            get_test_data(test_name, args.data_dir, args.height, args.width, args.workers, args.test_fea_batch)

        if not args.do_tlift:
            testset.has_time_info = False

        test_rank1, test_mAP, test_rank1_rerank, test_mAP_rerank, test_rank1_tlift, test_mAP_tlift, test_dist, \
        test_dist_rerank, test_dist_tlift, pre_tlift_dict = \
            evaluator.evaluate(matcher, testset, test_query_loader, test_gallery_loader,
                               args.test_gal_batch, args.test_prob_batch,
                               args.tau, args.sigma, args.K, args.alpha)

        test_time = time.time() - t1

        test_dict = {'test_dataset': test_name, 'rank1': test_rank1, 'mAP': test_mAP, 'test_time': test_time}
        print('  %s: rank1=%.1f, mAP=%.1f.\n' % (test_name, test_rank1 * 100, test_mAP * 100))

        with open(json_file, 'a') as f:
            json.dump(test_dict, f)
            f.write('\n')

    test_time = time.time() - t0

    if not args.evaluate:
        print('Finished training at epoch %d, loss = %.3f, acc = %.2f%%.\n'
              % (epoch1, loss, acc * 100))
        # max(..., 1) avoids a division by zero when no epoch was run.
        print("Total training time: %.3f sec. Average training time per epoch: %.3f sec." % (
            train_time, train_time / max(epoch1 - start_epoch, 1)))
    print("Total testing time: %.3f sec.\n" % test_time)


%%capture
# pass command line arguments directly in the code cell
# epoch - 15 for market dataset and 4 for randperson as metioned in the paper
# All the values of the arguments specified here are as exactly in the paper
working_dir = os.path.dirname(os.path.abspath("main.ipynb"))
args = argparse.Namespace(dataset='market', testset='cuhk03_np_detected', batch_size=64, workers=8, height=384, width=128, final_layer='layer3', neck=512,
                          ibn='b', nhead=1, num_trans_layers=3, dim_feedforward=2048, min_size=0, max_size=0.8, lr=0.005, epochs=15, step_size=10, clip_value=4,
                          num_instance=4, evaluate=False, test_fea_batch=256, test_gal_batch=128, test_prob_batch=128, data_dir=osp.join(working_dir, 'data'),
                         exp_dir=osp.join(working_dir, 'Exp'), method='TransMatcher', sub_method='res50-ibnb-layer3', arch='resnet50',resume='', gs_save=False, combine_all=False, gs_verbose=True)

main(args)

In [None]:
def main(args):
    """Train the matcher (unless ``args.evaluate``) and evaluate it on the test sets.

    Configuration is read from ``args`` (an ``argparse.Namespace``-like object).
    Attributes read directly in this function: ``exp_dir``, ``dataset``,
    ``method``, ``sub_method``, ``height``, ``width``, ``lr``, ``resume``,
    ``evaluate``, ``gs_save``, ``data_dir``, ``step_size``, ``clip_value``,
    ``testset``, ``workers``, ``test_fea_batch``, ``test_gal_batch``,
    ``test_prob_batch``, ``tau``, ``sigma``, ``K`` and ``alpha``.
    Optional attributes: ``epochs`` (defaults to 15) and ``do_tlift``
    (defaults to False). ``args`` is also handed to ``get_data`` as a whole.

    Side effects: a checkpoint is saved to
    ``<exp_dir>/<Dataset>/<method>/<sub_method>/checkpoint.pth.tar`` after every
    epoch, and one JSON object per line is appended to ``results.json`` in the
    same directory.

    NOTE(review): ``model`` is read below before anything in this function
    assigns it, and because it is re-bound later (``nn.DataParallel``) Python
    treats it as a local, so the first use raises ``UnboundLocalError``. The
    backbone constructor is not visible from here; it must be created
    (and moved to the GPU) before the optimizer section - TODO confirm which
    constructor this notebook uses.
    """
    # gpu stuff
    cudnn.deterministic = False
    cudnn.benchmark = True

    # Output directory for the checkpoint and the results file.
    exp_database_dir = osp.join(args.exp_dir, string.capwords(args.dataset))
    output_dir = osp.join(exp_database_dir, args.method, args.sub_method)
    # results.json is opened in append mode below; make sure its folder exists.
    os.makedirs(output_dir, exist_ok=True)

    # The feature map is down-sampled by a factor of 16 at layer3, as in the authors' code.
    hei = args.height // 16
    wid = args.width // 16

    # Arguments for the TransMatcher class
    seq_len = hei * wid
    d_model = 512
    nhead = 1
    num_encoder_layers = 2
    num_decoder_layers = 3
    dim_feedforward = 2048
    dropout = 0.

    # Build the matcher (the original call misspelled seq_len / d_model).
    matcher = TransMatcher(seq_len, d_model, nhead, num_encoder_layers, num_decoder_layers,
                           dim_feedforward, dropout).cuda()

    # Criterion - the authors' own loss function, taken from QAConv-GS as explained in the paper
    criterion = PairwiseMatchingLoss(matcher).cuda()

    # Optimizer - https://pytorch.org/docs/stable/optim.html
    # Taken from the authors' code with a few modifications.
    # NOTE(review): `model` has not been created at this point - see the docstring.
    base_param_ids = set(map(id, model.base.parameters()))
    new_params = [p for p in model.parameters() if
                  id(p) not in base_param_ids]
    param_groups = [
        # The backbone is trained with a 10x smaller learning rate (0.0005 in the paper).
        {'params': model.base.parameters(), 'lr': 0.1 * args.lr},
        {'params': new_params, 'lr': args.lr},
        {'params': matcher.parameters(), 'lr': args.lr}]

    optimizer = torch.optim.SGD(param_groups, lr=args.lr, momentum=0.9)

    # Load from checkpoint
    start_epoch = 0

    if args.resume or args.evaluate:
        print('Loading checkpoint...')
        # resume == 'ori' (or evaluate without resume) means "use the checkpoint in output_dir".
        if args.resume and (args.resume != 'ori'):
            checkpoint = load_checkpoint(args.resume)
        else:
            checkpoint = load_checkpoint(osp.join(output_dir, 'checkpoint.pth.tar'))
        model.load_state_dict(checkpoint['model'])
        criterion.load_state_dict(checkpoint['criterion'])
        optimizer.load_state_dict(checkpoint['optim'])
        start_epoch = checkpoint['epoch']

        print("=> Start epoch {} ".format(start_epoch))

    # parallel computation
    model = nn.DataParallel(model).cuda()

    # Create data loaders
    # Same as in the authors' code
    save_path = None
    if args.gs_save:
        save_path = output_dir
    dataset, num_classes, train_loader, _, _ = get_data(args.dataset, args.data_dir, model, matcher, save_path, args)

    # Decay LR by a factor of 0.1 every step_size epochs
    lr_scheduler = StepLR(optimizer, step_size=args.step_size, gamma=0.1, last_epoch=start_epoch-1)

    # Number of epochs comes from the arguments (15 for market in the paper);
    # fall back to the previously hard-coded 15 when the attribute is absent.
    num_epochs = getattr(args, 'epochs', 15)

    # Defaults so the reporting code below is well-defined even when the loop
    # body never runs (e.g. resuming from a checkpoint that already finished).
    loss = acc = None
    epoch1 = start_epoch
    train_time = 0.0

    # Training starts
    if not args.evaluate:
        # Trainer
        trainer = Trainer(model, criterion, args.clip_value)
        t0 = time.time()

        for epoch in range(start_epoch, num_epochs):
            loss, acc = trainer.train(epoch, train_loader, optimizer)

            # Record the learning rates used for this epoch before stepping the scheduler.
            lr = [group['lr'] for group in optimizer.param_groups]
            lr_scheduler.step()
            train_time = time.time() - t0
            epoch1 = epoch + 1

            print(
                '* Finished epoch %d at lr=[%g, %g, %g]. Loss: %.3f. Acc: %.2f%%. Training time: %.0f seconds.                  \n'
                % (epoch1, lr[0], lr[1], lr[2], loss, acc * 100, train_time))

            # model is wrapped in DataParallel, so the real weights live in model.module.
            save_checkpoint({
                'model': model.module.state_dict(),
                'criterion': criterion.state_dict(),
                'optim': optimizer.state_dict(),
                'epoch': epoch1,
            }, fpath=osp.join(output_dir, 'checkpoint.pth.tar'))

    trained_epochs = epoch1 - start_epoch
    json_file = osp.join(output_dir, 'results.json')

    if not args.evaluate:
        arg_dict = {'train_dataset': args.dataset, 'exp_dir': args.exp_dir, 'method': args.method, 'sub_method': args.sub_method}
        with open(json_file, 'a') as f:
            json.dump(arg_dict, f)
            f.write('\n')
            # Only record training statistics when at least one epoch actually ran.
            if trained_epochs > 0:
                train_dict = {'train_dataset': args.dataset, 'loss': loss, 'acc': acc, 'epochs': epoch1, 'train_time': train_time}
                json.dump(train_dict, f)
                f.write('\n')

    # Evaluation starts
    print('Evaluate the learned model:')
    t0 = time.time()

    # Evaluator
    evaluator = Evaluator(model)

    # Neither argument cell in this notebook defines do_tlift; treat a missing flag as False.
    do_tlift = getattr(args, 'do_tlift', False)

    test_names = args.testset.strip().split(',')
    for test_name in test_names:
        if test_name not in datasets.names():
            print('Unknown dataset: %s.' % test_name)
            continue

        t1 = time.time()
        testset, test_query_loader, test_gallery_loader = \
            get_test_data(test_name, args.data_dir, args.height, args.width, args.workers, args.test_fea_batch)

        if not do_tlift:
            testset.has_time_info = False

        # Only rank-1 and mAP are reported; the remaining returned values are unused here.
        test_rank1, test_mAP, *_ = \
            evaluator.evaluate(matcher, testset, test_query_loader, test_gallery_loader,
                               args.test_gal_batch, args.test_prob_batch,
                               args.tau, args.sigma, args.K, args.alpha)

        test_time = time.time() - t1

        test_dict = {'test_dataset': test_name, 'rank1': test_rank1, 'mAP': test_mAP, 'test_time': test_time}
        print('  %s: rank1=%.1f, mAP=%.1f.\n' % (test_name, test_rank1 * 100, test_mAP * 100))

        with open(json_file, 'a') as f:
            json.dump(test_dict, f)
            f.write('\n')

    test_time = time.time() - t0

    if not args.evaluate:
        if trained_epochs > 0:
            print('Finished training at epoch %d, loss = %.3f, acc = %.2f%%.\n'
                  % (epoch1, loss, acc * 100))
            print("Total training time: %.3f sec. Average training time per epoch: %.3f sec." % (
                train_time, train_time / trained_epochs))
        else:
            # Avoids formatting None and dividing by zero when nothing was trained.
            print('No training epochs were run (start epoch %d, requested epochs %d).\n'
                  % (start_epoch, num_epochs))
    print("Total testing time: %.3f sec.\n" % test_time)


    %%capture
#!pip install argparse
import os
import sys

# Pass the "command line" arguments directly in this code cell.
# epochs: 15 for the market dataset and 4 for randperson, as mentioned in the paper.
# All the values of the arguments specified here are exactly as in the paper.
working_dir = os.path.dirname(os.path.abspath("main.ipynb"))
args = argparse.Namespace(
    # datasets and input geometry
    dataset='market', testset='cuhk03_np_detected', batch_size=64, workers=8,
    height=384, width=128, arch='resnet50',
    # augmentation and optimisation
    min_size=0, max_size=0.8, lr=0.005, epochs=15, step_size=10, clip_value=4,
    num_instance=4,
    # evaluation; tau/sigma/K/alpha are forwarded by main() to evaluator.evaluate().
    # Each test_*_batch keyword is given exactly once: the original call repeated
    # them, which Python rejects as "keyword argument repeated".
    evaluate=False, test_fea_batch=256, test_gal_batch=128, test_prob_batch=128,
    tau=100, sigma=200, K=10, alpha=0.2,
    # main() reads args.do_tlift before evaluating each test set
    do_tlift=False,
    # paths and bookkeeping
    data_dir=osp.join(working_dir, 'data'), exp_dir=osp.join(working_dir, 'Exp'),
    method='TransMatcher', sub_method='res50-ibnb-layer3', resume='',
    gs_save=False, combine_all=False, gs_verbose=True,
)

main(args)