In [109]:
import sys
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torchvision

from torchvision. models import vgg16

torch.manual_seed(777)

<torch._C.Generator at 0x249de623150>

In [110]:
# base 경로
os.chdir("D:/2021/2학기 수업/CV/VPR/NetVLAD_hhd")

In [111]:
print(os.getcwd())

D:\2021\2학기 수업\CV\VPR\NetVLAD_hhd


In [112]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [132]:
class NetVLAD(nn.Module):
    
    def __init__(self, num_clusters = 12, dim = 128, alpha = 100.0, normalize_input = True):
        super(NetVLAD, self).__init__()
        self.num_clusters = num_clusters
        self.dim = dim
        self.alpha = alpha
        self.normalize_input = normalize_input
        self.conv = nn.Conv2d(dim, num_clusters, 1, bias = False) # 논문에서는 bias: True, --> False 는 version 2
        self.centroids = nn.Parameter(torch.rand(num_clusters, dim))
        self._init_params()
        
    def _init_params(self):
        # w_k = 2 * alpha * c_k
        self.conv.weight = nn.Parameter(
            (2.0 * self.alpha * self.centroids).unsqueeze(-1).unsqueeze(-1)
        )
        
    def forward(self, x):
        
        # N = 3: 3장의 이미지 (img, pos, neg) ; C = 512 (VGG 16 기준)
        N, C = x.shape[:2]
        
        if self.normalize_input:
            x = F.normalize(x, p = 2, dim = 1)
        
        # soft-assignment
        soft_assign = self.conv(x).view(N, self.num_clusters, -1) # (3, 16, (W x H))
        soft_assign = F.softmax(soft_assign, dim = 1)
        
        x_flatten = x.view(N, C, -1) # (3, 512, (W x H))
        
        vlad = torch.zeros([N, self.num_clusters, C], dtype = x.dtype, layout = x.layout, device = x.device) # (3 x 16 x 512) 
        
        # x_flatten.unsqueeze(0).permute(1, 0, 2, 3): (3, 1, 512, (W x H))
        # centroids: 16 x 512 --> 16개의 centroid, 512 차원
        # centroids[0, :]: 1 x 512 --> 0 번째 centroid의 좌표
        # centroids[0, :].expand(x_flatten.size(-1), -1, -1): ((W x H), 1, 512)        
        for C in range(self.num_clusters):
            residual = x_flatten.unsqueeze(0).permute(1, 0, 2, 3) - \
                self.centroids[C:C + 1, :].expand(x_flatten.size(-1), -1, -1).permute(1, 2, 0).unsqueeze(0)
            residual *= soft_assign[:, C:C+1, :].unsqueeze(2)
            vlad[:, C:C+1, :] = residual.sum(dim = -1)
            
        vlad = F.normalize(vlad, p = 2, dim = 2)
        vlad = vlad.view(x.size(0), -1)
        vlad = F.normalize(vlad, p = 2, dim = 1)
        
        return vlad

In [133]:
a = torch.tensor([[1, 2, 3]])
print(a.shape)
a = a.expand(10, -1, -1)
print(a)

torch.Size([1, 3])
tensor([[[1, 2, 3]],

        [[1, 2, 3]],

        [[1, 2, 3]],

        [[1, 2, 3]],

        [[1, 2, 3]],

        [[1, 2, 3]],

        [[1, 2, 3]],

        [[1, 2, 3]],

        [[1, 2, 3]],

        [[1, 2, 3]]])


In [134]:
encoder = vgg16(pretrained = True)
# vgg 16 layer 중 conv5-3 까지의 layer만 사용. (512개의 3 x 3 x 512 커널 필터 & ReLU는 사용 안 한 상태)
layers = list(encoder.features.children())[:-2]
#print(layers[-1])
for l in layers[:-5]:
    for p in l.parameters():
        p.requires_grad = False

model = nn.Module()

encoder = nn.Sequential(*layers)
model.add_module('encoder', encoder)

dim = list(encoder.parameters())[-1].shape[0]

#print(dim)
#print(list(encoder.parameters())[-3].shape)

net_vlad = NetVLAD(num_clusters = 16, dim = dim)
model.add_module('pool', net_vlad)

model = model.cuda()

In [135]:
load_model = torch.load('./pittsburgh_checkpoint.pth.tar')
model.load_state_dict(load_model['state_dict'])
#print(load_model['state_dict'])

<All keys matched successfully>

In [136]:
import torch.utils.data as data
import torchvision.transforms as transforms

from random import choice
from os.path import join, exists
from collections import namedtuple
from scipy.io import loadmat
from PIL import Image
from sklearn.neighbors import NearestNeighbors

In [137]:
def parse_dbStruct(path):
    mat = loadmat(path)
    
    matStruct = mat['dbStruct'][0]
    
    dataset = 'dataset'
    
    whichSet = 'VPR'
    
    dbImage = matStruct[0]
    locDb = matStruct[1]
    
    qImage = matStruct[2]
    locQ = matStruct[3]
    
    numDb = matStruct[4].item()
    numQ = matStruct[5].item()
    
    posDistThr = matStruct[6].item()
    posDistSqThr = matStruct[7].item()
    
    return dbStruct(whichSet, dataset, dbImage, locDb, qImage, locQ, numDb, numQ, posDistThr, posDistSqThr)

dbStruct = namedtuple('dbStruct', ['whichSet', 'dataset', 'dbImage', 'locDb', 'qImage', 'locQ', 'numDb',
                                   'numQ', 'posDistThr', 'posDistSqThr'])

class BerlinDataset(data.Dataset):
    
    def __init__(self, condition = 'train'):
        self.dbStruct = parse_dbStruct('berlin.mat')
        self.input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean = [0.485, 0.456, 0.406],
                                 std = [0.229, 0.224, 0.225]),
        ])
        
        self.condition = condition
        
        if self.condition == 'train':
            self.images = [join(os.getcwd(), dblm.replace(' ','')) for dblm in self.dbStruct.dbImage]
        elif self.condition == 'test':
            self.images = [join(os.getcwd(), qlm.replace(' ','')) for qlm in self.dbStruct.qImage]
        else:
            self.images = [join(os.getcwd(), dblm.replace(' ','')) for dblm in self.dbStruct.dbImage]
            
        self.positives = None
        self.distances = None
        
        self.getPositives()
        
    def __getitem__(self, idx):
        
        if self.condition == 'train':
            img = Image.open(self.images[idx])
            img = self.input_transform(img)
            
            pos_list = self.positives[idx].tolist()
            pos_list.remove(idx)
            pos_idx = self.positives[idx][np.random.randint(0, len(self.positives[idx]))]
            pos_img = Image.open(self.images[pos_idx])
            pos_img = self.input_transform(pos_img)
            
            pos_list = pos_list + [idx]
            neg_idx = choice([i for i in range(len(self.images)) if i not in pos_list])
            neg_img = Image.open(self.images[neg_idx])
            neg_img = self.input_transform(neg_img)
            img = torch.stack([img, pos_img, neg_img], dim=0)
            label = torch.Tensor([0, 0, 1])
            
            return img, label
        
        
        elif self.condition == 'test':
            img = Image.open(self.images[idx])
            img = self.input_transform(img)
            
            return img
        
        else:
            img = Image.open(self.images[idx])
            img = self.input_transform(img)
            
            return img
        
    def __len__(self):
        return len(self.images)
    
    def getPositives(self):
        if self.condition == 'train':
            knn = NearestNeighbors(n_jobs = 1)
            knn.fit(self.dbStruct.locDb)
            
            self.distances, self.positives = knn.radius_neighbors(self.dbStruct.locDb, radius=self.dbStruct.posDistThr)
        else:
            knn = NearestNeighbors(n_jobs = 1)
            knn.fit(self.dbStruct.locDb)
            
            self.distances, self.positives = knn.radius_neighbors(self.dbStruct.locQ, radius=self.dbStruct.posDistThr)
        
        return self.positives

In [138]:
train_dataset = BerlinDataset(condition = 'train')
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = 1, shuffle = True, num_workers = 0)

In [139]:
class AvgMeter(object):
    def __init__(self):
        self.reset()
    
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        
    def update(self, val, n = 1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [140]:
epochs = 5
global_batch_size = 8
lr = 0.00001
momentum = 0.9
weightDecay = 0.001
losses = AvgMeter()
best_loss = 100.0
margin = 0.1

criterion = nn.TripletMarginLoss(margin = margin ** 0.5, p = 2, reduction = 'sum').cuda()
optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr = lr, 
                            momentum = momentum, weight_decay = weightDecay)

model.train()

for epoch in range(epochs):
    for batch_idx, (train_image, train_label) in enumerate(train_loader):
        output_train = model.encoder(train_image.squeeze().cuda())
        
        #if batch_idx == 0:
            #print(train_image.squeeze().shape) # 3 x 3 x 480 x 640: img 3장(img, pos, neg), dim(channel), H, W
            #print(output_train.shape) # 3 x 512 x 30 x 40: H, W 의 크기 1/2^4 으로 줄어듬. vgg16 참고
            
        output_train = model.pool(output_train)
        
        #if batch_idx == 0:
            #print(output_train.shape)
        triplet_loss = criterion(output_train[0].reshape(1, -1),
                                 output_train[1].reshape(1, -1),
                                 output_train[2].reshape(1, -1))
        
        if batch_idx == 0:
            optimizer.zero_grad()
            #print(triplet_loss.item())
        
        triplet_loss.backward(retain_graph = True)
        losses.update(triplet_loss.item())
        
        if (batch_idx + 1) % global_batch_size == 0:
            for name, p in model.named_parameters():
                if p.requires_grad:
                    p.grad /= global_batch_size
                    #if batch_idx == 7:
                        #print(p.shape)
                
                optimizer.step()
                optimizer.zero_grad()
                
        if batch_idx % 20 == 0 and batch_idx != 0:
            print('epoch : {}, batch_idx : {}, triplet_loss : {}'.format(epoch, batch_idx, losses.avg))
    
    if best_loss > losses.avg:
        best_path = 'out_model/best_model.pt'
        torch.save(model.state_dict(), best_path)
    
    model_save_name = 'out_model/model_{:02d}.pt'.format(epoch)
    torch.save(model.state_dict(), model_save_name)

epoch : 0, batch_idx : 20, triplet_loss : 0.13946509928930373
epoch : 0, batch_idx : 40, triplet_loss : 0.1362961792364353
epoch : 0, batch_idx : 60, triplet_loss : 0.15165122512911186
epoch : 0, batch_idx : 80, triplet_loss : 0.14034086171491647
epoch : 0, batch_idx : 100, triplet_loss : 0.13942886402111243
epoch : 0, batch_idx : 120, triplet_loss : 0.13726824324978285
epoch : 0, batch_idx : 140, triplet_loss : 0.1331700530458004
epoch : 0, batch_idx : 160, triplet_loss : 0.1355095076264802
epoch : 0, batch_idx : 180, triplet_loss : 0.13900799612972617
epoch : 0, batch_idx : 200, triplet_loss : 0.14055891327597014
epoch : 0, batch_idx : 220, triplet_loss : 0.1403128102893743
epoch : 0, batch_idx : 240, triplet_loss : 0.14231352044339002
epoch : 0, batch_idx : 260, triplet_loss : 0.14059329535312579
epoch : 0, batch_idx : 280, triplet_loss : 0.13697510639543636
epoch : 0, batch_idx : 300, triplet_loss : 0.13731027679189892
epoch : 1, batch_idx : 20, triplet_loss : 0.13362771005772833
e

In [142]:
from tqdm import tqdm

cluster_dataset = BerlinDataset(condition = 'cluster')
cluster_loader = torch.utils.data.DataLoader(cluster_dataset, batch_size = 1, shuffle = False, num_workers = 0)

train_feature_list = list()

model.eval()

with torch.no_grad():
    for batch_idx, train_img in tqdm(enumerate(cluster_loader)):
        output_train = model.encoder(train_img.cuda())
        output_train = model.pool(output_train)
        train_feature_list.append(output_train.squeeze().detach().cpu().numpy())

train_feature_list = np.array(train_feature_list)

314it [00:09, 33.85it/s]


In [143]:
test_dataset = BerlinDataset(condition = 'test')
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = 1, shuffle = False, num_workers = 0)

In [144]:
test_feature_list = list()

with torch.no_grad():
    for batch_idx, test_img in tqdm(enumerate(test_loader)):
        output_test = model.encoder(test_img.cuda())
        output_test = model.pool(output_test)
        test_feature_list.append(output_test.squeeze().detach().cpu().numpy())
        
test_feature_list = np.array(test_feature_list)

280it [00:09, 28.08it/s]


In [145]:
import faiss

n_values = [1, 5, 10, 20]
faiss_index = faiss.IndexFlatL2(train_feature_list.shape[1])
faiss_index.add(train_feature_list)
_, predictions = faiss_index.search(test_feature_list, max(n_values))

In [146]:
import json

file_path = "./submit.json"

data = {}
data['Query'] = list()

for i in range(len(predictions)):
    data_t = [("id",i),("positive",predictions[i].tolist())]
    data_t = dict(data_t)
    data['Query'].append(data_t)

with open(file_path, 'w') as outfile:
    json.dump(data, outfile, indent=4)