In [1]:
import torch
from torch.utils.data import Dataset
import h5py
import json
import os
from PIL import Image
import torchvision.transforms as transforms
import faiss
import numpy as np

In [2]:
class TrainRetrievalDataset(Dataset):
    """
    A PyTorch Dataset over the TRAIN split images, to be used in a PyTorch DataLoader.

    Each item is ``(image_tensor, i)``: the transformed image together with its
    position ``i`` in the TRAIN image-path list, so that consumers can map a batch
    element back to the dataset entry it came from.
    """

    def __init__(self, data_folder, data_name):
        """
        :param data_folder: folder where data files are stored
        :param data_name: base name of processed datasets
        """
        self.data_folder = data_folder

        # Image paths of the TRAIN split; they are resolved against data_folder in __getitem__.
        with open(os.path.join(data_folder, "TRAIN" + '_IMGPATHS_' + data_name + '.json'), 'r') as j:
            self.imgpaths = json.load(j)

        # Total number of datapoints
        self.dataset_size = len(self.imgpaths)

        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],  # mean=IMAGENET_IMAGES_MEAN, std=IMAGENET_IMAGES_STD
                                 std=[0.229, 0.224, 0.225])
        ])

    def __getitem__(self, i):
        """
        :param i: position of the image in the TRAIN image-path list
        :return: tuple (transformed image, i)
        """
        img_path = os.path.join(self.data_folder, self.imgpaths[i])
        # The context manager releases the file handle once the pixels are loaded.
        # convert("RGB") guarantees three channels: the Normalize step above is
        # configured with 3-element mean/std and cannot handle grayscale or RGBA input.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        img = self.transform(img)
        return img, i

    def __len__(self):
        """
        :return: number of images in the TRAIN split
        """
        return self.dataset_size

In [3]:
class ImageRetrieval():
    """
    Nearest-neighbour image retrieval backed by a faiss flat L2 index.

    On construction every batch of ``train_dataloader_images`` is encoded, the encoder
    output is averaged over all positions except the first (batch) and last (feature)
    dimension, and the resulting vectors are added to the index.
    ``imgs_indexes_of_dataloader`` keeps, for each row of the index, the dataset index
    that the dataloader yielded for that image, so search results can be mapped back.

    The encoder is called as given: this class does not move it to ``device`` and does
    not change its train/eval mode, so the caller should do both beforehand.
    """

    def __init__(self, dim_examples, encoder, train_dataloader_images, device):
        """
        :param dim_examples: dimensionality of the vectors stored in the index
        :param encoder: callable mapping an image batch to a tensor whose last dimension is the feature dimension
        :param train_dataloader_images: iterable of (images, image_indexes) batches
        :param device: device the image batches and the index mapping are moved to
        """
        self.datastore = faiss.IndexFlatL2(dim_examples)  # datastore
        self.encoder = encoder

        self.device = device
        # Row r of the datastore corresponds to dataset index imgs_indexes_of_dataloader[r].
        self.imgs_indexes_of_dataloader = torch.tensor([]).long().to(device)

        self._add_examples(train_dataloader_images)

    @staticmethod
    def _as_faiss_array(vectors):
        """Return `vectors` as a C-contiguous float32 array, the layout faiss expects."""
        return np.ascontiguousarray(vectors, dtype=np.float32)

    def _add_examples(self, train_dataloader_images):
        """
        Encode every batch and append the pooled vectors and their dataset indexes to the datastore.

        :param train_dataloader_images: iterable of (images, image_indexes) batches
        """
        print("\nadding input examples to datastore (retrieval)")
        # Collect the per-batch index tensors and concatenate once at the end;
        # concatenating inside the loop re-copies the whole tensor on every batch.
        collected_indexes = [self.imgs_indexes_of_dataloader]

        # No gradients are needed to fill the datastore. Without no_grad, an encoder
        # with trainable parameters yields tensors on which .numpy() raises.
        with torch.no_grad():
            for i, (imgs, imgs_indexes) in enumerate(train_dataloader_images):
                imgs = imgs.to(self.device)
                imgs_indexes = imgs_indexes.long().to(self.device)
                encoder_output = self.encoder(imgs)

                # (batch, ..., features) -> (batch, positions, features) -> mean over positions
                encoder_output = encoder_output.view(encoder_output.size()[0], -1, encoder_output.size()[-1])
                input_img = encoder_output.mean(dim=1)

                self.datastore.add(self._as_faiss_array(input_img.cpu().numpy()))

                if i % 5 == 0:
                    print("i and img index of ImageRetrival", i, imgs_indexes)
                    print("n of examples", self.datastore.ntotal)
                collected_indexes.append(imgs_indexes)

        self.imgs_indexes_of_dataloader = torch.cat(collected_indexes)

    def retrieve_nearest_for_train_query(self, query_img, k=2):
        """
        Return, for each query row, the dataset index of its SECOND nearest stored vector.

        The closest hit is skipped on the assumption that a training query is itself
        stored in the datastore and therefore matches itself first.

        :param query_img: 2-D array of query vectors, one per row
        :param k: number of neighbours to search for; must be at least 2 because column 1 of the result is used
        :return: tensor of dataset indexes, one per query row
        """
        D, I = self.datastore.search(self._as_faiss_array(query_img), k)  # actual search
        nearest_input = self.imgs_indexes_of_dataloader[I[:, 1]]
        return nearest_input

    def retrieve_nearest_for_val_or_test_query(self, query_img, k=1):
        """
        Return, for each query row, the dataset index of its nearest stored vector.

        :param query_img: 2-D array of query vectors, one per row
        :param k: number of neighbours to search for; only the closest one is returned
        :return: tensor of dataset indexes, one per query row
        """
        D, I = self.datastore.search(self._as_faiss_array(query_img), k)  # actual search
        nearest_input = self.imgs_indexes_of_dataloader[I[:, 0]]
        return nearest_input

In [4]:
import torch
from torch import nn
import torchvision
import fasttext
import numpy as np

class Encoder(nn.Module):
    """
    Image encoder: an ImageNet-pretrained ResNet-101 trunk followed by adaptive average pooling.
    """

    def __init__(self, encoded_image_size=14):
        super().__init__()
        self.enc_image_size = encoded_image_size

        # ImageNet-pretrained ResNet-101; its last two children (pooling and linear
        # classifier) are dropped because no classification is performed here.
        backbone = torchvision.models.resnet101(pretrained=True)
        self.resnet = nn.Sequential(*list(backbone.children())[:-2])

        # Pool to a fixed spatial grid so that input images may vary in size.
        pooled_grid = (encoded_image_size, encoded_image_size)
        self.adaptive_pool = nn.AdaptiveAvgPool2d(pooled_grid)

        self.fine_tune()

    def forward(self, images):
        """
        Encode a batch of images.

        :param images: images, a tensor of dimensions (batch_size, 3, image_size, image_size)
        :return: encoded images, channels last: (batch_size, encoded_image_size, encoded_image_size, 2048)
        """
        feature_maps = self.resnet(images)  # (batch_size, 2048, image_size/32, image_size/32)
        pooled = self.adaptive_pool(feature_maps)  # (batch_size, 2048, encoded_image_size, encoded_image_size)
        return pooled.permute(0, 2, 3, 1)

    def fine_tune(self, fine_tune=True):
        """
        Freeze the whole trunk, then allow or prevent gradients for convolutional blocks 2 through 4.

        :param fine_tune: whether blocks 2 through 4 should receive gradients
        """
        # Start from a fully frozen trunk.
        for weight in self.resnet.parameters():
            weight.requires_grad_(False)

        # Children from position 5 onwards are the blocks that may be fine-tuned.
        tunable_blocks = list(self.resnet.children())[5:]
        for block in tunable_blocks:
            for weight in block.parameters():
                weight.requires_grad_(fine_tune)

In [5]:
import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
from src.configs.datasets import FeaturesDataset

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Data parameters
# folder with data files saved by create_input_files.py
# Can be overridden with the DATA_FOLDER environment variable so the notebook also
# runs on machines where the default absolute path below does not exist.
data_folder = os.environ.get(
    'DATA_FOLDER',
    '/home/starksultana/Documentos/MEIC/5o_ano/Tese/code/remote-sensing-image-captioning/experiments/encoder/inputs')
# base name shared by data files
data_name = 'flickr8k'

# Model parameters
emb_dim = 300  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5


# sets device for model and PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device", device)
# set to true only if inputs to model are fixed size; otherwise lot of computational overhead
cudnn.benchmark = True

# Training parameters
start_epoch = 0
# number of epochs to train for (if early stopping is not triggered)
epochs = 100
# keeps track of number of epochs since there's been an improvement in validation BLEU
epochs_since_improvement = 0
batch_size = 32
workers = 1  # for data-loading; right now, only 1 works with h5py
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
grad_clip = 5.  # clip gradients at an absolute value of
alpha_c = 1.  # regularization parameter for 'doubly stochastic attention', as in the paper
best_bleu4 = 0.  # BLEU-4 score right now
print_freq = 100  # print training/validation stats every __ batches
fine_tune_encoder = False  # fine-tune encoder?
checkpoint = None  # path to checkpoint, None if none

# Loader over the raw training images; yields (images, dataset indexes) for the datastore
train_retrieval_loader = torch.utils.data.DataLoader(
    TrainRetrievalDataset(data_folder, data_name),
    batch_size=batch_size, shuffle=True, num_workers=workers)#, pin_memory=True)

# Loader whose batches are used as retrieval queries
train_loader = torch.utils.data.DataLoader(
        FeaturesDataset(data_folder, data_name, 'TRAIN'),
        batch_size=batch_size, shuffle=True, num_workers=workers)#, pin_memory=True)

encoder = Encoder()
encoder.fine_tune(fine_tune_encoder)
# The image batches are moved to `device`, so the encoder must live there too.
encoder = encoder.to(device)
# Eval mode makes BatchNorm use its running statistics; in train mode the embedding of
# an image would depend on which other images share its batch, so the vectors stored
# in the datastore and the query vectors would not be comparable.
encoder.eval()
encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                     lr=encoder_lr) if fine_tune_encoder else None

neighbors = []
with torch.no_grad():
    # 2048 is the size of the last (channel) dimension of the encoder output.
    image_retrieval = ImageRetrieval(2048, encoder, train_retrieval_loader, device)

    for i, batch in enumerate(train_loader):
        # NOTE(review): FeaturesDataset is defined outside this notebook. This assumes a
        # batch is either the image tensor itself or a tuple/list whose first element is
        # the image tensor - confirm against src.configs.datasets.
        imgs = batch[0] if isinstance(batch, (list, tuple)) else batch
        imgs = imgs.to(device)

        # Forward prop.
        imgs = encoder(imgs)
        # (batch, ..., features) -> (batch, positions, features), then average over positions
        imgs = imgs.view(imgs.size()[0], -1, imgs.size()[-1])
        input_imgs = imgs.mean(dim=1)
        nearest_imgs = image_retrieval.retrieve_nearest_for_train_query(input_imgs.cpu().numpy())
        neighbors.append(nearest_imgs)

device cpu


FileNotFoundError: [Errno 2] No such file or directory: '/home/starksultana/Documentos/MEIC/5o_ano/Tese/code/remote-sensing-image-captioning/experiments/encoder/inputs/TRAIN_IMGPATHS_flickr8k.json'