- Read the region annotations from a local JSON file and convert each caption into a vector of vocabulary indices.
- Fetch each picture from its URL and extract features with the encoder model provided by the author.

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
# !pwd  # /content  # import vocab
!cp /content/gdrive/MyDrive/PPlusPlus/utils/build_vocab.py /content/build_vocab.py
import torch.nn as nn
import torchvision.models as models
from torch.autograd import Variable
import torch
from torchvision import transforms
import json
from PIL import Image
import requests
from build_vocab import Vocabulary
import pickle
import time
import nltk
from nltk.collections import Counter
nltk.download('punkt')

Mounted at /content/gdrive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Model sizes. embed_size is the output width of EncoderCNN below; hidden_size and
# num_layers are not used in the cells shown here (presumably decoder settings — TODO confirm).
embed_size, hidden_size, num_layers = 256, 512, 1
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


class EncoderCNN(nn.Module):
  """Image encoder: pretrained ResNet-152 backbone followed by a Linear + BatchNorm1d head.

  No gradient ever reaches the backbone: its activations are computed without
  autograd tracking, so only ``linear`` and ``bn`` can be trained through ``forward``.
  Attribute names (``resnet``, ``linear``, ``bn``) determine the state_dict keys and
  must stay as they are for saved checkpoints to load.
  """

  def __init__(self, embed_size):
    """Load the pretrained ResNet-152 and replace top fc layer.

    :param embed_size: width of the feature vector returned by ``forward``.
    """
    super(EncoderCNN, self).__init__()
    resnet = models.resnet152(weights='DEFAULT')
    modules = list(resnet.children())[:-1]      # delete the last fc layer.
    self.resnet = nn.Sequential(*modules)
    # Linear(in_features=2048, out_features=embed_size, bias=True)
    self.linear = nn.Linear(resnet.fc.in_features, embed_size)
    # BatchNorm1d(embed_size, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
    self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
    self.init_weights()

  def init_weights(self):
    """Initialize the head: weights ~ N(0, 0.02), bias = 0."""
    nn.init.normal_(self.linear.weight, mean=0.0, std=0.02)
    nn.init.zeros_(self.linear.bias)

  def forward(self, images):  # return from load_image() in utils/sample.py
    """Extract the image feature vectors.

    :param images: batch of image tensors accepted by the backbone.
    :return: tensor of shape (batch, embed_size).
    """
    # Run the backbone without recording an autograd graph. The result is the same
    # detached tensor the old ``Variable(features.data)`` produced, but the graph for
    # the whole ResNet is never built in the first place.
    with torch.no_grad():
      features = self.resnet(images)  # (batch, 2048, 1, 1) for ResNet-152
    features = features.flatten(1)  # (batch, 2048)
    return self.bn(self.linear(features))  # (batch, embed_size)
    

*Yield* training pairs (region feature vector, caption index vector)

In [None]:
def getTrainingPair(first_region=41000, last_region=41005):
  """
  Yield training items in the desired input format (vectors of indices).

  Walks the region annotations, downloads every image whose URL answers with
  HTTP 200, crops each annotated region, resizes it to 224x224 and encodes it
  with the pre-trained CNN.

  Regions are numbered 1, 2, 3, ... in iteration order; regions of images that
  could not be downloaded are not counted.

  :param first_region: number of the first region to yield; earlier regions are
    counted but skipped. Defaults to 41000, the value previously hard-coded here.
  :param last_region: number of the last region to yield (inclusive), or None to
    run to the end of the annotation file. Defaults to 41005, the value
    previously hard-coded here.
  :return: generator of (features, captions)
  features: (1, embed_size) tensor after the pre-trained CNN; computed under
    torch.no_grad(), so it carries no autograd graph
  captions: 1-D tensor of 12 indices, to be used in embeddings = self.embed(captions):
    <start>, at most 10 word indices, <end>, then <pad> up to length 12
  """
  max_tokens = 10  # captions are truncated / padded to this many word slots

  cnn = EncoderCNN(embed_size)
  cnn.eval()
  # self.encoder.load_state_dict(torch.load(self.encoder_path, map_location={'cuda:0': 'cpu'}))
  cnn.load_state_dict(torch.load('/content/gdrive/MyDrive/PPlusPlus/models/vg-encoder-5-3000.pkl',
                                 map_location={'cuda:0': 'cpu'}))
  use_cuda = torch.cuda.is_available()
  if use_cuda:
    cnn.cuda()
  transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                        (0.229, 0.224, 0.225))])
  # Context manager so the annotation file handle is closed once parsed.
  with open('/content/gdrive/MyDrive/PPlusPlus/vg_data/region_descriptions.json', 'r') as f:
    json_data = json.load(f)
  # NOTE: pickle executes code on load; only point this at a trusted local file.
  with open("/content/gdrive/MyDrive/PPlusPlus/vg_data/vocab_small.pkl", 'rb') as f:
    vocab = pickle.load(f)  # a Vocabulary() from utils/build_vocab.py

  word2idx = vocab.word2idx
  start_id, end_id = word2idx["<start>"], word2idx["<end>"]
  pad_id, unk_id = word2idx["<pad>"], word2idx["<unk>"]

  counter = 0
  for each_dict in json_data:
    # Stop before downloading another image once the window is exhausted.
    if last_region is not None and counter >= last_region:
      return
    # https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg
    file_url = "https://cs.stanford.edu/people/rak248/VG_100K_2/" + str(each_dict["id"]) + ".jpg"
    # One request per image: the same response is used for the status check and
    # for the pixel data, and the connection is released when the block exits.
    with requests.get(file_url, stream=True) as response:
      if response.status_code != 200:  # only if url is accessible
        continue
      # Convert BEFORE cropping: a grayscale/RGBA/palette image would otherwise
      # reach Normalize with the wrong number of channels on its first region.
      # convert() also forces the image data to be read while the response is open.
      im = Image.open(response.raw).convert('RGB')

    # each im contains a list of regions
    for each_region in each_dict['regions']:
      counter += 1
      if counter % 10000 == 0:
        print(counter)
      if counter < first_region:
        continue
      if last_region is not None and counter > last_region:
        return
      # {"region_id": int, "width": int, "height": int, "image_id": int, "phrase": str, "y": int, "x": int}
      left = each_region["x"]
      top = each_region["y"]
      right = left + each_region["width"]
      bottom = top + each_region["height"]
      im1 = im.crop((left, top, right, bottom)).resize((224, 224))
      # transform (1, 3, 224, 224)
      im1 = transform(im1).unsqueeze(0)
      if use_cuda:
        im1 = im1.cuda()
      # Inference only: do not keep an autograd graph alive for every stored feature.
      with torch.no_grad():
        features = cnn(im1)  # (1, embed_size)

      # read captions, get index
      tokens = nltk.tokenize.word_tokenize(each_region['phrase'].lower())[:max_tokens]
      captions = [start_id]
      captions.extend(word2idx[word] if word in word2idx else unk_id for word in tokens)
      captions.append(end_id)
      # padding: every caption ends up with max_tokens + 2 entries
      captions.extend([pad_id] * (max_tokens - len(tokens)))
      yield features, torch.tensor(captions)

# for feature, captions in getTrainingPair():
#   print(feature.shape)
#   print(feature, '\n', captions)
#   break

In [None]:
# Smoke test: pull every pair the generator yields and print the feature shape
# followed by the caption index tensor.
for feature, captions in getTrainingPair():
  print(feature.shape, '\n', captions)
  # break

In [5]:
# test/eval data 4k with features, caps in tensor, caps in text, file_url
def getTestData(first_region=40001, last_region=None):
  """
  Yield test/eval items in the desired input format (vectors of indices).

  Walks the region annotations, downloads every image whose URL answers with
  HTTP 200, skips images whose mode is not 'RGB', crops each annotated region,
  resizes it to 224x224 and encodes it with the pre-trained CNN.

  Regions are numbered 1, 2, 3, ... in iteration order; regions of images that
  were not downloaded or were skipped are not counted.

  :param first_region: number of the first region to yield; earlier regions are
    counted but skipped. Defaults to 40001, the value previously hard-coded here.
  :param last_region: number of the last region to yield (inclusive), or None
    (default) to run to the end of the annotation file.
  :return: generator of (features, captions, phrase, file_url), one item per region
  features: (1, embed_size) tensor after the pre-trained CNN; computed under
    torch.no_grad(), so it carries no autograd graph
  captions: 1-D tensor of 12 indices, to be used in embeddings = self.embed(captions):
    <start>, at most 10 word indices, <end>, then <pad> up to length 12
  phrase: captions as strings
  file_url: link to the file
  """
  max_tokens = 10  # captions are truncated / padded to this many word slots

  cnn = EncoderCNN(embed_size)
  cnn.eval()
  # self.encoder.load_state_dict(torch.load(self.encoder_path, map_location={'cuda:0': 'cpu'}))
  cnn.load_state_dict(torch.load('/content/gdrive/MyDrive/PPlusPlus/models/vg-encoder-5-3000.pkl',
                                 map_location={'cuda:0': 'cpu'}))
  use_cuda = torch.cuda.is_available()
  if use_cuda:
    cnn.cuda()
  transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                        (0.229, 0.224, 0.225))])
  # Context manager so the annotation file handle is closed once parsed.
  with open('/content/gdrive/MyDrive/PPlusPlus/vg_data/region_descriptions.json', 'r') as f:
    json_data = json.load(f)
  # NOTE: pickle executes code on load; only point this at a trusted local file.
  with open("/content/gdrive/MyDrive/PPlusPlus/vg_data/vocab_small.pkl", 'rb') as f:
    vocab = pickle.load(f)  # a Vocabulary() from utils/build_vocab.py

  word2idx = vocab.word2idx
  start_id, end_id = word2idx["<start>"], word2idx["<end>"]
  pad_id, unk_id = word2idx["<pad>"], word2idx["<unk>"]

  counter = 0
  for each_dict in json_data:
    # Stop before downloading another image once the requested range is exhausted.
    if last_region is not None and counter >= last_region:
      return
    # https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg
    file_url = "https://cs.stanford.edu/people/rak248/VG_100K_2/" + str(each_dict["id"]) + ".jpg"
    # One request per image: the same response is used for the status check and
    # for the pixel data, and the connection is released when the block exits.
    with requests.get(file_url, stream=True) as response:
      if response.status_code != 200:  # only if url is accessible
        continue
      im = Image.open(response.raw)
      if im.mode != 'RGB':  # non-RGB images are skipped, not converted
        continue
      im.load()  # read the pixel data while the response is still open

    # each im contains a list of regions
    for each_region in each_dict['regions']:
      counter += 1
      if counter < first_region:
        if counter % 10000 == 0:
          print('skipping ', str(counter))
        continue
      if last_region is not None and counter > last_region:
        return
      # {"region_id": int, "width": int, "height": int, "image_id": int, "phrase": str, "y": int, "x": int}
      left = each_region["x"]
      top = each_region["y"]
      right = left + each_region["width"]
      bottom = top + each_region["height"]
      im1 = im.crop((left, top, right, bottom)).resize((224, 224))
      # transform (1, 3, 224, 224)
      im1 = transform(im1).unsqueeze(0)
      if use_cuda:
        im1 = im1.cuda()
      # Inference only: do not keep an autograd graph alive for every stored feature.
      with torch.no_grad():
        features = cnn(im1)  # (1, embed_size)

      # read captions, get index
      phrase = each_region['phrase']
      tokens = nltk.tokenize.word_tokenize(phrase.lower())[:max_tokens]
      captions = [start_id]
      captions.extend(word2idx[word] if word in word2idx else unk_id for word in tokens)
      captions.append(end_id)
      # padding: every caption ends up with max_tokens + 2 entries
      captions.extend([pad_id] * (max_tokens - len(tokens)))
      yield features, torch.tensor(captions), phrase, file_url


In [6]:
# Dump the test/eval items produced by getTestData() to disk, 4000 items per file.
# Each item is the tuple yielded by getTestData(): (features, captions, phrase, file_url).
batch_temp = []  # batch container
counter = 0
start = time.time()  # never reset, so the printed minutes are cumulative
file_index = 0
for item in getTestData():
  batch_temp.append(item)
  counter += 1
  #########
  # if counter > 3: break
  #########
  if counter % 4000 == 0:  # save every 4000 items in one pkl
    filename = 'vg_test_' + str(file_index) + ".pkl"
    file_index += 1
    with open(filename, 'wb') as fp:
      torch.save(batch_temp, fp)
    batch_temp = []  # reset the container
    print(str(counter), "\t", str((time.time() - start)/60), "\t", filename)
  elif counter % 2000 == 0:  # time tracking halfway through each file
    print(counter, "\t", str((time.time() - start)/60))

if len(batch_temp) != 0:  # save the last items, fewer than 4000
  print(str(len(batch_temp)) + " pairs remaining")
  filename = 'vg_test_' + str(file_index) + ".pkl"
  with open(filename, 'wb') as fp:
    torch.save(batch_temp, fp)

print(str(counter), " pairs in total")
print('finished in ', str((time.time() - start)/60))

skipping  10000
skipping  20000
skipping  30000
skipping  40000
2000 	 8.910920055707296
4000 	 10.13270030816396 	 vg_test_0.pkl
6000 	 11.314764889081319
8000 	 12.504759899775188 	 vg_test_1.pkl
10000 	 13.700434291362763
12000 	 14.887558635075887 	 vg_test_2.pkl
14000 	 16.077542901039124
16000 	 17.270712522665658 	 vg_test_3.pkl
18000 	 18.440325804551442
20000 	 19.63114091157913 	 vg_test_4.pkl
22000 	 20.797709925969443
24000 	 21.97004839579264 	 vg_test_5.pkl
26000 	 23.298837800820667
28000 	 24.51781755288442 	 vg_test_6.pkl
30000 	 25.685112857818602
32000 	 26.87875454823176 	 vg_test_7.pkl
34000 	 28.046668362617492
36000 	 29.20089542865753 	 vg_test_8.pkl
38000 	 30.375420423348746
40000 	 31.5208336631457 	 vg_test_9.pkl


KeyboardInterrupt: ignored

|counter|time|file_name|
| ----- | -- | ------- |


In [8]:
# Reload one saved test chunk and print its first four records as a sanity check.
with open('/content/vg_test_6.pkl', 'rb') as f:
  data = torch.load(f, map_location=device)

counter = 0
for features, captions, phrase, file_url in data:
  # One field per line: feature shape, caption indices, raw phrase, image URL.
  print(features.shape, captions, phrase, file_url, sep='\n')
  counter += 1
  if counter >= 4:
    break

torch.Size([1, 256])
tensor([  1,  39, 643,  27,   4, 384,   2,   0,   0,   0,   0,   0],
       device='cuda:0')
white plate on the table 
https://cs.stanford.edu/people/rak248/VG_100K_2/1342.jpg
torch.Size([1, 256])
tensor([   1,  221,  100, 1377,    8,   72,    2,    0,    0,    0,    0,    0],
       device='cuda:0')
Glass with ice in it 
https://cs.stanford.edu/people/rak248/VG_100K_2/1342.jpg
torch.Size([1, 256])
tensor([   1,  365,   14,   25, 2852,    2,    0,    0,    0,    0,    0,    0],
       device='cuda:0')
woman wearing a apron 
https://cs.stanford.edu/people/rak248/VG_100K_2/1342.jpg
torch.Size([1, 256])
tensor([   1, 1932,   29,  892,   27,    4,  384,    2,    0,    0,    0,    0],
       device='cuda:0')
bouquet of flowers on the table 
https://cs.stanford.edu/people/rak248/VG_100K_2/1342.jpg


Pickle the (feature, caption) pairs in chunks of 40k for training

In [None]:
# Dump the (feature, captions) training pairs to disk, 40k pairs per pickle file,
# and print the elapsed minutes every 10k pairs.
PAIRS_PER_FILE = 40000  # pairs stored in one pkl
LOG_EVERY = 10000       # progress / timing interval

batch_temp = []  # batch container
counter = 0
last_time = time.time()  # reset after every progress line: minutes are per interval
file_index = 0
for feature, captions in getTrainingPair():
  batch_temp.append((feature, captions))
  counter += 1
  # The save test must use the larger modulus. With both branches testing 10000,
  # the timing branch below could never run and a file was written every 10k pairs.
  if counter % PAIRS_PER_FILE == 0:  # save every 40k in one pkl
    filename = 'vg_feat_cap_' + str(file_index) + ".pkl"
    file_index += 1
    with open(filename, 'wb') as fp:
      torch.save(batch_temp, fp)
    batch_temp = []  # reset the container
    print(str(counter), "\t", str((time.time() - last_time)/60), "\t", filename)
    last_time = time.time()
  elif counter % LOG_EVERY == 0:  # time tracking
    print(counter, "\t", str((time.time() - last_time)/60))
    last_time = time.time()

if len(batch_temp) != 0:  # save the last bits that is fewer than 40k
  print(str(len(batch_temp)) + " pairs remaining")
  filename = 'vg_feat_cap_' + str(file_index) + ".pkl"
  with open(filename, 'wb') as fp:
    torch.save(batch_temp, fp)

print(str(counter), " pairs in total")


10000 	 5.704886988798777
20000 	 5.222293734550476
30000 	 5.351386781533559
40000 	 5.18360896507899 	 vg_feat_cap_0.pkl


RuntimeError: ignored

| ks | 10 | 20 | 30 | 40 |
| -- | -- | -- | -- | -- |
|mins|5.70|5.22|5.35|5.18|


7 mins/10k, 30 mins/dump

In [None]:
# #To load from pickle file
# data = []
# with open(filename, 'rb') as fr:
#   try:
#     while True:
#       data.append(pickle.load(fr))
#   except EOFError:
#       pass

# print(data)

# Reload the first training chunk and show the tensor shapes of its first pair.
with open('/content/vg_feat_cap_0.pkl', 'rb') as f:
  data = torch.load(f, map_location=device)
# The chunk is the list written by torch.save above, so a one-element slice
# visits exactly the first (feature, captions) pair, if there is one.
for i, j in data[:1]:
  print(i.shape, j.shape)

torch.Size([1, 256]) torch.Size([12])
torch.Size([1, 256]) torch.Size([12])
torch.Size([1, 256]) torch.Size([12])
torch.Size([1, 256]) torch.Size([12])
