In [1]:
from google.colab import drive

# Mount Google Drive; later cells read the dataset pickle from under this path.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)
# Optional: copy the model definition out of Drive into the working directory.
# !cp /content/gdrive/MyDrive/PPlusPlus/models/word_model.py /content/word_model.py

Mounted at /content/gdrive


In [None]:
import torch.nn as nn
import torch
import time

# Model hyper-parameters shared by every cell that builds a DecoderRNN.
embed_size, hidden_size = 256, 512
num_layers = 1
vocab_size = 4987
# https://www.youtube.com/watch?v=y2BaTt1fxJU
# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

class DecoderRNN(nn.Module):
  """LSTM decoder that predicts the next token id from a feature vector and the current token.

  At every time step the LSTM input is the feature vector concatenated with
  the embedding of the current token, which is why the LSTM is built with an
  input size of ``embed_size * 2`` (the feature vector therefore has to be
  ``embed_size`` wide).  Sequences are processed unbatched: inputs are 2-D
  ``(seq_len, embed_size * 2)`` tensors.
  """

  def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
    """Set the hyper-parameters and build the layers.

    Args:
      embed_size: width of the token embedding (and of the feature vector).
      hidden_size: width of the LSTM hidden state.
      vocab_size: number of token ids; size of the embedding table and of the output layer.
      num_layers: number of stacked LSTM layers.
    """
    super(DecoderRNN, self).__init__()
    self.embed = nn.Embedding(vocab_size, embed_size)
    # LSTM input = [feature ; token embedding], hence embed_size * 2.
    self.lstm = nn.LSTM(embed_size*2, hidden_size, num_layers, batch_first=True)
    self.linear = nn.Linear(hidden_size, vocab_size)
    self.init_weights()

  def init_weights(self):
    """Initialize embedding/output weights uniformly in [-0.1, 0.1] and zero the output bias."""
    self.embed.weight.data.uniform_(-0.1, 0.1)
    self.linear.weight.data.uniform_(-0.1, 0.1)
    self.linear.bias.data.fill_(0)

  def forward(self, features, captions):
    """Return next-token log-probabilities for every position of one caption.

    Args:
      features: feature vector of shape (1, embed_size) or (embed_size,);
        it is repeated so that every time step sees the same vector.
      captions: 1-D LongTensor of token ids, shape (seq_len,).  Any length
        is accepted (the sequence length is no longer fixed at 12).

    Returns:
      Tensor of shape (seq_len, vocab_size) holding log-softmax scores.
    """
    embeddings = self.embed(captions)  # (seq_len, embed_size)
    seq_len = captions.size(0)
    # Keep everything on the device the model lives on instead of relying
    # on a notebook-level `device` global.
    features = features.to(embeddings.device)
    lstm_input = torch.cat((features.repeat(seq_len, 1), embeddings), 1)  # (seq_len, 2*embed_size)
    # (h_0, c_0) default to zeros when not provided.
    output, _ = self.lstm(lstm_input)  # (seq_len, hidden_size)
    predictions = self.linear(output)  # (seq_len, vocab_size)
    # Normalize over the vocabulary axis (the last one).
    return torch.nn.functional.log_softmax(predictions, dim=-1)

  def sample(self, features, states=None, max_len=20, start_id=1):
    """Greedily decode token ids for one feature vector.

    Each step feeds ``[feature ; embedding(previous token)]`` to the LSTM,
    mirroring the input layout used by ``forward``.

    Args:
      features: feature vector of shape (1, embed_size) or (embed_size,).
      states: optional ``(h, c)`` tuple for an unbatched LSTM input, each of
        shape (num_layers, hidden_size); zeros are used when omitted.
      max_len: number of tokens to generate (must be at least 1).
      start_id: id of the token fed at the first step.  The default of 1
        matches the first id of the captions printed in this notebook --
        TODO confirm against the vocabulary.

    Returns:
      1-D LongTensor of shape (max_len,) with the chosen token ids.
    """
    model_device = self.embed.weight.device
    features = features.reshape(1, -1).to(model_device)
    predicted = torch.full((1,), start_id, dtype=torch.long, device=model_device)
    sampled_ids = []
    for _ in range(max_len):
      step_input = torch.cat((features, self.embed(predicted)), 1)  # (1, 2*embed_size)
      hiddens, states = self.lstm(step_input, states)               # (1, hidden_size)
      outputs = self.linear(hiddens)                                # (1, vocab_size)
      predicted = outputs.max(1)[1]                                 # (1,) index of the best score
      sampled_ids.append(predicted)
    return torch.cat(sampled_ids, 0)                                # (max_len,)

  def init_hidden(self):
    """Return a zero tensor of shape (1, hidden_size) sized from this model's own LSTM."""
    return torch.zeros(1, self.lstm.hidden_size)


def train(features, captions):
  """Run one optimization step on a single (features, captions) pair.

  Uses the notebook-level `lstm`, `criterion` and `optimizer`.  The output
  at position t is scored against the token at position t+1, so the last
  output and the first token are dropped.  Returns the loss as a float.
  """
  optimizer.zero_grad()
  log_probs = lstm(features, captions)
  # Shift by one: outputs[0..n-2] are compared with tokens[1..n-1].
  next_tokens = captions[1:]
  loss = criterion(log_probs[:-1], next_tokens)
  loss.backward()
  optimizer.step()
  return loss.item()

Define the dataset and dataloader, following [this video](https://www.youtube.com/watch?v=PXOzkkB5eH0).

In [None]:
from torch.utils.data import Dataset, DataLoader
import pickle

class ModelDataset(Dataset):
  """Dataset backed by a pickled, indexable collection of (feature, caption) records."""

  def __init__(self, file_path):
    """Unpickle all records from `file_path` and remember how many there are.

    NOTE: pickle.load can execute arbitrary code -- only open trusted files.
    """
    with open(file_path, 'rb') as handle:
      records = pickle.load(handle)
    self.data = records
    self.n_samples = len(records)

  def __len__(self):
    """Number of records that were loaded."""
    return self.n_samples

  def __getitem__(self, index):
    """Return the first two fields of record `index` as a (feature, caption) tuple."""
    record = self.data[index]
    return record[0], record[1]

# Load the pickled (feature, caption) pairs from the mounted Drive and
# iterate over them one sample at a time, in shuffled order.
mydataset = ModelDataset(
  "/content/gdrive/MyDrive/PPlusPlus/vg_data/vg_feat_cap_0.pkl"
)
dataloader = DataLoader(mydataset, batch_size=1, shuffle=True, num_workers=0)


In [None]:
# Peek at the first two samples produced by the dataloader.
# In the recorded output feature[0] has shape (1, 256) and caption[0] is a
# length-12 id tensor, e.g.
#   tensor([ 1, 10,  6, 11,  4, 12,  2,  0,  0,  0,  0,  0])
i = 0
for i, (feature, caption) in enumerate(dataloader, start=1):
  print(feature[0].shape, "\n", caption[0], "\n")
  if i >= 2:
    break

torch.Size([1, 256]) 
 tensor([  1,  25, 406, 751,  55,   2,   0,   0,   0,   0,   0,   0]) 

torch.Size([1, 256]) 
 tensor([   1,    4, 1063,    6,    7,   55,    2,    0,    0,    0,    0,    0]) 



Continue training the model with `train()`, starting from epoch 26.

In [None]:
# Resume training: rebuild the decoder, restore the saved weights, then train
# for n_epoch more epochs and save the resulting state dict.
lstm = DecoderRNN(embed_size=embed_size, hidden_size=hidden_size, 
                  vocab_size=vocab_size, num_layers=num_layers).to(device)
# map_location lets a checkpoint that was saved on a GPU be restored on
# whatever `device` resolved to (including a CPU-only runtime).
lstm.load_state_dict(torch.load('/content/epoch25', map_location=device))
# Training mode (the former eval() call was immediately overridden by train()).
lstm.train()
# NOTE(review): the summed loss also counts positions whose target id is 0,
# which looks like padding in the printed captions -- confirm whether
# ignore_index=0 is wanted here.
criterion = nn.NLLLoss(reduction='sum')
# learning rate
learning_rate = 0.0005
# optimizer
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)
# training parameters
start_epoch = 26  # first epoch number of this run
n_epoch = 25
all_losses = []
total_loss = 0 

start = time.time()
print("epochs\tloss\t\t\ttime(s)")
for epoch in range(start_epoch, start_epoch + n_epoch):
  for feature, captions in dataloader:
    # The model moves `feature` next to its own tensors; only the token ids
    # have to be moved here.
    captions = captions.to(device)
    # batch_size is 1, so index 0 unwraps the single sample of the batch.
    loss = train(feature[0], captions[0])
    total_loss += loss

  # One row per epoch: epoch number, summed loss, wall-clock seconds.
  print(epoch, "\t", total_loss, "\t", time.time() - start)
  start = time.time()
  total_loss = 0

torch.save(lstm.state_dict(), '/content/vg_word_decoder_50.pkl')  
# vg-decoder-5-3000.pkl, 6252kb

epochs	loss			time(s)
26 	 196406.3999903676 	 177.18793272972107


In [None]:
# Train a freshly initialized decoder (no pre-trained weights are loaded).
lstm = DecoderRNN(
  embed_size=embed_size,
  hidden_size=hidden_size,
  vocab_size=vocab_size,
  num_layers=num_layers,
)
lstm = lstm.to(device)

# Loss and optimizer settings.
learning_rate = 0.0005
criterion = nn.NLLLoss(reduction='sum')
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

# Loop bookkeeping.
n_epoch = 25
all_losses = []
total_loss = 0

start = time.time()
print("epochs\tloss\t\t\ttime(s)")
for epoch in range(1, n_epoch + 1):
  for feature, captions in dataloader:
    # Only the token ids are moved explicitly; batch_size is 1, so index 0
    # unwraps the single sample of each batch.
    captions = captions.to(device)
    loss = train(feature[0], captions[0])
    total_loss += loss

  # One row per epoch: epoch number, summed loss, wall-clock seconds.
  print(epoch, "\t", total_loss, "\t", time.time() - start)
  start, total_loss = time.time(), 0

# Persist only the weights (state dict); saving the whole module with
# torch.save(lstm, PATH) was tried as well (test_save.py: 22.65mb).
torch.save(lstm.state_dict(), '/content/vg_word_decoder_25.pkl')

epochs	loss			time(s)
1 	 738013.8100529313 	 182.07085394859314
2 	 615120.0524843037 	 178.58732175827026
3 	 566254.86066176 	 174.16399240493774
4 	 523824.8683729768 	 173.6789710521698
5 	 488836.4067925513 	 173.39287161827087


# NOTES


In [None]:
import torch
# Sanity check: repeating a (1, 256) tensor 12 times along dim 0 gives (12, 256).
feat = torch.zeros(1, 256)
feat.repeat(12, 1).shape

torch.Size([12, 256])