In [4]:
# -nc (no-clobber): skip the download when flickr8k.zip is already present, so
# re-running the notebook does not create flickr8k.zip.1, flickr8k.zip.2, ...
!wget -nc https://cs.stanford.edu/people/karpathy/deepimagesent/flickr8k.zip

--2021-11-17 19:49:27--  https://cs.stanford.edu/people/karpathy/deepimagesent/flickr8k.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 50286882 (48M) [application/zip]
Saving to: ‘flickr8k.zip’


2021-11-17 19:49:31 (12.9 MB/s) - ‘flickr8k.zip’ saved [50286882/50286882]



In [5]:
# -o: overwrite existing files without prompting; a plain `unzip` blocks on an
# interactive "replace?" question when the cell is run a second time.
!unzip -o flickr8k.zip

Archive:  flickr8k.zip
  inflating: flickr8k/dataset.json   
  inflating: flickr8k/readme.txt     
  inflating: flickr8k/vgg_feats.mat  


In [6]:
# -p: succeed silently when the directory already exists (safe to re-run).
!mkdir -p train
!mkdir -p val

In [2]:
import torch
import numpy as np
import torchvision.models.vgg as models
import torchvision.transforms as transforms
from PIL import Image
import json
import pickle
import scipy.io

### Preprocess

In [7]:


dataset_path = 'flickr8k'

# Images: precomputed features stored one column per image (indexed as
# images[:, image_id] below).
image_data = scipy.io.loadmat(dataset_path + '/vgg_feats.mat')
images = image_data['feats']

# Captions: one metadata entry per image.
# NOTE(review): entry i is assumed to describe feature column i -- confirm
# against the dataset readme.
with open(dataset_path + '/dataset.json') as f:
    caption_data = json.load(f)['images']

S = images.shape[1]   # total number of images
T = int(S * 0.9)      # the first 90% of the images form the training split
V = S - T             # the remaining images form the validation split

# Take the feature width from the data instead of hard-coding 4096.
feat_dim = images.shape[0]

train_imgs = np.zeros((T, feat_dim))
train_caps = []
train_names = []

val_imgs = np.zeros((V, feat_dim))
val_caps = []
val_names = []

for image_id in range(S):
    # Metadata is always looked up with the ABSOLUTE image index. Only the row
    # inside val_imgs is offset by T. (Indexing caption_data with
    # `image_id - T` paired validation features with the captions/filenames of
    # the first V training images.)
    entry = caption_data[image_id]
    # Exactly 5 captions per image are taken; explicit indexing fails loudly if
    # an entry has fewer, keeping the 5-captions-per-image layout that the
    # training loop relies on.
    sentences = [entry['sentences'][i]['raw'] for i in range(5)]

    if image_id < T:
        train_imgs[image_id] = images[:, image_id]
        train_names.append(entry['filename'])
        train_caps.extend(sentences)
    else:
        val_imgs[image_id - T] = images[:, image_id]
        val_names.append(entry['filename'])
        val_caps.extend(sentences)

print('Train Images shape:', train_imgs.shape, 'Train Length of captions:', len(train_caps))
print('Val Images shape:', val_imgs.shape, 'Val Length of captions:', len(val_caps))

# Saving Train (the train/ directory must already exist)
with open('train/image_features.pkl', 'wb') as f:
    pickle.dump(train_imgs, f)
with open('train/captions.pkl', 'wb') as f:
    pickle.dump(train_caps, f)
with open('train/file_names.pkl', 'wb') as f:
    pickle.dump(train_names, f)

# Saving Val (the val/ directory must already exist)
with open('val/image_features.pkl', 'wb') as f:
    pickle.dump(val_imgs, f)
with open('val/captions.pkl', 'wb') as f:
    pickle.dump(val_caps, f)
with open('val/file_names.pkl', 'wb') as f:
    pickle.dump(val_names, f)

print('Saved')


Train Images shape: (7200, 4096) Train Length of captions: 36000
Train Images shape: (800, 4096) Train Length of captions: 4000
Saved


### Utilities

In [18]:
import torch
import numpy as np
import string
from torch import nn


def tokenize(text):
    """Turn a caption string into a list of normalized word tokens.

    The text is split on whitespace; every piece is lower-cased and stripped
    of ASCII punctuation. Pieces that end up shorter than two characters or
    that contain anything other than letters are discarded.

    text: caption string
    returns: list of lower-case alphabetic tokens, in their original order
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    tokens = []
    for piece in text.split():
        word = piece.lower().translate(strip_punct)
        # keep only words of 2+ characters made purely of letters
        if len(word) > 1 and word.isalpha():
            tokens.append(word)
    return tokens


def build_dictionary(text):
    """
    Build a word -> index mapping from a collection of captions.

    Each caption is passed through tokenize(); the resulting words are ordered
    by decreasing frequency and numbered starting at 2, because indices 0 and
    1 are reserved for <eos> and <unk>.

    text: iterable of caption strings
    returns: dict mapping each word to its integer index (>= 2)
    """
    counts = {}
    for sentence in text:
        for token in tokenize(sentence):
            counts[token] = counts.get(token, 0) + 1

    vocab = list(counts)
    frequencies = [counts[token] for token in vocab]
    # ascending argsort reversed -> most frequent word first
    order = np.argsort(frequencies)[::-1]

    # offset by 2: slot 0 is <eos>, slot 1 is <unk>
    return {vocab[position]: rank + 2 for rank, position in enumerate(order)}

def get_hot(cap, worddict, device='cuda', tokenizer=None):
    """Encode a caption as a sequence of one-hot rows followed by an <eos> row.

    cap: caption string
    worddict: word -> index mapping as produced by build_dictionary (indices
        start at 2; column 0 is <eos>, column 1 is <unk>)
    device: device the returned tensor is placed on. Defaults to 'cuda', which
        matches the previous hard-coded behaviour.
    tokenizer: callable mapping the caption string to a list of tokens. When
        None, the module-level tokenize() is used so that the caption is
        normalized exactly like the text the dictionary was built from.
        (Splitting the raw caption on whitespace, as was done before, sent
        every capitalized or punctuated word to <unk>.)

    returns: float32 tensor of shape (num_tokens + 1, len(worddict) + 2)
    """
    if tokenizer is None:
        tokenizer = tokenize
    tokens = tokenizer(cap)

    # Allocate float32 directly instead of building float64 and casting later.
    x = np.zeros((len(tokens) + 1, len(worddict) + 2), dtype=np.float32)

    for r, w in enumerate(tokens):
        # Out-of-vocabulary tokens fall back to column 1 (<unk>).
        x[r, worddict.get(w, 1)] = 1

    # The final row marks end-of-sentence (column 0).
    x[len(tokens), 0] = 1

    return torch.from_numpy(x).to(device)


### Model

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

# Image Encoder
class ImageEncoder(nn.Module):
    """Maps an image feature vector into the common embedding space.

    A single linear layer followed by an element-wise absolute value, so every
    coordinate of the output is non-negative.
    """

    def __init__(self, EMBEDDING_SIZE, COMMON_SIZE):
        """
        EMBEDDING_SIZE: size of the incoming image feature vector
        COMMON_SIZE: size of the produced embedding
        """
        super().__init__()
        self.linear = nn.Linear(in_features=EMBEDDING_SIZE, out_features=COMMON_SIZE)

    def forward(self, x):
        """Project x with the linear layer and return its absolute value."""
        projected = self.linear(x)
        return torch.abs(projected)

class SentencesEncoder(nn.Module):
    """Encodes one sentence, given as a sequence of per-word vectors.

    Every word vector goes through a linear embedding layer, the embedded
    sequence is fed to a GRU as a batch of size 1, and the absolute value of
    the GRU's final hidden state is returned as a single row.
    """

    def __init__(self, VOCAB_SIZE, WORD_EMBEDDING_SIZE, COMMON_SIZE):
        """
        VOCAB_SIZE: width of each input word vector
        WORD_EMBEDDING_SIZE: size of the per-word embedding
        COMMON_SIZE: GRU hidden size, i.e. the sentence embedding size
        """
        super().__init__()
        self.embed = nn.Linear(VOCAB_SIZE, WORD_EMBEDDING_SIZE)
        self.encoder = nn.GRU(WORD_EMBEDDING_SIZE, COMMON_SIZE)

    def forward(self, x):
        """x: 2-D tensor with one row per word; returns a (1, -1)-shaped tensor."""
        embedded = self.embed(x)
        steps, width = embedded.shape[0], embedded.shape[1]
        # insert a batch axis of size 1: (steps, width) -> (steps, 1, width)
        _, final_hidden = self.encoder(embedded.reshape(steps, 1, width))
        return torch.abs(final_hidden.reshape(1, -1))

### Training

In [30]:
dataset_path = 'flickr8k'


def _load_normalized_features(path):
    """Load a pickled 2-D feature array and return it as an L2-row-normalized float32 tensor.

    path: pickle file holding a NumPy array with one feature vector per row
        (as written by the preprocessing cell).
    returns: torch.float32 tensor of the same shape with every row scaled to
        unit L2 norm.

    The array is converted to a tensor once and normalized with
    torch.nn.functional.normalize, instead of dividing a NumPy array by a
    tensor and relying on reflected-operator dispatch. normalize() also clamps
    the norm with a small epsilon, so an all-zero row stays zero rather than
    turning into NaN.
    """
    # Only load pickles produced by this notebook: pickle can execute code.
    with open(path, 'rb') as f:
        feats = pickle.load(f)
    feats = torch.from_numpy(feats.astype(np.float32))
    return torch.nn.functional.normalize(feats, p=2, dim=1)


# Loading the dataset
print('Loading the train dataset')
train_images = _load_normalized_features('train/image_features.pkl')

with open('train/captions.pkl', 'rb') as f:
    train_caps = pickle.load(f)

print('Images shape:', train_images.shape, 'Length of captions:', len(train_caps))

print('Loading the val dataset')
val_images = _load_normalized_features('val/image_features.pkl')

with open('val/captions.pkl', 'rb') as f:
    val_caps = pickle.load(f)

print('Val Images shape:', val_images.shape, 'Length of Val captions:', len(val_caps))

# Creating dictionary and saving. The vocabulary is built from the training
# captions only.
print('Creating the word dictionary')
worddict = build_dictionary(train_caps)
with open('worddict.pkl', 'wb') as f:
    pickle.dump(worddict, f)
print('Dictionary size:', len(worddict))

Loading the train dataset
Images shape: torch.Size([7200, 4096]) Length of captions: 36000
Loading the val dataset
Val Images shape: torch.Size([800, 4096]) Length of Val captions: 4000
Creating the word dictionary
Dictionary size: 8267


In [31]:
def Score(caps, imgs):
    """Row-wise score: the negated squared norm of the positive part of (caps - imgs).

    For each row, only the coordinates where caps exceeds imgs contribute.
    The score is 0 when caps <= imgs everywhere and grows more negative with
    the size of the excess.

    caps, imgs: tensors of identical shape (rows, dim)
    returns: tensor of shape (rows,), on the same device as the inputs

    The result stays on the inputs' device. The earlier version built a zero
    tensor on the CPU, copied it to CUDA on every call and forced the result
    onto CUDA; for CUDA inputs the returned values are unchanged.
    """
    excess = torch.clamp(caps - imgs, min=0)
    return -torch.sum(excess ** 2, dim=1)

def triplet_loss_img(anchor, positive, negative, margin):
    """Hinge triplet loss with an image embedding as the anchor.

    anchor: image embeddings, shape (rows, dim)
    positive: matching caption embeddings, same shape
    negative: non-matching caption embeddings, same shape
    margin: amount by which the positive score should exceed the negative one

    returns: scalar tensor, sum over rows of
        max(0, margin - Score(positive, anchor) + Score(negative, anchor))
    """
    ps = Score(positive, anchor)
    pn = Score(negative, anchor)
    # clamp replaces torch.max against a freshly allocated CUDA zero tensor:
    # same values, no per-call allocation, and no hard-coded device.
    return torch.sum(torch.clamp(margin - ps + pn, min=0))

def triplet_loss_cap(anchor, positive, negative, margin):
    """Hinge triplet loss with a caption embedding as the anchor.

    anchor: caption embeddings, shape (rows, dim)
    positive: matching image embeddings, same shape
    negative: non-matching image embeddings, same shape
    margin: amount by which the positive score should exceed the negative one

    returns: scalar tensor, sum over rows of
        max(0, margin - Score(anchor, positive) + Score(anchor, negative))
    """
    ps = Score(anchor, positive)
    pn = Score(anchor, negative)
    # clamp replaces torch.max against a freshly allocated CUDA zero tensor:
    # same values, no per-call allocation, and no hard-coded device.
    return torch.sum(torch.clamp(margin - ps + pn, min=0))

In [40]:
import torch
from torch import nn
import numpy as np
from torch import optim
import torchvision.models.vgg as models
import torchvision.transforms as transforms
from PIL import Image
import pickle
import json

# Hyperparameters
# -- optimisation
max_epochs = 10     # number of passes of the training loop
batch_size = 256    # images (and sampled captions) per training step
lrate = 0.001       # learning rate handed to Adam
margin = 0.05       # margin used by the triplet losses
# -- model sizes
dim_image = 4096    # input size of the image encoder
dim_word = 300      # word embedding size inside the sentence encoder
dim = 1024          # size of the common embedding space

# Instantiate both encoders on the GPU. The vocabulary gets two extra slots
# for the reserved indices 0 (<eos>) and 1 (<unk>).
ImgEncoder = ImageEncoder(dim_image, dim).cuda()
SentenceEncoder = SentencesEncoder(len(worddict)+2, dim_word, dim).cuda()
print('Models loaded')

# One Adam optimizer updates the parameters of both encoders jointly.
optimizer = optim.Adam(
    [*ImgEncoder.parameters(), *SentenceEncoder.parameters()],
    lr=lrate,
)
print('Loaded Adam optimizer')

Models loaded
Loaded Adam optimizer


In [35]:
# Training
# For every epoch: iterate over full batches of images, sample one of the 5
# captions of each image, embed both modalities, and minimise a bidirectional
# triplet loss in which every other item of the batch acts as a negative.
# After each epoch the mean rank of the correct image is measured on the
# validation captions.
print('Training begins')
epochLoss = []
meanRank = []
for epoch in range(max_epochs):

    print('Epoch:', epoch)
    totalLoss = 0

    for batch_index in range(0, train_images.shape[0], batch_size):

        # Skip only a trailing PARTIAL batch. With `>=` a last batch that ends
        # exactly at the dataset boundary was dropped as well.
        if batch_index + batch_size > train_images.shape[0]:
            break

        # Data preproc: captions are stored 5 per image, in image order, so
        # image i of the batch owns all5_caps[5*i : 5*i + 5].
        curr_ims = train_images[batch_index:batch_index+batch_size]
        all5_caps = train_caps[5*batch_index:5*(batch_index+batch_size)]
        curr_caps = [all5_caps[5*i + np.random.randint(0, 5)] for i in range(batch_size)]
        one_hot_caps = [get_hot(cap, worddict) for cap in curr_caps]

        # Encoding (sentences are encoded one at a time)
        encoded_ims = ImgEncoder(curr_ims.cuda())
        encoded_caps = torch.stack(
            [SentenceEncoder(hot) for hot in one_hot_caps]
        ).reshape(batch_size, dim)

        # Real training
        optimizer.zero_grad()

        # Calculating Loss
        loss = 0
        for i in range(batch_size):
            # Image as anchor: its own caption against all other captions
            anchor = encoded_ims[i:i+1].repeat(batch_size - 1, 1)
            positive = encoded_caps[i:i+1].repeat(batch_size - 1, 1)
            negative = torch.cat((encoded_caps[:i], encoded_caps[i+1:]), 0)
            loss += triplet_loss_img(anchor, positive, negative, margin)

            # Caption as anchor: its own image against all other images
            anchor = encoded_caps[i:i+1].repeat(batch_size - 1, 1)
            positive = encoded_ims[i:i+1].repeat(batch_size - 1, 1)
            negative = torch.cat((encoded_ims[:i], encoded_ims[i+1:]), 0)
            loss += triplet_loss_cap(anchor, positive, negative, margin)

        # Logging
        totalLoss += loss.item()
        print('Samples seen: ' + str(batch_index+batch_size) +  '/' + str(train_images.shape[0]), 'loss:', loss.item())

        # Updating weights
        loss.backward()
        optimizer.step()

    # Logging for early stopping
    print('Training loss:', totalLoss)
    epochLoss.append(totalLoss)

    # Ranks on the validation set. No gradients are needed here; without
    # no_grad() an autograd graph was recorded for every caption/image pair.
    r = []
    with torch.no_grad():
        encoded_val_ims = ImgEncoder(val_images.cuda())
        for i in range(len(val_caps)):
            hot = get_hot(val_caps[i], worddict)
            encoded_val_cap = SentenceEncoder(hot).repeat(val_images.shape[0], 1)
            # Named val_scores so the global S (dataset size) is not overwritten.
            val_scores = Score(encoded_val_cap, encoded_val_ims)
            # Image indices ordered from highest to lowest score
            ranks = val_scores.argsort().cpu().numpy()[::-1]
            # Caption i belongs to image i // 5; rank is 1-based
            r.append(np.where(ranks==i//5)[0][0] + 1)

    mean_rank = np.mean(np.array(r))
    print('Mean rank on val set: ' + str(mean_rank) + '/' + str(val_images.shape[0]))
    meanRank.append(mean_rank)

# Saving models
print("Training Completed!")
print('Loss over epochs')
print(epochLoss)
print('Mean rank over epochs')
print(meanRank)
torch.save(ImgEncoder.state_dict(), 'ImgEncoder.pt')
torch.save(SentenceEncoder.state_dict(), 'SentenceEncoder.pt')

Training begins
Epoch: 0
Samples seen: 256/7200 loss: 3746.064697265625
Samples seen: 512/7200 loss: 3405.545654296875
Samples seen: 768/7200 loss: 3528.529541015625
Samples seen: 1024/7200 loss: 3465.2353515625
Samples seen: 1280/7200 loss: 3222.9375
Samples seen: 1536/7200 loss: 3286.033447265625
Samples seen: 1792/7200 loss: 3470.75732421875
Samples seen: 2048/7200 loss: 3384.11474609375
Samples seen: 2304/7200 loss: 3164.72412109375
Samples seen: 2560/7200 loss: 3148.33251953125
Samples seen: 2816/7200 loss: 3005.796142578125
Samples seen: 3072/7200 loss: 3014.385009765625
Samples seen: 3328/7200 loss: 2919.254638671875
Samples seen: 3584/7200 loss: 2896.220703125
Samples seen: 3840/7200 loss: 2408.854248046875
Samples seen: 4096/7200 loss: 2736.608642578125
Samples seen: 4352/7200 loss: 2708.688720703125
Samples seen: 4608/7200 loss: 2889.524658203125
Samples seen: 4864/7200 loss: 2555.721435546875
Samples seen: 5120/7200 loss: 3056.409423828125
Samples seen: 5376/7200 loss: 2668.

### Load model

In [None]:
# Restore the vocabulary saved by the training section.
# Only load pickles produced by this notebook: pickle can execute code.
with open('worddict.pkl', 'rb') as f:
    worddict = pickle.load(f)
print('Loaded dictionary')

# Loading trained models.
# The encoders must be built from the CLASSES (ImageEncoder / SentencesEncoder).
# Calling the instance names ImgEncoder(...) / SentenceEncoder(...) raises a
# NameError on a fresh kernel, and otherwise invokes forward() on the old
# instances with the wrong arguments.
ImgEncoder = ImageEncoder(dim_image, dim).cuda()
ImgEncoder.load_state_dict(torch.load('ImgEncoder.pt'))
SentenceEncoder = SentencesEncoder(len(worddict)+2, dim_word, dim).cuda()
SentenceEncoder.load_state_dict(torch.load('SentenceEncoder.pt'))
print('Models loaded')