## **Generating Captions for images using RNNs and CNNs**
### Data Set: Flickr8k - 8091 Training Samples
### Predictions:  Recursive Generation, Using the start symbol, the next word is predicted. In the next pass, uses both the start symbol and the word earlier predicted to predict the next word in the sentence until the end sequence is obtained.
### Model - VCG model for extracting the image features, then passed through subsequent Dense, Dropout and 2 layers of  LSTM. 
### Trained for 15 epochs. Can be trained furthermore, to obtain better results. 
### Uses categorical cross entropy as the loss function
### To see the results, change the directory in the code, and then upload the image, you want to generate the captions for(Note that the program won't work properly if the image comes from a different distribution). And call extract feature and generate_desc functions with the necessary params, as specified in the function call.


In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical,plot_model
from keras.layers import Dense,Dropout,LSTM,GRU,Input,add,Embedding

Using TensorFlow backend.


In [0]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import os
import cv2

In [0]:
data_dir = 'drive/Image Captioner/'

In [0]:
from keras.preprocessing.image import load_img,img_to_array

## No need to run these statements. Result is in a pickle file already

#### Load the Images

In [0]:
def load_images(directory):
    images = dict()
    for img in os.listdir(directory):
        filename = directory + img
        image = load_img(filename,target_size=(224,224))
        image = img_to_array(image)
        image = image.reshape((1,image.shape[0],image.shape[1],image.shape[2]))
        name = img.split('.')[0]
        images[name] = image
    return images

In [0]:
directory = data_dir + 'Flicker8k_Dataset/'

In [0]:
image_data = load_images(directory=directory)

In [0]:
type(image_data)

dict

In [0]:
print('the length of the image file is {}'.format(len(image_data)))

the length of the image file is 8091


### Defining the VGG Model for obtaining features from Images

In [0]:
from keras.models import Model,load_model
from keras.layers import Dense,Dropout
from keras.applications import VGG16
from pickle import dump,load

In [0]:
model = VGG16()
model.layers.pop() # Removes the last activation layer
model = Model(inputs = model.inputs,outputs = model.layers[-1].output)

In [0]:
features = dict()
for key,image in image_data.items():
    feature = model.predict(image,verbose = 1)
    features[key] = feature


In [0]:
from pickle import dump
dump(features, open('features.pkl', 'wb'))

## Begin From Here

### Preprocessing the Text Data

In [0]:
data = data_dir + 'Flickr8k_text/Flickr8k.token.txt'

In [0]:
file = open(data,'r')

In [0]:
doc = file.read()

### The document consists of the image id followed by the no.of the image followed by the text description. There are 5 descriptions per image id

### Dictionary to hold the image id as the key and the list of descriptions as its values

In [0]:
descriptions = dict()

In [0]:
i = 0
for line in doc.split('\n'):
    # split line by white space
    tokens = line.split()
    #print(tokens)
    #Extract the image id, to use it as key for the dictionary. Image Desc will be joined together to create a list
    image_id, image_desc = tokens[0], tokens[1:]
    #print(image_id)
    #print(image_desc)
    # extract filename from image id
    image_id = image_id.split('.')[0]

    # convert description tokens back to string
    image_desc = ' '.join(image_desc)
    if image_id not in descriptions:
        descriptions[image_id] = list()
    descriptions[image_id].append(image_desc)
    
    

In [0]:
len(descriptions)

In [0]:
descriptions['101654506_8eb26cfb60']

### We have 5 lines of description for each image file. We need to clean the descriptions now

In [0]:
import string
## We need to remove the punctuation marks first
table = str.maketrans('','',string.punctuation)

In [0]:
for key,desc_list in descriptions.items():
    for i in range(len(desc_list)):
        desc = desc_list[i]
        # Tokenize each single word in the list
        desc = desc.split()
        # Convert each word to its lower case format
        desc = [word.lower() for word in desc]
        # Remove punctuation from each word in the list
        desc = [word.translate(table) for word in desc]
        # Remove the tokens which have length = 1
        desc = [word for word in desc if len(word) > 1]
        # Remove tokens with numbers inside them
        desc = [word for word in desc if word.isalpha()]
        # Store it as a string
        desc_list[i] = ' '.join(desc)
        

In [0]:
descriptions['101654506_8eb26cfb60']

['brown and white dog is running through the snow',
 'dog is running in the snow',
 'dog running through snow',
 'white and brown dog is running through snow covered field',
 'the white and brown dog is running over the surface of the snow']

#### Creating a vocabulary of all the unique words present in the descriptions

In [0]:
vocab = set()
for key in descriptions.keys():
    [vocab.update(d.split()) for d in descriptions[key]]

In [0]:
len(vocab)

8763

In [0]:
# There are around 8763 unique words in our vocab

In [0]:
def save_desc(descriptions,filename):
    lines = list()
    for key,desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '-' + desc)
    data = '\n'.join(lines)
    file = open(filename,'w')
    file.write(data)
    file.close()

In [0]:
save_desc(descriptions,'descriptions.txt')

## Load Data

In [0]:
def load_doc(filename):
    file = open(filename,'r')
    text = file.read()
    file.close()
    return text

In [0]:
def load_set(filename):
    doc = load_doc(filename)
    last = doc[-1]
    dataset = list()
    # Process the data line by line
    for line in doc.split('\n'):
        if len(line) >= 1: # If it is not the last line
            identifier = line.split('.')[0]
            dataset.append(identifier)
    return set(dataset)
        

In [0]:
def load_cleaned_desc(filename,dataset):
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split('-')
        image_id,image_desc = tokens[0],tokens[1:]
        
        if image_id in dataset:
            if image_id not in descriptions:
                descriptions[image_id] = list()
            desc = 'start ' + ' '.join(image_desc) + ' end'
            descriptions[image_id].append(desc)
    return descriptions

In [0]:
from pickle import load
def load_image_features(filename,dataset):
    all_features = load(open(filename,'rb'))
    features = {k:all_features[k] for k in dataset}
    return features

In [0]:
filename = data_dir + 'Flickr8k_text/Flickr_8k.trainImages.txt'

In [0]:
train = load_set(filename)

In [0]:
print('The length of the dataset is {}'.format(len(train)))

The length of the dataset is 6000


In [0]:
### Load the descriptions

In [0]:
desc = load_cleaned_desc('descriptions.txt',train)

In [0]:
print('Descriptions Length - {}'.format(len(desc)))

Descriptions Length - 6000


In [0]:
images = load_image_features(data_dir + 'features.pkl',train)

In [0]:
print('Images length - {}'.format(len(images)))

Images length - 6000


In [0]:
id = '1022454332_6af2c1449a'

In [0]:
type(desc)

dict

### So, now we have a training data of 6000 images and captions

### Prepare the Data for the embedding layer

#### First, we need to convert words into integer tokens. Then, from the tokens, we have to create the sequences that would be fed into the model

In [0]:
# Create a list of all captions

In [0]:
def to_lines(descriptions):
    all_desc = list()
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [0]:
# Fit a tokenizer to this list
def create_tokenizer(descriptions):
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [0]:
tokenizer = create_tokenizer(desc)

In [0]:
vocab_size = len(tokenizer.word_index) + 1

In [0]:
vocab_size

7577

In [0]:
# Next, we have to create sequences of fixed length

In [0]:
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo):
	X1, X2, y = list(), list(), list()
	# walk through each description for the image
	for desc in desc_list:
		# encode the sequence
		seq = tokenizer.texts_to_sequences([desc])[0]
		# split one sequence into multiple X,y pairs
		for i in range(1, len(seq)):
			# split into input and output pair
			in_seq, out_seq = seq[:i], seq[i]
			# pad input sequence
			in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
			# encode output sequence
			out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
			# store
			X1.append(photo)
			X2.append(in_seq)
			y.append(out_seq)
	return np.array(X1), np.array(X2), np.array(y)

In [0]:
### Generator fn to progressively create sequences

In [0]:
len(descriptions)

8092

In [0]:
def data_generator(descriptions, photos, tokenizer, max_length):
    # loop for ever over images
    while True:
        for key, desc_list in descriptions.items():
            # retrieve the photo feature
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
            yield [[in_img, in_seq], out_word] 

In [0]:
### Define the max length to be passed onto the create_sequences function

In [0]:
# calculate the length of the description with the most words
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

In [0]:
max_length = max_length(descriptions)

### Define the model

In [0]:
def def_model(vocab_size,max_length):
    # Extract features from the VGG Model
    inputs1 = Input(shape = (4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256,activation = 'relu')(fe1)
   
    
    # Sequence Model
    inputs2 = Input(shape = (max_length,))
    seq1 = Embedding(vocab_size,256,mask_zero = True)(inputs2)
    seq2 = Dropout(0.5)(seq1)
    seq3 = LSTM(256,return_sequences=True)(seq2)
    seq4 = Dropout(0.5)(seq3)
    seq5 = LSTM(256)(seq4)
    
    
    
    # Decoder model: Take the inputs from the images, and the sequences
    decoder1 = add([fe2,seq5])
    decoder2 = Dense(256,activation = 'relu')(decoder1)
    outputs = Dense(vocab_size,activation = 'softmax')(decoder2)
    
    # Combined Model
    m = Model(inputs = [inputs1,inputs2],outputs = outputs)
    m.compile(loss = 'categorical_crossentropy',optimizer = 'adam')
    
    # Plot the model
    #plot_model(model,to_file = 'model.png',show_shapes = True)
    return m
    

#### Define the checkpoint callback

In [0]:
from tensorflow.python.keras.callbacks import ModelCheckpoint

In [0]:
epochs = 10
steps = len(desc)
model = def_model(vocab_size,max_length)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
for i in range(epochs):
  print('Epoch number ',i+1)
  generator = data_generator(desc,images,tokenizer,max_length)
  model.fit_generator(generator,epochs = 1,steps_per_epoch = steps,verbose = 1)
  model.save(data_dir + 'model_' + str(i) + '.h5')

Epoch number  1
Instructions for updating:
Use tf.cast instead.
Epoch 1/1
Epoch number  2
Epoch 1/1
Epoch number  3
Epoch 1/1
Epoch number  4
Epoch 1/1
Epoch number  5
Epoch 1/1
Epoch number  6
Epoch 1/1
Epoch number  7
Epoch 1/1
Epoch number  8
Epoch 1/1
Epoch number  9
Epoch 1/1
Epoch number  10
Epoch 1/1


**Evaluate the Model**

---



In [0]:
# Function to map an integer to a word

In [0]:
def word_for_id(num, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == num:
			return word
	return None

### Training it for 5 more epochs

In [0]:
m = load_model(data_dir + 'model_9.h5')

In [0]:
for i in range(5):
  print('Epoch number ',i+11)
  generator = data_generator(desc,images,tokenizer,max_length)
  m.fit_generator(generator,epochs = 1,steps_per_epoch = steps,verbose = 1)
  m.save(data_dir + 'model_' + str(i + 10) + '.h5')
  

Epoch number  11
Epoch 1/1
Epoch number  12
Epoch 1/1
Epoch number  13
Epoch 1/1
Epoch number  14
Epoch 1/1
Epoch number  15
Epoch 1/1


**Function to recursively generate descriptions for a given image**

---



In [0]:
def generate_desc(model, tokenizer, photo, max_length):
	# seed the generation process
	in_text = 'start'
	# iterate over the whole length of the sequence
	for i in range(max_length):
		# integer encode input sequence
		sequence = tokenizer.texts_to_sequences([in_text])[0]
		# pad input
		sequence = pad_sequences([sequence], maxlen=max_length)
		# predict next word
		yhat = model.predict([photo,sequence], verbose=0)
		# convert probability to integer
		yhat = np.argmax(yhat)
		# map integer to word
		word = word_for_id(yhat, tokenizer)
		# stop if we cannot map the word
		if word is None:
			break
		# append as input for generating the next word
		in_text += ' ' + word
		# stop if we predict the end of the sequence
		if word == 'end':
			break
	return in_text
    
    
    

## Evaluate the Model using Bleu Score

In [0]:
from nltk.translate.bleu_score import corpus_bleu
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
	actual, predicted = list(), list()
	# step over the whole set
	for key, desc_list in descriptions.items():
		# generate description
		yhat = generate_desc(model, tokenizer, photos[key], max_length)
		# store actual and predicted
		references = [d.split() for d in desc_list]
		actual.append(references)
		predicted.append(yhat.split())
	# calculate BLEU score
	print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
	print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
	print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
	print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
    

In [0]:
filename = data_dir + 'Flickr8k_text/Flickr_8k.testImages.txt'

In [0]:
test_data = load_set(filename)
print('Length of the dataset is {}'.format(len(test_data)))

test_desc = load_cleaned_desc('descriptions.txt',test_data)
print('Length of the descriptions is {}'.format(len(test_desc)))

test_features = load_image_features(data_dir + 'features.pkl',test_data)
print('Length of the images is {}'.format(len(test_features)))



Length of the dataset is 1000
Length of the descriptions is 1000
Length of the images is 1000


#### Load the model

In [0]:
filename = data_dir + 'model_14.h5'
model = load_model(filename)

In [0]:
evaluate_model(model,test_desc,test_features,tokenizer,max_length)

BLEU-1: 0.537706
BLEU-2: 0.278727
BLEU-3: 0.176075
BLEU-4: 0.072834


## Generate New captions

In [0]:
from keras.applications.vgg16 import preprocess_input
def extract_feature(filename):
  model = VGG16()
  model.layers.pop()
  model = Model(inputs = model.inputs,outputs = model.layers[-1].output)
  img = load_img(filename,target_size=(224,224))
  img = img_to_array(img)
  img = img.reshape((1,img.shape[0],img.shape[1],img.shape[2]))
  img = preprocess_input(img)
  feature = model.predict(img)
  return [img,feature]

In [0]:
tokenizer = create_tokenizer(desc)

In [0]:
dump(tokenizer,open('tokenizer.pkl','wb'))

In [0]:
tokenizer = load(open('tokenizer.pkl','rb'))

In [0]:
img,photo_dog = extract_feature(data_dir + 'sample.jpg')
print(generate_desc(model,tokenizer,photo_dog,max_length))

start dog is running through the grass end


## Alternative Model - Yet to train. Uses GRU instead of LSTM 

In [0]:
def def_model(vocab_size,max_length):
    # Extract features from the VGG Model
    inputs1 = Input(shape = (4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256,activation = 'relu')(fe1)
   
    
    # Sequence Model
    inputs2 = Input(shape = (max_length,))
    seq1 = Embedding(vocab_size,256,mask_zero = True)(inputs2)
    seq2 = Dropout(0.5)(seq1)
    seq3 = GRU(256,return_sequences=True)(seq2)
    seq4 = Dropout(0.5)(seq3)
    seq5 = GRU(256)(seq4)
    
    
    
    # Decoder model: Take the inputs from the images, and the sequences
    decoder1 = add([fe2,seq5])
    decoder2 = Dense(256,activation = 'relu')(decoder1)
    outputs = Dense(vocab_size,activation = 'softmax')(decoder2)
    
    # Combined Model
    m = Model(inputs = [inputs1,inputs2],outputs = outputs)
    m.compile(loss = 'categorical_crossentropy',optimizer = 'adam')
    
    # Plot the model
    #plot_model(model,to_file = 'model.png',show_shapes = True)
    return m