In [1]:
import string
import numpy as np
import os
from PIL import Image
from pickle import dump, load
from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout, Input

# small library for seeing the progress of loops.
from tqdm import tqdm_notebook as tqdm
# tqdm().pandas()

Using TensorFlow backend.


In [2]:
# Loading a text file into memory
def load_doc(filename):
    # opening the file as read only
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

# get all images with their captions
def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions = {}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [caption]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

# Data cleaning - lower casing, removing punctuations and words containing numbers
def cleaning_text(captions):
    table = str.maketrans('', '', string.punctuation)
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):
            
            img_caption.replace('-', ' ')
            # splitting sentence to words
            desc = img_caption.split()
            
            # converts to lower case
            desc = [word.lower() for word in desc]
            
            # remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            
            # removing hanging 's and a
            desc = [word for word in desc if(len(word)>1)]
            
            # remove tokens with numbers in them
            desc = [word for word in desc if(word.isalpha())]
            
            #convert back to string
            img_caption = ' '.join(desc)
            captions[img][i] = img_caption
    return captions

# build vocabulary of all unique words
def text_vocabulary(descriptions):
    vocab = set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

# all descriptions in one file
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [3]:
dataset_text = "D:\Practise\python\ml\Projects\Personel\Image Caption Generation\Flickr8k_text"
dataset_images = "D:\Practise\python\ml\Projects\Personel\Image Caption Generation\Flicker8k_Dataset"

In [4]:
# preparing text data
filename = dataset_text + "/" + "Flickr8k.token.txt"

descriptions = all_img_captions(filename)
print('Length of descriptions: ', len(descriptions))

Length of descriptions:  8092


In [5]:
type(descriptions)

dict

In [6]:
i = 0
for key, value in descriptions.items():
    i += 1
    print('Key: ', key)
    print('Value: ', value)
    print()
    if i > 2:
        break

Key:  1000268201_693b08cb0e.jpg
Value:  ['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .']

Key:  1001773457_577c3a7d70.jpg
Value:  ['A black dog and a spotted dog are fighting', 'A black dog and a tri-colored dog playing with each other on the road .', 'A black dog and a white dog with brown spots are staring at each other in the street .', 'Two dogs of different breeds looking at each other on the road .', 'Two dogs on pavement moving toward each other .']

Key:  1002674143_1b742ab4b8.jpg
Value:  ['A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .', 'A little girl is sitting in front of a large painted rainbow .', 'A small girl in the grass plays with fingerpaints in front of a white canvas with a rainbow on i

In [7]:
#cleaning the descriptions
clean_descriptions = cleaning_text(descriptions)

In [8]:
print(len(clean_descriptions))
print(type(clean_descriptions))

8092
<class 'dict'>


In [9]:
i = 0
for key, value in clean_descriptions.items():
    i += 1
    print('Key: ', key)
    print('Value: ', value)
    print()
    if i > 2:
        break

Key:  1000268201_693b08cb0e.jpg
Value:  ['child in pink dress is climbing up set of stairs in an entry way', 'girl going into wooden building', 'little girl climbing into wooden playhouse', 'little girl climbing the stairs to her playhouse', 'little girl in pink dress going into wooden cabin']

Key:  1001773457_577c3a7d70.jpg
Value:  ['black dog and spotted dog are fighting', 'black dog and tricolored dog playing with each other on the road', 'black dog and white dog with brown spots are staring at each other in the street', 'two dogs of different breeds looking at each other on the road', 'two dogs on pavement moving toward each other']

Key:  1002674143_1b742ab4b8.jpg
Value:  ['little girl covered in paint sits in front of painted rainbow with her hands in bowl', 'little girl is sitting in front of large painted rainbow', 'small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it', 'there is girl with pigtails sitting in front of rainbow painting', '

In [10]:
#building vocabulary 
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))

Length of vocabulary =  8763


In [11]:
type(vocabulary)

set

In [12]:
i = 0
for x in vocabulary:
    i += 1
    print('Set Item: ', x)
    print()
    if i > 2:
        break

Set Item:  stunning

Set Item:  halfburied

Set Item:  planks



In [13]:
#saving each description to file 
save_descriptions(clean_descriptions, "descriptions.txt")

## Extracting the feature vector from all images

In [14]:
def extract_features(directory):
    model = Xception(include_top=False, pooling='avg')
    features = {}
    for img in os.listdir(directory):
        filename = directory + "/" + img
        image = Image.open(filename)
        image = image.resize((299,299))
        image = np.expand_dims(image, axis=0)
        image = image/127.5
        image = image - 1.0
        
        feature = model.predict(image)
        features[img] = feature
    return features

In [15]:
# # 2048 feature vector
# features = extract_features(dataset_images)
# dump(features, open('features.p', 'wb'))

Commented the above cell because training is taking place at the above cell.
It took 1 hour for me to train

In [16]:
# loading features 
features = load(open('features.p', 'rb'))

# Loading dataset for Training the model

In [17]:
# load the photos for training
def load_photos(filename):
    file = load_doc(filename)
    photos = file.split('\n')[:-1]
    return photos

# preparing training data(dictionary containing images(for images loaded above) and corresponding captions)
def load_clean_descriptions(filename, photos):
    # loading clean descriptions
    file = load_doc(filename)
    descriptions = {}
    
    for line in file.split('\n'):
        
        words = line.split()
        if len(words) < 1:
            continue
        
        image, image_caption = words[0], words[1:]
        
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            
            desc = '<start> ' + ' '.join(image_caption) + ' <end>'
            descriptions[image].append(desc)
    return descriptions

# loading features(trained above) for the images used in training
def load_features(photos):
    # loading all features
    all_features = load(open('features.p', 'rb'))
    # selecting only needed features
    features = {k:all_features[k] for k in photos}
    return features

In [18]:
filename = dataset_text + '/' + 'Flickr_8k.trainImages.txt'

train_imgs = load_photos(filename)

train_descriptions = load_clean_descriptions('descriptions.txt', train_imgs)

train_features = load_features(train_imgs)

In [19]:
print(type(train_imgs))
print(type(train_descriptions))
print(type(train_features))

<class 'list'>
<class 'dict'>
<class 'dict'>


# Tokenizing the vocabulary 

Computers don’t understand English words, for computers, we will have to represent them with numbers. So, we will map each word of the vocabulary with a unique index value. Keras library provides us with the tokenizer function that we will use to create tokens from our vocabulary and save them to a “tokenizer.p” pickle file.

In [20]:
#converting dictionary to clean list of descriptions
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

#creating tokenizer class 
#this will vectorise text corpus
#each integer will represent token in dictionary

from keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [21]:
# # give each word an index, and store that into tokenizer.p pickle file

tokenizer = create_tokenizer(train_descriptions)
# dump(tokenizer, open('tokenizer.p', 'wb'))

in above cell dump commented because we write only once

The tokenizer object has the following attributes:
 - word_counts --- named list mapping words to the number of times they appeared on during fit. ...
 - word_docs --- named list mapping words to the number of documents/texts they appeared on during fit. ...
 - word_index --- named list mapping words to their rank/index (int).

In [22]:
type(tokenizer.word_counts)

collections.OrderedDict

In [23]:
len(tokenizer.word_counts)

7576

In [24]:
ex = tokenizer.word_counts
i = 0
for key, value in ex.items():
    print(f'(Word: {key}, Count: {value}) ')
    i += 1
    if i>5:
        break

(Word: start, Count: 30007) 
(Word: child, Count: 1120) 
(Word: in, Count: 14085) 
(Word: pink, Count: 543) 
(Word: dress, Count: 260) 
(Word: is, Count: 6907) 


In [25]:
type(tokenizer.word_docs)

collections.defaultdict

In [26]:
len(tokenizer.word_docs)

7576

In [27]:
ex = tokenizer.word_docs
i = 0
for key, value in ex.items():
    print(f'(Word: {key}, Document: {value}) ')
    i += 1
    if i>5:
        break

(Word: set, Document: 81) 
(Word: dress, Document: 258) 
(Word: in, Document: 12334) 
(Word: pink, Document: 529) 
(Word: up, Document: 899) 
(Word: stairs, Document: 81) 


In [28]:
type(tokenizer.word_index)

dict

In [29]:
len(tokenizer.word_index)

7576

In [30]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7577

In [31]:
ex = tokenizer.word_index
i = 0
for key, value in ex.items():
    print(f'(Word: {key}, Rank: {value}) ')
    i += 1
    if i>5:
        break

(Word: end, Rank: 1) 
(Word: start, Rank: 2) 
(Word: in, Rank: 3) 
(Word: the, Rank: 4) 
(Word: on, Rank: 5) 
(Word: is, Rank: 6) 


Our vocabulary contains 7577 words.

We calculate the maximum length of the descriptions. This is important for deciding the model structure parameters.

In [32]:
type(descriptions)

dict

In [33]:
i = 0
for key, value in descriptions.items():
    print(key, value)
    print()
    i += 1
    if i>2:
        break

1000268201_693b08cb0e.jpg ['child in pink dress is climbing up set of stairs in an entry way', 'girl going into wooden building', 'little girl climbing into wooden playhouse', 'little girl climbing the stairs to her playhouse', 'little girl in pink dress going into wooden cabin']

1001773457_577c3a7d70.jpg ['black dog and spotted dog are fighting', 'black dog and tricolored dog playing with each other on the road', 'black dog and white dog with brown spots are staring at each other in the street', 'two dogs of different breeds looking at each other on the road', 'two dogs on pavement moving toward each other']

1002674143_1b742ab4b8.jpg ['little girl covered in paint sits in front of painted rainbow with her hands in bowl', 'little girl is sitting in front of large painted rainbow', 'small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it', 'there is girl with pigtails sitting in front of rainbow painting', 'young girl with pigtails painting outside 

In [34]:
#calculate maximum length of descriptions
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

In [35]:
max_length = max_length(descriptions)
max_length

32

Max_length of description is 32.

## Create Data generator

Let us first see how the input and output of our model will look like. To make this task into a supervised learning task, we have to provide input and output to the model for training. We have to train our model on 6000 images and each image will contain 2048 length feature vector and caption is also represented as numbers. This amount of data for 6000 images is not possible to hold into memory so we will be using a generator method that will yield batches.

The generator will yield the input and output sequence.

### for Understanding purpose

In [36]:
print(type(train_descriptions))
print(type(features))
print(type(tokenizer))
print(type(max_length))

<class 'dict'>
<class 'dict'>
<class 'keras_preprocessing.text.Tokenizer'>
<class 'int'>


In [37]:
for key, description_list in descriptions.items():
    feature = features[key][0]
    print(type(feature))
    print(len(feature))
    print(feature)
    if True:
        break

<class 'numpy.ndarray'>
2048
[0.4734093  0.01730903 0.07334246 ... 0.08557963 0.02102303 0.23765516]


In [38]:
 demo_list = ['child in pink dress is climbing up set of stairs in an entry way', 'girl going into wooden building', 'little girl climbing into wooden playhouse', 'little girl climbing the stairs to her playhouse', 'little girl in pink dress going into wooden cabin']

In [39]:
for demo in demo_list:
    print(demo)

child in pink dress is climbing up set of stairs in an entry way
girl going into wooden building
little girl climbing into wooden playhouse
little girl climbing the stairs to her playhouse
little girl in pink dress going into wooden cabin


In [40]:
seq = tokenizer.texts_to_sequences([demo_list][0])

seq

[[42, 3, 87, 169, 6, 117, 55, 393, 11, 394, 3, 27, 4472, 639],
 [18, 313, 64, 195, 118],
 [39, 18, 117, 64, 195, 2055],
 [39, 18, 117, 4, 394, 19, 60, 2055],
 [39, 18, 3, 87, 169, 313, 64, 195, 2913]]

In [41]:
demo_seq = tokenizer.texts_to_sequences([demo_list][0])[0]

print(demo_list[0])
print(demo_seq)

child in pink dress is climbing up set of stairs in an entry way
[42, 3, 87, 169, 6, 117, 55, 393, 11, 394, 3, 27, 4472, 639]


The above is giving the rank of the words in sequence

In [42]:
for i in range(1, len(demo_seq)):
    demo_in_seq, demo_out_seq = demo_seq[:i], demo_seq[i]
    print(demo_in_seq, demo_out_seq)

[42] 3
[42, 3] 87
[42, 3, 87] 169
[42, 3, 87, 169] 6
[42, 3, 87, 169, 6] 117
[42, 3, 87, 169, 6, 117] 55
[42, 3, 87, 169, 6, 117, 55] 393
[42, 3, 87, 169, 6, 117, 55, 393] 11
[42, 3, 87, 169, 6, 117, 55, 393, 11] 394
[42, 3, 87, 169, 6, 117, 55, 393, 11, 394] 3
[42, 3, 87, 169, 6, 117, 55, 393, 11, 394, 3] 27
[42, 3, 87, 169, 6, 117, 55, 393, 11, 394, 3, 27] 4472
[42, 3, 87, 169, 6, 117, 55, 393, 11, 394, 3, 27, 4472] 639


In [43]:
for i in range(1, len(demo_seq)):
    demo_in_seq, demo_out_seq = demo_seq[:i], demo_seq[i]
    demo_in_seq = pad_sequences([demo_in_seq], maxlen = 32)
    print(demo_in_seq)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0 42]]
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0 42  3]]
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0 42  3 87]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0  42   3  87 169]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0  42   3  87 169   6]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0  42   3  87 169   6 117]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0  42   3  87 169   6 117  55]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0  42   3  87 169   6 117  55 393]]
[[  0   0   0   0   0   

In [44]:
for i in range(1, len(demo_seq)):
    demo_in_seq, demo_out_seq = demo_seq[:i], demo_seq[i]
    demo_in_seq = pad_sequences([demo_in_seq], maxlen = 32)[0]
    print(demo_in_seq)

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0 42]
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0 42  3]
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0 42  3 87]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0  42   3  87 169]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  42   3  87 169   6]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0  42   3  87 169   6 117]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0  42   3  87 169   6 117  55]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0  42   3  87 169   6 117  55 393]
[  0   0   0   0   0   0   0   0   0   0   0   0

In [45]:
for i in range(1, len(demo_seq)):
    demo_in_seq, demo_out_seq = demo_seq[:i], demo_seq[i]
    demo_out_seq = to_categorical([demo_out_seq], num_classes = vocab_size)[0]
    print(demo_out_seq)
len(demo_out_seq)

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


7577

In [46]:
for i in range(1, len(demo_seq)):
    demo_in_seq, demo_out_seq = demo_seq[:i], demo_seq[i]
    demo_out_seq = to_categorical([demo_out_seq], num_classes = vocab_size)[0]
    print(demo_out_seq[:20])

[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


### end *for Understanding purpose

In [47]:
# create input-output sequence pairs from the image description.

#data generator, used by model.fit_generator()
def data_generator(descriptions, features, tokenizer, max_length):
    while 1:
        for key, desciption_list in descriptions.items():
            #retrieve photo features
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, desciption_list, feature)
            yield [[input_image, input_sequence], output_word]

def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            
            # store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [48]:
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length))

In [49]:
print(a.shape)
print(b.shape)
print(c.shape)

(47, 2048)
(47, 32)
(47, 7577)


## Defining the CNN-RNN model

To define the structure of the model, we will be using the Keras Model from Functional API. It will consist of three major parts:

#### Feature Extractor 
 - The feature extracted from the image has a size of 2048, with a dense layer, we will reduce the dimensions to 256 nodes.

#### Sequence Processor
 - An embedding layer will handle the textual input, followed by the LSTM layer.

#### Decoder
 - By merging the output from the above two layers, we will process by the dense layer to make the final prediction. The final layer will contain the number of nodes equal to our vocabulary size.

In [53]:
from keras.utils import plot_model

# define the captioning model
def define_model(vocab_size, max_length):
    
    # features from the CNN model squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    # Merging both models
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    # summarize model
    print(model.summary())
#     commented due to unsolved errors
#     plot_model(model, to_file='model.png', show_shapes=True)
    
    return model

# Training the model

In [54]:
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 7577
Description Length:  32


In [55]:
# model = define_model(vocab_size, max_length)
# epochs = 10
# steps = len(train_descriptions)

# # making a directory models to save our models
# os.mkdir('models')

# for i in range(epochs):
#     generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    
#     model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    
#     model.save('models/model_' + str(i) + '.h5')

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 32, 256)      1939712     input_4[0][0]                    
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 2048)         0           input_3[0][0]                    
____________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


commented above code because traing will take place in above cell...
It took for me nearly 4 hours to train the model.


Results:
    
    
    Epoch 1/1
6000/6000 [==============================] - 1131s 188ms/step - loss: 4.4979
Epoch 1/1
6000/6000 [==============================] - 1878s 313ms/step - loss: 3.6421
Epoch 1/1
6000/6000 [==============================] - 1138s 190ms/step - loss: 3.3466
Epoch 1/1
6000/6000 [==============================] - 1121s 187ms/step - loss: 3.1677
Epoch 1/1
6000/6000 [==============================] - 1487s 248ms/step - loss: 3.0482
Epoch 1/1
6000/6000 [==============================] - 1632s 272ms/step - loss: 2.9614
Epoch 1/1
6000/6000 [==============================] - 1580s 263ms/step - loss: 2.8925
Epoch 1/1
6000/6000 [==============================] - 1308s 218ms/step - loss: 2.8350
Epoch 1/1
6000/6000 [==============================] - 1205s 201ms/step - loss: 2.7864
Epoch 1/1
6000/6000 [==============================] - 1206s 201ms/step - loss: 2.750