#  Image Caption Generator CNN-RNN Model

The Image Caption Generator CNN-RNN model using CNN and LSTM
- CNN: used for extracting features from image
- LSTM: used to genreate description of image

In [21]:
## Imports
import string
import numpy as np
import pandas as pd
from PIL import Image
import os
from pickle import dump, load

from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.layers.merge import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

In [2]:
# small library for seeing the progress of loops.
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  from pandas import Panel


## Utility Functions

**load_doc( filename )** – For loading the document file and reading the contents inside the file into a string.

**all_img_captions( filename )** – This function will create a descriptions dictionary that maps images with a list of 5 captions. The descriptions dictionary will look something like this:

**cleaning_text( descriptions)** – This function takes all descriptions and performs data cleaning. This is an important step when we work with textual data, according to our goal, we decide what type of cleaning we want to perform on the text. In our case, we will be removing punctuations, converting all text to lowercase and removing words that contain numbers. So, a caption like “A man riding on a three-wheeled wheelchair” will be transformed into “man riding on three wheeled wheelchair”

**text_vocabulary( descriptions )** – This is a simple function that will separate all the unique words and create the vocabulary from all the descriptions.

**save_descriptions( descriptions, filename )** – This function will create a list of all the descriptions that have been preprocessed and store them into a file. We will create a descriptions.txt file to store all the captions.

In [3]:
'''
Loads a text file `filename` into memory
'''
def load_doc(filename):
    # Opening the file as read only
    file = open(filename, 'r')
    
    # extract all the text in the file to return
    text = file.read()
    file.close()
    return text

'''
Get all imgs with their captions and 
returns a dictionary of all caption from `filename`.
'''
def all_img_captions(filename):
    ## Extract teext from the file
    file = load_doc(filename)
    ## Split each caption to store as description dictionary
    captions = file.split('\n')
    descriptions ={}
    
    for caption in captions[:-1]:
        ## split an image from its caption
        img, caption = caption.split('\t')
        ## Map the image to the caption in descripions
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [ caption ]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

'''
Data cleaning- 
- lower casing
- removing puntuations
- words containing numbers
'''
def cleaning_text(captions):
    table = str.maketrans('','',string.punctuation)
    
    for img,caps in captions.items():
        for i,img_caption in enumerate(caps):

            img_caption.replace("-"," ")
            desc = img_caption.split()

            #converts to lowercase
            desc = [word.lower() for word in desc]
            
            #remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            
            #remove hanging 's and a 
            desc = [word for word in desc if(len(word)>1)]
            
            #remove tokens with numbers in them
            desc = [word for word in desc if(word.isalpha())]
            
            #convert back to string and store it back to its original caption
            img_caption = ' '.join(desc)
            captions[img][i]= img_caption
    return captions

'''
Separates all the unique words and 
create the vocabulary from all the descriptions.
`descriptions` result from all_img_captions(filename)
'''
def text_vocabulary(descriptions):
    # build vocabulary of all unique words
    vocab = set()

    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]

    return vocab


'''
Creates a list of all the descriptions that 
have been preprocessed and store them into a file.

All descriptions in one file 
'''
def save_descriptions(descriptions, filename):
    lines = list()
    
    ## Stringify entire descriptions to prepare to write to file
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    
    ## Write to and close file after writing to it
    file = open(filename,"w")
    file.write(data)
    file.close()

## Extracting and Cleaning Data

In [4]:
!ls data

!ls data/Flicker8k_Dataset > grep * .txt

[34mFlicker8k_Dataset[m[m    [34mFlickr8k_text[m[m
Flickr8k_Dataset.zip Flickr8k_text.zip
ls: .txt: No such file or directory


In [5]:
!ls dat

ls: dat: No such file or directory


In [6]:
## Specify paths for Flicker data
dataset_text = 'data/Flickr8k_text'
dataset_images = 'data/Flicker8k_Dataset'

## prepare text data
text_fn = dataset_text + "/" + "Flickr8k.token.txt"

## Load fn that contains all data and map into descriptions dictionary
descriptions = all_img_captions(text_fn)
print('Length of descripions: ', len(descriptions))


Length of descripions:  8092


In [7]:
## Clean descriptions
cleaned_descriptions = cleaning_text(descriptions)

## Building unique vocabulary 
vocabulary = text_vocabulary(cleaned_descriptions)
print("Length of vocabulary = ", len(vocabulary))

Length of vocabulary =  8763


In [8]:
## Save all descriptoins to the 'decriptions.txt' file
dest_fn = 'descriptions.txt'
save_descriptions(cleaned_descriptions, dest_fn)

## CNN: Extracting feature vector from all images

- Will take advatange of transfer learning so we can start from a pre-trained model
- Using Xception mode which was trained on `imageenet` with 1000 diffrenet classes to classify

`extract_features()` will extract features for all images and map image names with their respective feature array. This will theen dump features dictionary into 'features.p' pickle file


In [11]:
Xmodel = Xception(include_top=False, 
                  pooling='avg' )

In [18]:
def extract_features(directory):
    Xmodel = Xception( include_top=False, pooling='avg')
    
    features = {}
    for img in tqdm(os.listdir(directory)):
        fn = directory + "/" + img
        
        ## Open the image and resize to appropriatee size
        image = Image.open(fn)
        image = image.resize((299, 299))
        image = np.expand_dims(image, axis=0)
        
        image = image / 127.5
        image = image - 1.0
        
        ## predict and store efeature 
        feature = Xmodel.predict(image)
        features[img] = feature
    
    return features

In [19]:
## 2048 feature vector
features = extract_features(dataset_images)


HBox(children=(IntProgress(value=0, max=8091), HTML(value='')))




NameError: name 'features' is not defined

In [20]:
dump(features, open('features.p', 'wb'))

In [22]:
features = pd.read_pickle('features.p')

## Load dataset to train model

In `data/Flickr_8k_test/Flickr_8k.trainImages.txt`, there are 6000 image names that will be useed for training.

**load_photos( filename )** – This will load the text file in a string and will return the list of image names.

**load_clean_descriptions( filename, photos )** – This function will create a dictionary that contains captions for each photo from the list of photos. We also append the `<start>` and `<end>` identifier for each caption. We need this so that our LSTM model can identify the starting and ending of the caption.
    
**load_features(photos)** - This funciton will give us the dictionary for image names and their feature vector which we have previously extraccted from the Xception model.


In [12]:
'''
Load data from a file.
'''
def load_photos(fn):
    file = load_doc(fn)
    photos = file.split('\n')[:-1]
    return photos

'''
Creates a dictionary that contains captions for each photo from the list of photos. 
Appends a `start` and `end` tag for each caption so that LSTM will be able to denote.
'''
def load_clean_descriptions(fn, photos):
    ## Load cleaed_descriptoins 
    file = load_doc(fn)
    descriptions = {}
    for line in file.split('\n'):
        
        ## Extract the image and image caption that we found from CNN
        words = line.split()
        if len(words) < 1:
            continue
            
        image, image_caption = words[0], words[1:]
        
        ## Include the start and end tags as needeed
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + ' '.join(image_caption) + ' <ends>'
            descriptions[image].append(desc)
            
    return descriptions

'''
Give us the dictionary for image names and their feature vector
which we have previously extraccted from the Xception model.
'''
def load_features(photos):
    ## Load all features 
    all_features = load(open('features.p', 'rb'))
    
    ## select only the necessary features
    features = {k:all_features[k] for k in photos}
    
    return features

In [47]:
training_fn = dataset_text + "/" + "Flickr_8k.trainImages.txt"

train_imgs = load_photos(training_fn)
train_descriptions = load_clean_descriptions('descriptions.txt', train_imgs)
train_features = load_features(train_imgs)

## Tokenizing the Vocabulary

We need to represent each unique vocab word with a unique index value. This uses the Keras library's tokenizer function and save it as `tokenizer.p` pickle file.

In [48]:
'''
Converts dictionary to clean list of descriptions.
'''
def dict_to_list(descriptions):
    all_desc = []
    ## Collect all the values in the descriptions and append it to all_desc
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
        
    return all_desc

In [49]:
'''
Create a tokenizer class, which vectorizes text body.
Each integer maps to token in dictionary.
'''
def create_tokenizer(descriptions):
    ## collect the list of descriptions
    desc_list = dict_to_list(descriptions)
    
    ## initialie Keras Tokenizer and then fit on the fonud descriptions
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    
    return tokenizer

In [50]:
## Each word gets an index, which is stored in the `tokenizer.p` pickle file
tokenizer = create_tokenizer(train_descriptions)

dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
print('Entire vocab size: ', vocab_size)

Entire vocab size:  7577


In [51]:
## Also find the max_length of the descriptions so that model structure knows

'''
Calculates maximum length of the descriptions
'''
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max([len(d.split()) for d in desc_list])


In [52]:
max_length = max_length(descriptions)
max_length

32

## Data Generator

Train our model on 6000 images and each image will contain 2048 length feature vector and caption is also represented as numbers. This amount of data for 6000 images is not possible to hold into memory so we will be using a generator method that will yield batches.

Generator will yield input and output sequence.


In [53]:
## Create input-output sequence pairs from the image description
'''
Data generator, used by model.fit_generator()
'''
def data_generator(descriptions, features, tokenizer, max_length):
    while True:
        for key, description_list in descriptions.items():
            # Retrieve photo features
            feature = features[key][0]
            
            input_image, input_sequence, output_word = create_sequences(tokenizer, 
                                                                         max_length, 
                                                                         description_list, 
                                                                         feature)
            ## yield the sequence result
            yield [[input_image, input_sequence], output_word]
            
'''
Tokenizes the descriptions into multiple X, y pairs 
so that features, input, and output can be separated
'''
def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    
    ## Go through each description for the image
    for desc in desc_list:
        ## Encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        
        ## Split sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            ## Split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            
            ## Pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            
            ## Encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            
            ## Store the input and output 
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
            
    return np.array(X1), np.array(X2), np.array(y)
            
            

In [54]:
## can vereify shape of input and output
[a,b], c = next(data_generator(train_descriptions, features, tokenizer, max_length))

a.shape, b.shape, c.shape

((47, 2048), (47, 32), (47, 7577))

## Defining CNN-RNN model

Using Keras Model from Functional API, the model is mad eup of 3 major components:

1. **Feature Extractor** – The feature extracted from the image has a size of 2048, with a dense layer, we will reduce the dimensions to 256 nodes.

2. **Sequence Processor** – An embedding layer will handle the textual input, followed by the LSTM layer.

3. **Decoder** – By merging the output from the above two layers, we will process by the dense layer to make the final prediction. The final layer will contain the number of nodes equal to our vocabulary size.

![Input layers](images/CNN-RNN-architecture.png)

In [55]:
## Define captioning model

def define_model(vocab_size, max_length, summary_fn='model.png'):
    ## Features from the CNN model squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(.5)(inputs1) ## prevents overfitting
    fe2 = Dense(256, activation='relu')(fe1) ## deeply connected
    
    ## LSTM Sequence Model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(.5)(se1)
    se3 = LSTM(256)(se2)
    
    ## Merging both models
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    ## all together, compiled 
    model = Model(inputs=[inputs1, inputs2],
                 outputs=outputs)
    
    model.compile(loss='categorical_crossentropy',
                 optimizer='adam')
    
    ## Summarizer model
    print(model.summary())
    plot_model(model, to_file=summary_fn, show_shapes=True)
    
    return model
    

## Training the CNN-RNN Model


In [60]:
import time

In [63]:
start = time.time()
print("hi")
print()
time.time() - start

hi



0.0004992485046386719

In [64]:
# train our model
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_descriptions)

# making a directory models to save our models
# os.mkdir("models")
start = time.time()
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit_generator(generator, epochs=1, steps_per_epoch= steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")

finished = time.time() - start
print('Finished in : ', finished)

Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 7577
Description Length:  32
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 32)           0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 32, 256)      1939712     input_12[0][0]                   
__________________________________________________________________________________________________
dropout_9 (Dropout)             (None, 2048)         0           input_11[0][0]                   
__

## Testing the model

We can create a separate `testing_caption_generator.py` thatt will load the model and generate predictions.
Predictions contain max length of index values so we use the same `tokenizer.py` pickle file to get words from their index values

## References

- https://data-flair.training/blogs/python-based-project-image-caption-generator-cnn/