In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras

import string
import re

from sklearn.model_selection import train_test_split

from imageio import imread

from keras.applications import InceptionV3
from keras import Model

from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding , Input , Dropout , Dense , LSTM , add

import os
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Problem description

Our goal is to show how neural networks behave during automated image captioning. We aim to understand which components of the neural network are activated and participate in the recognition of an "object" in the image.

In a second step, we will try to see whether we can use this information to reduce the size of the neural network, producing a smaller neural net with the same behaviour as the first.

## Building the model

Our aim is not accuracy: we want a neural network with decent accuracy for this kind of problem that will allow us to analyze its behaviour.

We start by reading the data and creating `descriptions`, a dict that maps every picture name to its captions.

Most of these steps are taken from https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8

Our goal is not to create the model but to visualize what happens when it is used.

In [2]:
# Load InceptionV3 with its ImageNet-pretrained weights.
model = InceptionV3(weights='imagenet')
# Build a second model that shares the same input but stops at the
# second-to-last layer, i.e. without the final (softmax) output layer.
model_new = Model(inputs=model.input, outputs=model.layers[-2].output)

In [16]:
# List every file name found in the Flickr8k image directory.
images_names = os.listdir("Flickr8k/Flicker8k_Dataset/")
# Show the first entry to check the naming format.
images_names[0]

'2387197355_237f6f41ee.jpg'

In [17]:
# Keep only the part of each file name that precedes its first "."
images_names = [file_name.partition(".")[0] for file_name in images_names]
images_names[0]

'2387197355_237f6f41ee'

In [18]:
# Count the image names that appear in none of the three description splits.
# NOTE(review): train_descriptions is only defined in a later cell, and
# test_descriptions / val_descriptions are not defined anywhere in this
# notebook — confirm where they come from before a Restart & Run All.
i = sum(
    1
    for elem in images_names
    if not (elem in train_descriptions or elem in test_descriptions or elem in val_descriptions)
)

print(i)

91


In [3]:
# Deserialize the pre-encoded training data.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open("train_data_encoded.pickle", "rb") as file:
    train_data = pickle.load(file)

In [10]:
# Map every key of train_data to the "descriptions" entry stored under it.
train_descriptions = {key: val["descriptions"] for key, val in train_data.items()}

In [6]:
# Deserialize the vocabulary.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open("vocab.pickle", "rb") as file:
    vocab = pickle.load(file)

In [7]:
# Number the vocabulary entries starting at 1 (0 is never assigned to a word)
# and keep the mapping in both directions.
ixtoword = dict(enumerate(vocab, start=1))
wordtoix = {word: index for index, word in ixtoword.items()}

In [8]:
def get_max_len(descriptions):
    """Return the length, in whitespace-separated tokens, of the longest caption.

    Parameters
    ----------
    descriptions : dict
        Maps a key to an iterable of caption strings.

    Returns
    -------
    int
        The largest ``len(caption.split())`` over all captions, or 0 when
        there are no captions at all.
    """
    # Pass the iterable itself to max() rather than unpacking it with `*`:
    # unpacking a single-element list turns the call into max(3), which
    # raises TypeError because an int is not iterable.
    return max(
        ( max( ( len( d.split() ) for d in desc ) , default=0 ) for desc in descriptions.values() ),
        default=0,
    )

def get_max_len2(descriptions):
    """Return the length, in whitespace-separated tokens, of the longest caption.

    Flattened variant: every caption of every key is scanned in a single pass.

    Parameters
    ----------
    descriptions : dict
        Maps a key to an iterable of caption strings.

    Returns
    -------
    int
        The largest ``len(caption.split())`` over all captions, or 0 when
        there are no captions at all.
    """
    # max() receives the generator directly; unpacking with `*` would fail
    # with TypeError whenever exactly one caption is present.
    return max( ( len( d.split() ) for desc in descriptions.values() for d in desc ) , default=0 )

In [11]:
# Longest training caption, measured in whitespace-separated tokens.
get_max_len2(train_descriptions)

34

In [12]:
# Sanity check: compare the sizes of both lookup tables and of the vocabulary.
len(ixtoword) , len(wordtoix) , len(vocab)

(1721, 1721, 1721)

In [79]:
def data_generator(data, wordtoix, max_length, num_photos_per_batch, vocab_size=None):
    """Create an endless generator of training batches for the captioning model.

    Every caption is encoded with ``wordtoix`` (words missing from it are
    dropped) and expanded into one sample per prefix: the first ``i`` tokens
    are the input sequence and token ``i`` is the target.

    Parameters
    ----------
    data : dict
        Maps a photo key to a dict holding ``'descriptions'`` (an iterable of
        caption strings) and ``'features'`` (the photo's feature array).
    wordtoix : dict
        Maps a word to its integer index.
    max_length : int
        Length the input sequences are padded to.
    num_photos_per_batch : int
        Number of photos whose samples are grouped into one batch.
    vocab_size : int, optional
        Width of the one-hot target vectors. Defaults to ``len(wordtoix)``.
        NOTE(review): a target index equal to or larger than this value cannot
        be one-hot encoded; when word indices start at 1, consider passing
        ``len(wordtoix) + 1`` and sizing the model's output layer to match.

    Returns
    -------
    generator
        Yields ``[[photo_features, input_sequences], targets]`` forever,
        cycling over ``data``. The photo counter is not reset when a pass
        over ``data`` ends, so a batch may span two passes.

    Raises
    ------
    ValueError
        If ``data`` is empty or ``num_photos_per_batch`` is not a positive
        whole number. Either case would otherwise loop forever without ever
        yielding a batch.
    """
    if not data:
        raise ValueError("data must contain at least one photo")
    if num_photos_per_batch < 1 or num_photos_per_batch != int(num_photos_per_batch):
        raise ValueError("num_photos_per_batch must be a positive whole number")
    if vocab_size is None:
        vocab_size = len(wordtoix)

    descriptions = {key: val['descriptions'] for key, val in data.items()}
    photos = {key: val['features'] for key, val in data.items()}

    def _generate():
        X1, X2, y = [], [], []
        n = 0
        # Loop forever over the photos.
        while True:
            for key, desc_list in descriptions.items():
                n += 1
                photo = photos[key]
                for desc in desc_list:
                    # Encode the caption, silently skipping unknown words.
                    seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                    # One (prefix -> next word) sample per position.
                    for i in range(1, len(seq)):
                        in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]
                        out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]
                        X1.append(photo)
                        X2.append(in_seq)
                        y.append(out_seq)
                # Emit the batch once enough photos have been collected.
                if n == num_photos_per_batch:
                    yield [[np.array(X1), np.array(X2)], np.array(y)]
                    X1, X2, y = [], [], []
                    n = 0

    return _generate()

In [44]:
# Number of photos whose samples make up one batch.
num_photos_per_batch = 5
# Longest training caption (in tokens); used as the padded sequence length.
max_length = get_max_len( train_descriptions )

print(f'n photos/batch : {num_photos_per_batch}\nmax sequence length : {max_length}')

n photos/batch : 5
max sequence length : 34


In [45]:
# Batch generator over the training data.
# NOTE(review): train_data_gen is re-created in a later cell with a different
# batch size, so this instance appears to be unused — confirm.
train_data_gen = data_generator( train_data , wordtoix , max_length , num_photos_per_batch)

In [46]:
# Parse the GloVe file into {word: vector}. Each non-empty line is read as a
# word followed by its whitespace-separated coefficients.
# The file is streamed line by line instead of being loaded whole with
# readlines(), each line is split only once, and the encoding is explicit so
# the result does not depend on the platform default.
# NOTE(review): the path is relative to a personal Downloads folder — consider
# moving it to a configurable data directory.
glove_dict = {}
with open("../../Downloads/glove.6B.200d.txt", encoding="utf-8") as file:
    for line in file:
        parts = line.split()
        if not parts:
            # Skip blank lines rather than failing on them.
            continue
        glove_dict[parts[0]] = np.asarray(parts[1:], dtype='float32')

In [47]:
# Length of each word vector; used as the size of the zero vector for words
# missing from GloVe and as the Embedding layer's output dimension.
embedding_dim = 200

In [48]:
# For every vocabulary word, take its GloVe vector, or an all-zero vector of
# length embedding_dim when GloVe has no entry for that word.
embedding_matrix = {
    word: glove_dict.get(word, np.zeros(embedding_dim))
    for word in wordtoix
}

In [49]:
# Shape of one photo's feature array, read from the first training entry.
first_entry = next(iter(train_data.values()))
features_input_shape = first_entry['features'].shape
# NOTE(review): word indices in wordtoix start at 1, so the largest index
# equals len(wordtoix) and falls outside range(vocab_size). Changing this
# value must be done together with the one-hot width used by data_generator.
vocab_size = len(wordtoix)

print(f'features input shape : {features_input_shape}\nvocab size : {vocab_size}\nembedding dim : {embedding_dim}')

features input shape : (1, 2048)
vocab size : 1721
embedding dim : 200


In [50]:
# Image branch: photo features -> dropout -> 256-unit ReLU projection.
# NOTE(review): features_input_shape is printed as (1, 2048) above, so fe2 keeps
# an extra axis that se3 does not have — confirm add() merges them as intended.
inputs1 = Input( shape = features_input_shape )
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: padded partial caption -> embedding (index 0 is masked)
# -> dropout -> 256-unit LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder: sum both branches, then a 256-unit ReLU layer followed by a
# softmax over vocab_size outputs.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Tie the two inputs to the single output.
# This rebinds `model`, which previously held the InceptionV3 network.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [51]:
# Print the layer-by-layer overview of the captioning model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 34)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1, 2048)      0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 34, 200)      344200      input_5[0][0]                    
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 1, 2048)      0           input_4[0][0]                    
__________________________________________________________________________________________________
dropout_4 

In [52]:
# Re-key the embedding vectors by word index instead of by word.
# This cell is not idempotent: once the keys are indices, looking them up in
# wordtoix again is expected to raise KeyError.
embedding_matrix = {wordtoix[word]: vector for word, vector in embedding_matrix.items()}

In [53]:
# Locate the embedding layer by type instead of relying on its position in
# model.layers.
embedding_layer = next( layer for layer in model.layers if isinstance( layer , Embedding ) )

# Row i of the weight matrix is what the layer returns for word index i, so
# every vector has to be written at its own index. Stacking the vectors in
# sorted-key order would place the vector of index 1 in row 0 and shift every
# word by one. Row 0 stays all-zero: no word is assigned index 0, which is the
# padding value masked by mask_zero=True.
embedding_weights = np.zeros( ( embedding_layer.input_dim , embedding_layer.output_dim ) , dtype='float32' )
for word_ix , vector in embedding_matrix.items():
    # Skip indices the layer has no row for.
    if 0 <= word_ix < embedding_layer.input_dim:
        embedding_weights[word_ix] = vector

embedding_layer.set_weights( [ embedding_weights ] )
# Keep the pretrained vectors fixed during training.
embedding_layer.trainable = False

In [54]:
# Categorical cross-entropy matches the one-hot targets produced by
# data_generator; Adam is used with its default settings.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [55]:
# Overrides the batch size set in an earlier cell.
num_photos_per_batch = 30
# Number of full batches in one pass over the training photos.
steps = len(train_descriptions)//num_photos_per_batch

print(f'n photos/batch : {num_photos_per_batch}\nsteps : {steps}')

n photos/batch : 30
steps : 213


In [56]:
# Re-create the training generator so that it uses the new batch size.
train_data_gen = data_generator( train_data , wordtoix , max_length , num_photos_per_batch )

In [80]:
# Deserialize the pre-encoded validation data.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('val_data_encoded.pickle', 'rb') as file:
    val_data = pickle.load(file)

print(f'length val data : {len(val_data)}')

length val data : 800


In [81]:
# Materialize the whole validation set as one batch by asking the generator
# for a batch that contains every validation photo.
# This rebinds val_data from the loaded dict to the generated batch, so the
# cell cannot be re-run without reloading the pickle first.
# NOTE(review): the resulting batch is not passed to fit_generator in the
# training cell below — confirm whether validation was meant to be used.
val_data = next( data_generator( val_data , wordtoix , max_length , num_photos_per_batch = len( val_data ) ) )

### Fitting the model (takes several hours)

It is better to load the saved model and use it instead of retraining.

In [None]:
epochs = 3

for epoch in range(epochs):
    # Halve the batch size every epoch (40, 20, 10 photos for epochs = 3).
    # Integer division keeps both the batch size and the step count integers.
    num_photos_per_batch = (2**epochs) * 5 // (2**epoch)
    steps = len(train_descriptions) // num_photos_per_batch

    # Halve the learning rate every epoch. The optimizer's `lr` is a backend
    # variable: update its value in place instead of rebinding the attribute
    # to a Python float, which would not reach an already-built training
    # function and would break optimizer serialization on save.
    learning_rate = 0.001 / (2**epoch)
    keras.backend.set_value(model.optimizer.lr, learning_rate)

    # Rebuild the generator so that the new batch size is actually used.
    train_data_gen = data_generator(train_data, wordtoix, max_length, num_photos_per_batch)
    model.fit_generator(train_data_gen, epochs=1, steps_per_epoch=steps, verbose=1)

# The file name carries the learning rate of the last epoch.
model.save(f'model__lr_{learning_rate}.h5')
model.save_weights('my_weights.h5')

### References: 

dataset : Flickr 8k Images

code inspiration and code : https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8