In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras

import string
import re

from sklearn.model_selection import train_test_split

from imageio import imread

from keras.applications import InceptionV3
from keras import Model

from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input

import os
import pickle

# Problem description

Our goal is to show how neural networks behave during automated image captioning. We aim to understand which components of the neural network are activated and participate in the recognition of an "object" in the image. 

In a second step, we will try to see whether this information can be used to reduce the size of the neural network, producing a smaller neural net with the same behaviour as the original one.

## Building the model

Our aim is not accuracy; we want a neural network with decent accuracy for this kind of problem that will allow us to analyze its behaviour.

We read the data and build `descriptions`, a dict mapping every picture name to its captions.

Most of these steps are taken from https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8

Our goal is not to create the model but to visualize what happens inside it when it is used.

In [73]:
# Gather the caption records of the three dataset files into `lines`,
# skipping the first line of each file and reporting the running total.
lines = []
for dataset_path in (
    "flickr_8k_train_dataset.txt",
    "flickr_8k_test_dataset.txt",
    "flickr_8k_val_dataset.txt",
):
    with open(dataset_path) as file:
        lines += file.readlines()[1:]

    print(f"len lines : {len(lines)}")


len lines : 30000
len lines : 35000
len lines : 40000


In [112]:
# Peek at the first raw record: image file name, a tab, then the caption
# wrapped in <start>/<end> markers (see the output below).
lines[0]

'2513260012_03d33305cf.jpg\t<start> A black dog is running after a white dog in the snow . <end>\n'

In [105]:
# Map every image id (file name cut at its first dot) to the list of its captions.
descriptions = {}
mapping = {}
for line in lines:
    # Whitespace-split the record: first field is the image file name,
    # the remaining fields are the caption words.
    fields = line.split()
    image_id = fields[0].split('.')[0]

    # Re-assemble the caption as a single space-separated string and file it
    # under its image id, creating the list on first sight of that id.
    image_desc = ' '.join(fields[1:])
    descriptions.setdefault(image_id, []).append(image_desc)

In [77]:
# Translation table that deletes every punctuation character
table = str.maketrans('', '', string.punctuation)


def _clean_caption(caption):
    """Lower-case a caption, strip punctuation, and drop short or non-alphabetic tokens.

    Tokens are kept only when, after lower-casing and punctuation removal,
    they are longer than one character and purely alphabetic.
    """
    normalized = (token.lower().translate(table) for token in caption.split())
    return ' '.join(word for word in normalized if len(word) > 1 and word.isalpha())


# Clean every caption in place, so the lists held by `descriptions` are updated
for desc_list in descriptions.values():
    for i, desc in enumerate(desc_list):
        desc_list[i] = _clean_caption(desc)

In [78]:
# Show the cleaned captions of the first image stored in the dict
descriptions[list(descriptions)[0]]

['start black dog is running after white dog in the snow end',
 'start black dog chasing brown dog through snow end',
 'start two dogs chase each other across the snowy ground end',
 'start two dogs play together in the snow end',
 'start two dogs running through low lying body of water end']

In [79]:
# Collect the distinct words used across all cleaned captions
vocabulary = set()
for captions in descriptions.values():
    for caption in captions:
        vocabulary.update(caption.split())
print('Original Vocabulary Size: %d' % len(vocabulary))

Original Vocabulary Size: 8657


In [80]:
# Split the image ids 80/20 into a training set and a held-out set.
# A fixed random_state keeps the split identical across kernel restarts,
# so files saved from one run stay comparable with the next run.
train_descriptions, test_descriptions = train_test_split(list(descriptions), test_size=0.2, random_state=42)

In [81]:
# Split the held-out ids in half: one half for testing, one for validation.
# A fixed random_state keeps the split identical across kernel restarts.
# NOTE: this overwrites `test_descriptions`, so re-running only this cell
# halves the test set again; re-run the previous split cell first.
test_descriptions, val_descriptions = train_test_split(test_descriptions, test_size=0.5, random_state=42)

In [82]:
# Turn each list of image ids back into an {image id: captions} dict
train_descriptions = {image_id: descriptions[image_id] for image_id in train_descriptions}
test_descriptions = {image_id: descriptions[image_id] for image_id in test_descriptions}
val_descriptions = {image_id: descriptions[image_id] for image_id in val_descriptions}


In [83]:
# Sanity check: the three splits together must account for every image.
# `Error` is not a defined name, so the original failure path raised a
# NameError; a built-in exception carries the intended message instead.
if len(train_descriptions) + len(test_descriptions) + len(val_descriptions) != len(descriptions):
    raise ValueError("Data is missing")

# The full dict is no longer needed once the splits are verified.
del descriptions

In [84]:
# Create a list of all the training captions
all_train_captions = []
for key, val in train_descriptions.items():
    for cap in val:
        all_train_captions.append(cap)

# Consider only words which occur at least 10 times in the corpus
word_count_threshold = 10
word_counts = {}
nsents = 0
for sent in all_train_captions:
    nsents += 1
    for w in sent.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1

vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold] + ['startseq' , 'endseq' , '0']

print('preprocessed words %d ' % len(vocab))

preprocessed words 1721 


In [122]:
# Persist the vocabulary list to disk
with open("vocab.pickle" , "wb") as file:
    pickle.dump(vocab, file)

In [85]:
# Replace start and end by 'startseq' and 'endseq' for each sentence that starts with 'start' or ends with 'end'
def replace_starting_seq(s, seq, new_seq):
    """Return ``s`` with a leading ``seq`` swapped for ``new_seq``.

    Strings that do not begin with ``seq`` are returned unchanged.
    """
    if s.startswith(seq):
        return new_seq + s[len(seq):]
    return s
def replace_ending_seq(s, seq, new_seq):
    """Return ``s`` with a trailing ``seq`` swapped for ``new_seq``.

    Strings that do not end with ``seq`` are returned unchanged.
    An empty ``seq`` matches every string, so ``new_seq`` is appended.
    """
    if not s.endswith(seq):
        return s
    # Slice with an explicit end index: ``s[:-len(seq)]`` becomes ``s[:0]``
    # when ``seq`` is empty and would throw the whole string away.
    return s[:len(s) - len(seq)] + new_seq

def _to_seq_markers(caption):
    """Swap the 'start'/'end' markers of one caption for 'startseq'/'endseq'."""
    caption = replace_ending_seq(caption, 'end', 'endseq')
    # 'startseq' itself begins with 'start': leave converted captions alone so
    # that re-running this cell does not turn them into 'startseqseq ...'.
    if caption.startswith('startseq'):
        return caption
    return replace_starting_seq(caption, 'start', 'startseq')


train_descriptions = {
    key: [_to_seq_markers(caption) for caption in descs]
    for key, descs in train_descriptions.items()
}

In [15]:
# Get the InceptionV3 model trained on imagenet data
model = InceptionV3(weights='imagenet')
# Remove the last layer (output softmax layer) from the inception v3
model_new = Model(model.input, model.layers[-2].output)

In [18]:
# List the entries of the Flickr8k image folder and display the first one
images_names = os.listdir("Flickr8k/Flicker8k_Dataset/")
images_names[0]

'2387197355_237f6f41ee.jpg'

In [19]:
# Keep only the part of each file name that precedes its first dot
images_names = [file_name.partition(".")[0] for file_name in images_names]
images_names[0]

'2387197355_237f6f41ee'

In [115]:
# Iterator over the training image ids
train_names = iter(train_descriptions)

In [117]:
# Load every training image resized to 299x299, paired with its image id
train_imgs = []
for tr_name in train_descriptions:
    tr_img = image.load_img(f"Flickr8k/Flicker8k_Dataset/{tr_name}.jpg", target_size=(299, 299))
    train_imgs.append((tr_name, tr_img))


In [118]:
# Load every test image resized to 299x299, paired with its image id
test_imgs = []
for te_name in test_descriptions:
    te_img = image.load_img(f"Flickr8k/Flicker8k_Dataset/{te_name}.jpg", target_size=(299, 299))
    test_imgs.append((te_name, te_img))


In [119]:
# Load every validation image resized to 299x299, paired with its image id
val_imgs = []
for val_name in val_descriptions:
    val_img = image.load_img(f"Flickr8k/Flicker8k_Dataset/{val_name}.jpg", target_size=(299, 299))
    val_imgs.append((val_name, val_img))


In [120]:
# For each split, map image id -> its captions plus the raw pixel array
# (with a leading axis of length 1 added) stored under 'features'.
train_imgs_dico = {
    key: {
        'descriptions': train_descriptions[key],
        'features': np.expand_dims(np.array(tr_d), axis=0),
    }
    for key, tr_d in train_imgs
}

test_imgs_dico = {
    key: {
        'descriptions': test_descriptions[key],
        'features': np.expand_dims(np.array(te_d), axis=0),
    }
    for key, te_d in test_imgs
}

val_imgs_dico = {
    key: {
        'descriptions': val_descriptions[key],
        'features': np.expand_dims(np.array(val_d), axis=0),
    }
    for key, val_d in val_imgs
}



In [121]:
# Save for every image (key) the descriptions and the vector representation
# Will be used during exploration of feature extractions from the original data

# Write each split's {image id: captions + pixel array} dict to its own pickle file
for pickle_path, imgs_dico in (
    ('train_imgs.pickle', train_imgs_dico),
    ('test_imgs.pickle', test_imgs_dico),
    ('val_imgs.pickle', val_imgs_dico),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(imgs_dico, file)

#### Feature extraction using InceptionV3 ( transfer learning )

This step is computationally heavy and can take several hours.

In [53]:
# Encode every training image with the truncated InceptionV3 network.
# `train_imgs` holds (image id, PIL image) pairs, so each pair is unpacked
# before conversion; `preprocess_input` rescales the pixels to the [-1, 1]
# range InceptionV3 expects. Output order follows `train_imgs`.
train_data = [
    model_new.predict(
        preprocess_input(np.expand_dims(image.img_to_array(tr_im), axis=0))
    )
    for _, tr_im in train_imgs
]

In [54]:
# Encode every test image with the truncated InceptionV3 network.
# `test_imgs` holds (image id, PIL image) pairs, so each pair is unpacked
# before conversion; `preprocess_input` rescales the pixels to the [-1, 1]
# range InceptionV3 expects. Output order follows `test_imgs`.
test_data = [
    model_new.predict(
        preprocess_input(np.expand_dims(image.img_to_array(te_im), axis=0))
    )
    for _, te_im in test_imgs
]

In [55]:
# Encode every validation image with the truncated InceptionV3 network.
# `val_imgs` holds (image id, PIL image) pairs, so each pair is unpacked
# before conversion; `preprocess_input` rescales the pixels to the [-1, 1]
# range InceptionV3 expects. Output order follows `val_imgs`.
val_data = [
    model_new.predict(
        preprocess_input(np.expand_dims(image.img_to_array(val_im), axis=0))
    )
    for _, val_im in val_imgs
]

In [86]:
# For each split, pair every image id with its captions and its encoded
# feature vector. The pairing is positional: the i-th entry of the
# descriptions dict is matched with the i-th element of the encoded list.
train_data_dico = {
    key: {'descriptions': descs, 'features': tr_d}
    for (key, descs), tr_d in zip(train_descriptions.items(), train_data)
}

test_data_dico = {
    key: {'descriptions': descs, 'features': te_d}
    for (key, descs), te_d in zip(test_descriptions.items(), test_data)
}

val_data_dico = {
    key: {'descriptions': descs, 'features': val_d}
    for (key, descs), val_d in zip(val_descriptions.items(), val_data)
}


In [87]:
# Save each image (key) with its descriptions (captions) and the encoded features vector

# Write each split's {image id: captions + encoded features} dict to its own pickle file
for pickle_path, data_dico in (
    ('train_data_encoded.pickle', train_data_dico),
    ('test_data_encoded.pickle', test_data_dico),
    ('val_data_encoded.pickle', val_data_dico),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(data_dico, file)

### References: 

dataset : Flickr 8k

code and code inspiration : https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8
