In [4]:
import github_command as gt

In [57]:
# NOTE(review): presumably commits this notebook and pushes it to the named repository;
# the behavior of `github_command.push` is not visible in this notebook — confirm before re-running.
gt.push(file_to_transfer='TD7_Image_Captioning_CNN_RNN.ipynb',
       message="initial commit",
       repos="TDs_ESILV.git")

# Image captioning

### CNN Network part
#### Get the InceptionV3 model trained on imagenet data

In [54]:
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model
# Full InceptionV3 network, loaded with its ImageNet-trained weights.
model = InceptionV3(weights='imagenet')
# Truncated copy: same input, but the output is taken from the second-to-last
# layer, so the final softmax classification layer is left out.
model_new = Model(inputs=model.input, outputs=model.layers[-2].output)

In [56]:
# Print the layer-by-layer summary of the truncated model built above.
model_new.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv2d_189 (Conv2D)             (None, None, None, 3 864         input_3[0][0]                    
__________________________________________________________________________________________________
batch_normalization_189 (BatchN (None, None, None, 3 96          conv2d_189[0][0]                 
__________________________________________________________________________________________________
activation_189 (Activation)     (None, None, None, 3 0           batch_normalization_189[0][0]    
__________________________________________________________________________________________________
conv2d_190

### Data part

#### Image descriptions


In [38]:
import os

# Dataset location. The historical default is kept, but it can be overridden with the
# FLICKR30K_DIR environment variable so the notebook is not tied to a single machine.
# os.path.join(..., "") guarantees a trailing separator, because later cells build
# file paths by plain string concatenation (folder_proj_path + file_name).
folder_proj_path = os.path.join(
    os.environ.get("FLICKR30K_DIR", "/Users/lucbertin/Downloads/demos/flickr30k_images/"),
    "",
)
# Name of the pipe-separated captions file located inside folder_proj_path.
captions_file = "results.csv"

In [39]:
import subprocess
# Peek at the first two raw lines of the captions file (header + first caption).
# The lines are read directly in binary mode instead of shelling out to `head`,
# which is not available on every platform; the displayed value is the same bytes.
from itertools import islice
with open(folder_proj_path + captions_file, "rb") as captions_fh:
    captions_head = b"".join(islice(captions_fh, 2))
captions_head

b'image_name| comment_number| comment\n1000092795.jpg| 0| Two young guys with shaggy hair look at their hands while hanging out in the yard .\n'

In [41]:
import pandas as pd
## Load the captions table from the '|'-separated file
## (columns: image_name, comment_number, comment) and preview the first rows
captions_path = folder_proj_path + captions_file
df = pd.read_csv(captions_path, sep='|')
df.head()

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,Two young guys with shaggy hair look at their...
1,1000092795.jpg,1,"Two young , White males are outside near many..."
2,1000092795.jpg,2,Two men in green shirts are standing in a yard .
3,1000092795.jpg,3,A man in a blue shirt standing in a garden .
4,1000092795.jpg,4,Two friends enjoy time spent together .


In [43]:
# Report the table dimensions, then remove the blanks that the '| ' separator
# left inside the header names, and show the cleaned names.
print(df.shape)
df.columns = df.columns.map(lambda name: name.replace(' ', ''))
print(df.columns)

(158915, 3)
Index(['image_name', 'comment_number', 'comment'], dtype='object')


In [110]:
## Normalise each caption and wrap it with the startseq / endseq markers:
##   1. lowercase the text,
##   2. strip every character that is not a lowercase letter, a digit or a space,
##   3. collapse runs of whitespace (split + join),
##   4. prepend "startseq " and append " endseq".
## regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` treats
## the pattern as a literal string by default, which would leave punctuation in.
df['comment2'] = ("startseq "  +  df.comment
                                        .str.lower()
                                        .str.replace(r"[^a-z0-9 ]", "", regex=True)
                                        .str.split().str.join(" ") + " endseq")

In [111]:
## Keep only the words that occur at least 10 times over all captions
from collections import Counter

# Tokenise every caption once; the same token lists feed both the counting
# and the filtering below.
tokenized_comments = df.comment2.str.split(' ')
all_words = [word for tokens in tokenized_comments.tolist() for word in tokens]
word_counts = Counter(all_words)
# Vocabulary: word -> number of occurrences, restricted to counts >= 10.
more_than_10_occurences = {word: count for word, count in word_counts.items() if count >= 10}
# Replace each caption by its token list with out-of-vocabulary words dropped.
df['comment2'] = tokenized_comments.apply(
    lambda tokens: [word for word in tokens if word in more_than_10_occurences]
)
len(more_than_10_occurences)

5463

In [112]:
# Size passed to PIL's resize() for every image before encoding; it is the default
# of load_batch_of_images. NOTE(review): presumably chosen to match InceptionV3's
# expected 299x299 input — confirm.
TARGET_SIZE=(299,299)

In [113]:
def encode(image, model_new):
    """Turn a single image into a flat feature vector.

    Parameters
    ----------
    image : anything ``np.array`` can convert (e.g. a PIL image)
        The picture to encode.
    model_new : object exposing ``predict``
        The network used for the forward pass (the truncated InceptionV3 in
        this notebook, for which the result has size (2048, )).

    Returns
    -------
    numpy.ndarray
        The prediction of ``model_new`` flattened to one dimension.
    """
    import numpy as np
    from keras.applications.inception_v3 import preprocess_input

    # Pixels as an array with a leading batch axis of size 1 (Keras works on batches).
    batch = np.array(image)[np.newaxis, ...]
    # InceptionV3-specific input preprocessing followed by a forward pass;
    # the model is already trained, so this only reads out the encoding.
    features = model_new.predict(preprocess_input(batch))
    # Flatten, e.g. (1, 2048) -> (2048, ).
    return np.reshape(features, -1)

def load_batch_of_images(df, batch_size, model_for_encoding, folder_imgs_path, TARGET_SIZE=TARGET_SIZE, random_state=None):
    """Sample ``batch_size`` rows of ``df`` and attach the encoded image to each.

    Only ``batch_size`` pictures are opened per call, which keeps the
    computation affordable.

    Parameters
    ----------
    df : pandas.DataFrame
        Captions table; must contain an ``image_name`` column holding file names.
    batch_size : int
        Number of rows to draw (without replacement, see ``DataFrame.sample``).
    model_for_encoding : object exposing ``predict``
        Model forwarded to ``encode`` for every image.
    folder_imgs_path : str
        Directory containing the image files (trailing separator optional).
    TARGET_SIZE : tuple of int
        Size every image is resized to before encoding.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a seed to get a reproducible
        batch. The default ``None`` draws a different batch on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, re-indexed from 0, with an extra ``image`` column
        holding the vector returned by ``encode`` for each row.
    """
    import os
    from PIL import Image as Im

    def _load_and_encode(image_name):
        # os.path.join works whether or not folder_imgs_path ends with a separator.
        with Im.open(os.path.join(folder_imgs_path, image_name)) as img:
            # Force three channels: grayscale / RGBA / palette files would
            # otherwise produce arrays the network cannot consume.
            resized = img.convert("RGB").resize(TARGET_SIZE)
        # The file is closed at this point; `resized` is an independent in-memory copy.
        return encode(resized, model_for_encoding)

    # Take a random sample from the main dataset.
    df_sub = df.sample(n=batch_size, random_state=random_state).reset_index(drop=True)
    # Open, resize and encode each image one at a time, so decoded pictures are
    # not all kept in memory together.
    df_sub['image'] = df_sub.image_name.apply(_load_and_encode)
    return df_sub

def create_dictionnaries_for_string_convertion(vocab):
    """Build the two lookup tables translating between words and integer ids.

    Ids are assigned in the iteration order of ``vocab``, starting at 1.

    Returns
    -------
    (ixtoword, wordtoix) : tuple of dict
        ``ixtoword`` maps id -> word and ``wordtoix`` maps word -> id.
    """
    ixtoword = dict(enumerate(vocab, start=1))
    wordtoix = {word: ix for ix, word in ixtoword.items()}
    return ixtoword, wordtoix

# Word <-> id lookup tables built from the retained vocabulary.
ixtoword, wordtoix = create_dictionnaries_for_string_convertion(vocab=more_than_10_occurences)
# Number of tokens in the longest caption of the whole table
# (kept so that input vectors can later be given a uniform length).
maximum_length_caption_on_all_dataset = max(len(tokens) for tokens in df.comment2)
maximum_length_caption_on_all_dataset

78

In [121]:
# Show one caption of maximal length. The length is the value computed from the
# data and the row is picked by position, rather than hard-coding a length and a
# row label that stop being valid as soon as the vocabulary filtering changes.
caption_lengths = df["comment2"].apply(len)
df.loc[caption_lengths == maximum_length_caption_on_all_dataset, "comment2"].iloc[0]

['startseq',
 'a',
 'man',
 'wearing',
 'a',
 'helmet',
 'red',
 'pants',
 'with',
 'white',
 'stripes',
 'going',
 'down',
 'the',
 'sides',
 'and',
 'a',
 'white',
 'and',
 'red',
 'shirt',
 'is',
 'on',
 'a',
 'small',
 'bicycle',
 'using',
 'only',
 'his',
 'hands',
 'while',
 'his',
 'legs',
 'are',
 'up',
 'in',
 'the',
 'air',
 'while',
 'another',
 'man',
 'wearing',
 'a',
 'light',
 'blue',
 'shirt',
 'with',
 'dark',
 'blue',
 'trim',
 'and',
 'black',
 'pants',
 'with',
 'red',
 'stripes',
 'going',
 'up',
 'the',
 'sides',
 'is',
 'standing',
 'nearby',
 'gesturing',
 'toward',
 'the',
 'first',
 'man',
 'and',
 'holding',
 'a',
 'small',
 'of',
 'one',
 'of',
 'the',
 'seven',
 'endseq']

In [80]:
# Preview the first 100 characters of the printed form of each lookup table.
tuple(str(mapping)[:100] for mapping in (ixtoword, wordtoix))

("{1: 'startseq', 2: 'two', 3: 'young', 4: 'guys', 5: 'with', 6: 'shaggy', 7: 'hair', 8: 'look', 9: 'a",
 "{'startseq': 1, 'two': 2, 'young': 3, 'guys': 4, 'with': 5, 'shaggy': 6, 'hair': 7, 'look': 8, 'at':")

In [92]:
# Time the loading and encoding of one random batch of 32 images.
%time sub = load_batch_of_images(df, batch_size=32, model_for_encoding=model_new, folder_imgs_path=folder_proj_path)
# Shape of the encoded vector stored for the first row of the batch.
sub.loc[0, 'image'].shape

CPU times: user 23.2 s, sys: 4.81 s, total: 28 s
Wall time: 4.59 s


(2048,)

(2048,)