In [238]:
import github_command as gt

In [251]:
gt.push(file_to_transfer='TD7_Image_Captioning_CNN_RNN.ipynb',
       message="model definition",
       repos="TDs_ESILV.git")

In [4]:
from functools import wraps
from time import time

def timing(f):
    @wraps(f)
    def wrap(*args, **kw):
        ts = time()
        result = f(*args, **kw)
        te = time()
        print('func:%r args:[%r, %r] took: %2.4f sec' % \
          (f.__name__, args, kw, te-ts))
        return result
    return wrap

# Image captionning

### CNN Network part
#### Get the InceptionV3 model trained on imagenet data

In [54]:
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model
model = InceptionV3(weights='imagenet')
# Remove the last layer (output softmax layer) from the inception v3
model_new = Model(model.input, model.layers[-2].output)

### Data part

#### Image descriptions


In [210]:
folder_proj_path="/Users/lucbertin/Downloads/demos/"
flickr_folder = "flickr30k_images/"
captions_file = "results.csv"

In [211]:
import subprocess
subprocess.check_output(["head", "-n", "2", folder_proj_path+flickr_folder+captions_file])

b'image_name| comment_number| comment\n1000092795.jpg| 0| Two young guys with shaggy hair look at their hands while hanging out in the yard .\n'

In [41]:
import pandas as pd
## Open descriptions dataset and append corresponding images
df = pd.read_csv(folder_proj_path+flickr_folder+captions_file, sep='|')
df.head()

Unnamed: 0,image_name,comment_number,comment
0,1000092795.jpg,0,Two young guys with shaggy hair look at their...
1,1000092795.jpg,1,"Two young , White males are outside near many..."
2,1000092795.jpg,2,Two men in green shirts are standing in a yard .
3,1000092795.jpg,3,A man in a blue shirt standing in a garden .
4,1000092795.jpg,4,Two friends enjoy time spent together .


In [43]:
print(df.shape)
df.columns = df.columns.str.replace(' ', '')
print(df.columns)

(158915, 3)
Index(['image_name', 'comment_number', 'comment'], dtype='object')


In [110]:
## Appending startseq and endseq to each comment
df['comment2'] = ("startseq "  +  df.comment
                                        .str.lower()
                                        .str.replace(r"[^a-z0-9 ]", "")
                                        .str.split().str.join(" ") + " endseq")

In [111]:
## Just take words occuring at least 10 times
#len(df.comment2.str.split(' ').tolist())
from collections import Counter
all_words = [item for sublist in df.comment2.str.split(' ').tolist() for item in sublist]
more_than_10_occurences = {k:val for k,val in Counter(all_words).items() if val>=10}
#more_than_10_occurences
df['comment2'] = df['comment2'].str.split(" ").apply(lambda val: [x for x in val if more_than_10_occurences.get(x) is not None])
len(more_than_10_occurences)

5463

In [112]:
TARGET_SIZE=(299,299)

In [176]:
def encode(image, model_new):
    """ Function to encode a given image into a vector of size (2048, ) using inceptionV3 """
    from keras.applications.inception_v3 import preprocess_input
    import numpy as np
    image = np.array(image) # transform img to array
    image = np.expand_dims(image, axis=0) # add one dimension for batch (keras needs it)
    image = preprocess_input(image) # preprocess the image for inceptionV3
    fea_vec = model_new.predict(image) # The model beign trained already, get the encoding vector for the image after a forward pass
    fea_vec = np.reshape(fea_vec, -1) # reshape from (1, 2048) to (2048, )
    return fea_vec

def create_dictionnaries_for_string_convertion(vocab):
    """ Create an index to word dictionnary and a word to index one """
    ixtoword, wordtoix = {}, {}
    ix = 1
    for w in vocab:
        wordtoix[w] = ix
        ixtoword[ix] = w
        ix += 1
    return ixtoword, wordtoix

ixtoword, wordtoix = create_dictionnaries_for_string_convertion(vocab=more_than_10_occurences)
maximum_length_caption_on_all_dataset = max(df.comment2.apply(len)) # max caption length ( for homogeneity of input vectors )
maximum_length_caption_on_all_dataset

78

In [179]:
" ".join(df[df["comment2"].apply(lambda x: len(x)==78)].comment2[16050])

'startseq a man wearing a helmet red pants with white stripes going down the sides and a white and red shirt is on a small bicycle using only his hands while his legs are up in the air while another man wearing a light blue shirt with dark blue trim and black pants with red stripes going up the sides is standing nearby gesturing toward the first man and holding a small of one of the seven endseq'

In [180]:
str(ixtoword)[:100], str(wordtoix)[:100]

("{1: 'startseq', 2: 'two', 3: 'young', 4: 'guys', 5: 'with', 6: 'shaggy', 7: 'hair', 8: 'look', 9: 'a",
 "{'startseq': 1, 'two': 2, 'young': 3, 'guys': 4, 'with': 5, 'shaggy': 6, 'hair': 7, 'look': 8, 'at':")

#### encoding sequence and padding (for creation of inputs) #####

In [551]:
def encoding__padding_inputs_seq(sequence, vocab, shift, max_length):
    encoding = list(map(vocab.get, sequence[:shift+1]))
    encoding += [0]*(max_length-len(encoding))
    return encoding

In [552]:
a = ["the", "rabbit", "is", "in", "the","kitchen"] 

In [553]:
[encoding__padding_inputs_seq(sequence=a, shift=i, vocab=wordtoix, max_length=6) for i in range(len(a))]

[[16, 0, 0, 0, 0, 0],
 [16, 4380, 0, 0, 0, 0],
 [16, 4380, 68, 0, 0, 0],
 [16, 4380, 68, 15, 0, 0],
 [16, 4380, 68, 15, 16, 0],
 [16, 4380, 68, 15, 16, 105]]

#### one-hot encoding based on vocab encoding for outputs (result in softmax) #####

In [611]:
def encoding_outputs_from_seq(sequence, shift, vocab):
    import numpy as np
    encoding = vocab.get(sequence[shift])
    return np.eye(len(vocab)+1)[encoding, :]

In [612]:
[encoding_outputs_from_seq(sequence=a, shift=i, vocab=wordtoix) for i in range(len(a))]

[array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.])]

## Generate a batch of images then the sequence of Xt inputs with respective targets Yt

In [662]:
df_sub.input_sequences

0      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1      [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2      [1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3      [1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4      [1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                             ...                        
110    [1, 62, 26, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
111    [1, 62, 26, 51, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
112    [1, 62, 26, 51, 30, 63, 0, 0, 0, 0, 0, 0, 0, 0...
113    [1, 62, 26, 51, 30, 63, 64, 0, 0, 0, 0, 0, 0, ...
114    [1, 62, 26, 51, 30, 63, 64, 18, 0, 0, 0, 0, 0,...
Name: input_sequences, Length: 115, dtype: object

In [None]:
df_sub['input_sequences']  = df_sub.comment2.apply(lambda x: [encoding__padding_inputs_seq(sequence=x, shift=i, vocab=wordtoix, max_length=maximum_length_caption_on_all_dataset) for i in range(len(x))])
df_sub['output_sequences'] = df_sub.comment2.apply(lambda x:[encoding_outputs_from_seq(sequence=x, shift=i, vocab=wordtoix) for i in range(len(x))])
df_sub = df_sub.apply(lambda x: x.apply(pd.Series).stack()).reset_index(drop=True).ffill()

In [665]:
def load_batch_of_images(df, batch_size, model_for_encoding, folder_imgs_path, TARGET_SIZE=TARGET_SIZE):
    """ This function will only load batch_size pictures at a time (for computations)"""
    from PIL import Image as Im
    import pandas as pd, numpy as np
    
    # iterate for ever (check Keras documentation)
    while 1:
        ## shuffling
        df = df.sample(n=len(df))
        ## One epoch = One loop
        for batch_i in range(len(df)//batch_size):
            ### Images encoding part ####
            # take a sample from the main dataset
            df_sub = df.loc[:batch_i*batch_size].reset_index(drop=True)
            # open corresponding images in new column
            df_sub['image'] = df_sub.image_name.apply( lambda x: Im.open(folder_imgs_path+x).resize(TARGET_SIZE))
            # transform to array, preprocess and encode images
            df_sub['image'] = df_sub.image.apply(lambda x: encode(x, model_for_encoding))
            
            ### Word sequence convertion to index then embedding part ####
            df_sub['input_sequences']  = df_sub.comment2.apply(lambda x: [encoding__padding_inputs_seq(sequence=x, shift=i, vocab=wordtoix, max_length=maximum_length_caption_on_all_dataset) for i in range(len(x))])
            df_sub['output_sequences'] = df_sub.comment2.apply(lambda x:[encoding_outputs_from_seq(sequence=x, shift=i, vocab=wordtoix) for i in range(len(x))])
            df_sub.apply(lambda x: x.apply(pd.Series).stack()).reset_index(drop=True).ffill()
            
            
            # [[input1, input2],  output]
            return [[np.array(df_sub.image), np.array(), np.array()]]

In [664]:
[ input1, input2], output = load_batch_of_images(df, batch_size=32, model_for_encoding=model_new, folder_imgs_path=folder_proj_path+flickr_folder)
sub.loc[0, 'image'].shape

KeyboardInterrupt: 

## Word Embedding
### gives a vector representation of words converted into numerical indexes


#@email_sender(recipient_emails=["<your_email@address.com>", "<your_second_email@address.com>"], sender_email="<grandma's_email@gmail.com>")

In [215]:
glove_file="glove/glove.6B.200d.txt"

In [218]:
subprocess.check_output(["head", "-n", "1", folder_proj_path+glove_file])

b'the -0.071549 0.093459 0.023738 -0.090339 0.056123 0.32547 -0.39796 -0.092139 0.061181 -0.1895 0.13061 0.14349 0.011479 0.38158 0.5403 -0.14088 0.24315 0.23036 -0.55339 0.048154 0.45662 3.2338 0.020199 0.049019 -0.014132 0.076017 -0.11527 0.2006 -0.077657 0.24328 0.16368 -0.34118 -0.06607 0.10152 0.038232 -0.17668 -0.88153 -0.33895 -0.035481 -0.55095 -0.016899 -0.43982 0.039004 0.40447 -0.2588 0.64594 0.26641 0.28009 -0.024625 0.63302 -0.317 0.10271 0.30886 0.097792 -0.38227 0.086552 0.047075 0.23511 -0.32127 -0.28538 0.1667 -0.0049707 -0.62714 -0.24904 0.29713 0.14379 -0.12325 -0.058178 -0.001029 -0.082126 0.36935 -0.00058442 0.34286 0.28426 -0.068599 0.65747 -0.029087 0.16184 0.073672 -0.30343 0.095733 -0.5286 -0.22898 0.064079 0.015218 0.34921 -0.4396 -0.43983 0.77515 -0.87767 -0.087504 0.39598 0.62362 -0.26211 -0.30539 -0.022964 0.30567 0.06766 0.15383 -0.11211 -0.09154 0.082562 0.16897 -0.032952 -0.28775 -0.2232 -0.090426 1.2407 -0.18244 -0.0075219 -0.041388 -0.011083 0.078186 0

In [214]:
import knockknock

### Load the whole embedding into memory



In [221]:
embeddings_index = {} # empty dictionary
with open(folder_proj_path+glove_file, encoding="utf-8") as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded {} word vectors.'.format(len(embeddings_index)))

Loaded 400000 word vectors.


### transform integer vector representation to dense one

In [227]:
embedding_dim

200

* embeddings_index associate a **word** to a **vector representation**
* wordtoix associate a **word** to a **integer number**



In [233]:
# Get 200-dim dense vector for each of the 5464 words in out vocabulary (word_to_idx)
embedding_matrix = np.zeros((len(wordtoix)+1, 200)) # 200: embedding dim: the Dense representation of the word with '200 like features'
for word, i in wordtoix.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [237]:
embedding_matrix.shape

(5464, 200)

* To each 5464 word is **associated a vector** 
* It's a **stack of vectors and the index i of the matrix is associated to the index of the word itself**

## Model definition

In [245]:
maximum_length_caption_on_all_dataset
vocab_size = len(wordtoix) + 1
embedding_dim = 200 # dense words representation

In [250]:
from keras.layers import Dense, Dropout, LSTM, Input, Embedding, add
from keras.models import Model

# image feature extractor model (as 2048 vector of features)
inputs1 = Input(shape=(2048,))
fe1     = Dropout(0.5)(inputs1)
fe2     = Dense(256, activation='relu')(fe1)

# partial caption sequence model (as max size of sequences (padding: 78 values in list))
inputs2 = Input(shape=(maximum_length_caption_on_all_dataset,)) 
se1     = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(inputs2)
se2     = Dropout(0.5)(se1)
se3     = LSTM(256)(se2)

# decoder (feed forward) model
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs  = Dense(vocab_size, activation='softmax')(decoder2)

# merge the two input models
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [252]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 78)           0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 78, 200)      1092800     input_13[0][0]                   
__________________________________________________________________________________________________
dropout_8 (Dropout)             (None, 2048)         0           input_12[0][0]                   
__________________________________________________________________________________________________
dropout_9 

In [254]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [255]:
epochs = 10
number_pics_per_bath = 3
steps = len(train_descriptions)//number_pics_per_bath


NameError: name 'train_descriptions' is not defined

In [None]:
model.fit_generator()