In [1]:
#https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/?fbclid=IwAR0cTakCljZurgoBWawU0fNGVS6DI7Gj_yllpKNCEHknyB0TVk4MPuOyAvY
import tensorflow as tf
from tensorflow.compat.v1 import Session

if len(tf.config.experimental.list_physical_devices('GPU')) > 0:
    print("GPU ready to be used")
else:
    print("/!\ Warning GPU not in use, computing might take a while")

GPU ready to be used


In [2]:
import text_preprocessing as tpp
import image_preprocessing as ipp
import data_load as loader
from pickle import dump, load

import numpy as np
import matplotlib.pyplot as plt

#Preprocessing
from tensorflow.keras.applications import resnet as preprocessor_resnet
from tensorflow.keras.applications import vgg16 as preprocessor_vgg

#Models
from tensorflow.keras.applications.vgg16 import VGG16

#NLP
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk import word_tokenize
from nltk.translate.bleu_score import corpus_bleu

#model
from tensorflow.keras import Input
from tensorflow.keras import Model
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import add
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint

from importlib import reload
reload(tpp)
reload(ipp)
reload(loader)

<module 'data_load' from 'C:\\Users\\Lucas\\Desktop\\ISEP_2020_2021\\Machine_Learning\\Project - Caption generator\\data_load.py'>

In [5]:
images_path = "D:\DATASETS\Flicker8k_Dataset"
captions_token_path = "D:\DATASETS\Flickr8k_text\Flickr8k.token.txt"

captions_train_path = "D:\DATASETS\Flickr8k_text\Flickr_8k.trainImages.txt"
captions_valid_path = "D:\DATASETS\Flickr8k_text\Flickr_8k.devImages.txt"

export_path = "Saves/"

vgg_features = export_path+"vgg_features.pkl"
resnet_features = export_path+"resnet_features.pkl"

model_path = "Model_weights/"

valid_path = "Validation/"

best_weights = "model-ep020-loss0.877.h5"

# Data extraction (Only run once)

In [15]:
text = tpp.load_text(captions_token_path)
tpp.extract_description(text,export_path+"descriptions.txt")
print(f"{len(text)} texts loaded")

Description saved to file Saves/descriptions.txt success.
3395237 texts loaded


### VGG16

In [18]:
images = ipp.load_images(images_path,preprocessor_vgg)
print(f"{len(images)} images loaded")

Loading complete
8091 images loaded


In [22]:
# VGG16 Pre training
model = VGG16()
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

features = ipp.extract_features(images,model)
print(f"{len(features)} features loaded")

8091 features loaded


In [23]:
dump(features, open(export_path+'vgg_features.pkl', 'wb'))

# Loading data

In [4]:
class Dataset:
    def __init__(self,name,ids_path,desc_path,features_path,limit=None):
        self.name = name
        self.ids = loader.load_set(captions_train_path)
        
        if limit != None:
            self.ids = self.ids[:limit]
            
        self.descriptions = loader.load_desc_from_file(desc_path,self.ids)
        self.features = loader.load_photo_features(features_path,self.ids)
        
    def summary(self):
        print(f"=========> {self.name} <=========")
        print(f"Dataset: {len(self.ids)} elements")
        print(f"Descriptions: {len(self.descriptions)} elements")
        print(f"Features: {len(self.features)} elements")
        
    def generate_sequence(self,max_length,tokenizer,vocab_size):
        self.X1,self.X2,self.Y = create_sequence(
            tokenizer,
            max_length,
            self.descriptions,
            self.features,
            vocab_size
        )
        print(f"Sequences generated for {self.name}")
        

In [6]:
TRAIN_SIZE = 500
TEST_SIZE = 80

train_set = Dataset(
    "train",captions_train_path,export_path+'descriptions.txt',
    vgg_features, limit = TRAIN_SIZE
)
test_set = Dataset(
    "test",captions_valid_path,export_path+'descriptions.txt',
    vgg_features, limit = TEST_SIZE
)

train_set.summary()
test_set.summary()

Dataset: 500 elements
Descriptions: 500 elements
Features: 500 elements
Dataset: 80 elements
Descriptions: 80 elements
Features: 80 elements


In [7]:
def dict_to_list(dict_):
  descs = []
  for key in dict_.keys():
    [descs.append(desc) for desc in dict_[key]]

  return descs

def create_tokenizer(descriptions):
  desc_lines = dict_to_list(descriptions)
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(desc_lines)
  return tokenizer

In [8]:
tokenizer = create_tokenizer(train_set.descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 2199


In [9]:
def get_max_length(descriptions):
  max_ = 0
  for desc_key in descriptions:
    for desc in descriptions[desc_key]:
      if len(desc) > max_:
        max_ = len(desc)

  return max_

def create_sequence(tokenizer,max_length,descriptions,photos,vocab_size):
  X1,X2,Y = [],[],[]
  for photo_id in descriptions:
    for line in descriptions[photo_id]:
      seq = tokenizer.texts_to_sequences([line])[0]
      for word_id in range(1,len(seq)):
        x2 = seq[:word_id]
        x2 = pad_sequences([x2],maxlen=max_length)[0]
        y = seq[word_id]
        y = to_categorical([y], num_classes=vocab_size)[0]

        X1.append(photos[photo_id][0])
        X2.append(x2)
        Y.append(y)
  return np.array(X1),np.array(X2),np.array(Y)

In [10]:
max_length = get_max_length(train_set.descriptions)
train_set.generate_sequence(max_length,tokenizer,vocab_size)
test_set.generate_sequence(max_length,tokenizer,vocab_size)

Sequences generated for train
Sequences generated for test


In [13]:
def get_model(vocab_size,max_length):
  #image branch
  images_input = Input(shape=(4096,))
  image_layer = Dropout(0.5)(images_input)
  image_layer = Dense(256,activation="relu")(image_layer)

  #caption branch
  captions_input = Input(shape=(max_length,))
  caption_layer = Embedding(vocab_size,256, mask_zero = True)(captions_input)
  caption_layer = Dropout(0.5)(caption_layer)
  caption_layer = LSTM(256)(caption_layer)

  #decoding layer
  decoder = add([image_layer,caption_layer])
  decoder = Dense(256,activation='relu')(decoder)
  outputs = Dense(vocab_size,activation='softmax')(decoder)

  # generating model
  model = Model(inputs=[images_input,captions_input],outputs = outputs)
  model.compile(loss='categorical_crossentropy',optimizer ='adam')

  #summary
  print(model.summary())
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

In [14]:
max_length = get_max_length(train_set.descriptions)
model = get_model(vocab_size,max_length)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 162)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 162, 256)     562944      input_4[0][0]                    
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 4096)         0           input_3[0][0]                    
____________________________________________________________________________________________

In [15]:
filepath = model_path+'model-ep{epoch:03d}-loss{loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [16]:
model.fit(
    [train_set.X1, train_set.X2],train_set.Y,
    epochs=20, verbose=2,
    #callbacks=[checkpoint],
    validation_data=([test_set.X1, test_set.X2], test_set.Y)
)

Train on 25496 samples, validate on 4184 samples
Epoch 1/20
25496/25496 - 115s - loss: 5.3425 - val_loss: 4.5609
Epoch 2/20
25496/25496 - 93s - loss: 4.4130 - val_loss: 3.9000
Epoch 3/20
25496/25496 - 94s - loss: 3.8741 - val_loss: 3.4575
Epoch 4/20
25496/25496 - 94s - loss: 3.4837 - val_loss: 3.0005
Epoch 5/20
25496/25496 - 93s - loss: 3.1486 - val_loss: 2.7159
Epoch 6/20
25496/25496 - 93s - loss: 2.8851 - val_loss: 2.4101
Epoch 7/20
25496/25496 - 93s - loss: 2.6583 - val_loss: 2.1926
Epoch 8/20
25496/25496 - 93s - loss: 2.4548 - val_loss: 2.0220
Epoch 9/20
25496/25496 - 94s - loss: 2.3066 - val_loss: 1.8384
Epoch 10/20
25496/25496 - 94s - loss: 2.1626 - val_loss: 1.7210
Epoch 11/20
25496/25496 - 94s - loss: 2.0378 - val_loss: 1.5612
Epoch 12/20
25496/25496 - 98s - loss: 1.9255 - val_loss: 1.5379
Epoch 13/20
25496/25496 - 94s - loss: 1.8380 - val_loss: 1.3864
Epoch 14/20
25496/25496 - 95s - loss: 1.7452 - val_loss: 1.3327
Epoch 15/20
25496/25496 - 94s - loss: 1.6713 - val_loss: 1.2189

<tensorflow.python.keras.callbacks.History at 0x1e4d80e5208>

In [17]:
model.save_weights(model_path+"/model_vgg_weights.h5")

# Evaluating model and predicting

In [18]:
model = get_model(vocab_size,max_length)
model.load_weights(model_path+"/model_vgg_weights.h5")

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 162)]        0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 4096)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 162, 256)     562944      input_6[0][0]                    
__________________________________________________________________________________________________
dropout_4 (Dropout)             (None, 4096)         0           input_5[0][0]                    
____________________________________________________________________________________________

In [19]:
# map an integer to a word
def word_for_id(integer, tokenizer):
	for word, index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    # seed the generation process
    in_text = 'startseq'

    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo,sequence], verbose=0)
        
        yhat = np.argmax(yhat)
        word = word_for_id(yhat, tokenizer)

        if word is None:
            print("None stop")
            break
        in_text += ' ' + word
        
        if word == 'endseq':
            break
            
    return in_text

In [20]:
desc = generate_desc(model,tokenizer,test_set.features["1191338263_a4fa073154"],max_length)
print(desc)

startseq an old woman sits in subway station endseq


In [25]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import Model


def generate_caption_vgg(img,model,tokenizer,max_length):
    model = VGG16()
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    features = model.predict(img,verbose=0)
    desc = generate_desc(model, tokenizer, features, max_length)
    return desc.replace("startseq","").replace("endseq","")


def generate_caption_resnet(img,model,tokenizer,max_length):
    resnet = ResNet50()
    resnet = Model(inputs = resnet.inputs, outputs = resnet.layers[-2].output)
    features = resnet.predict(img,verbose=0)
    desc = generate_desc(model, tokenizer, features, max_length)
    return desc.replace("startseq","").replace("endseq","")    

In [30]:
filename = valid_path+"antoine.jpg"
base = load_img(filename,target_size=(224,224))
base = img_to_array(base)
image = np.expand_dims(base, axis=0)
image = preprocessor_vgg.preprocess_input(image)
desc = generate_caption_vgg(image,model,tokenizer,max_length)

ValueError: Error when checking input: expected input_11 to have 4 dimensions, but got array with shape (1, 4096)

In [None]:
plt.imshow(base.squeeze())
plt.title(desc)
plt.show()

In [28]:
image.shape

(1, 224, 224, 3)