In [None]:
# Download and unpack the Flickr8k image archive.
!wget https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip
!unzip Flickr8k_Dataset.zip
# Download and unpack the Flickr8k text archive.
!wget https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip
!unzip Flickr8k_text.zip
# Download and decompress the 50-dimensional GloVe 6B word vectors.
!wget https://github.com/uclnlp/inferbeddings/raw/master/data/glove/glove.6B.50d.txt.gz
!gunzip glove.6B.50d.txt.gz

In [35]:
# Install OpenCV; the notebook imports it as `cv2` to read and resize images.
!pip install opencv-contrib-python



In [92]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import re
import json
from sklearn.model_selection import train_test_split


In [30]:
# Directory that holds the image files, and the names of all entries inside it.
dataset_path = "/content/Flicker8k_Dataset"
files = os.listdir(dataset_path)

The Flickr8k dataset contains **8091** images. We resize all of them to **224x224** pixels before starting.

In [46]:
# Load every readable image as a 224x224 RGB array. `images` and `names` are
# kept index-aligned so a file name can later be mapped back to its pixels.
images = []
names = []

for file in files:
    image = cv2.imread(os.path.join(dataset_path, file))
    if image is None:
        # cv2.imread signals an unreadable / non-image entry by returning None
        # rather than raising; skip it instead of crashing in cvtColor.
        continue
    # OpenCV decodes to BGR channel order; convert to RGB before resizing.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (224, 224))
    images.append(image)
    names.append(file)

In [50]:
# Load the caption table. Later cells read its 'image' and 'description' columns.
# NOTE(review): no earlier cell creates sentences.csv — confirm where it comes from.
df = pd.read_csv("/content/sentences.csv")

In [76]:
def preprocess(sentence):
    """Normalize a caption into lowercase alphabetic words.

    Every character that is not an ASCII letter, digit or whitespace is
    removed. The remaining text is split on whitespace and lowercased, and
    tokens that are a single character or not purely alphabetic are dropped.

    Args:
        sentence: Raw caption text. Non-string values (for example the NaN
            pandas produces for an empty cell) are treated as an empty caption.

    Returns:
        The surviving words joined by single spaces, with no leading or
        trailing whitespace. An empty string when no word survives.
    """
    if not isinstance(sentence, str):
        return ""

    # The range must be `A-Z`. `A-z` also spans the ASCII characters between
    # 'Z' and 'a' ([ \ ] ^ _ `), which would then survive this step and cause
    # the whole word to be discarded by the isalpha() filter below.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', sentence)

    words = [word.lower() for word in cleaned.split()]
    kept = [word for word in words if len(word) > 1 and word.isalpha()]

    return " ".join(kept)

In [77]:
# Replace each raw caption with its cleaned form produced by preprocess().
df['description'] = df['description'].apply(preprocess)

Creating a **dictionary** keyed by image file name; each value holds the image and its list of descriptions.

In [78]:
# Group the cleaned captions by image file name:
#     mapping[file_name] = {'desc': [caption, caption, ...]}
mapping = {}
# Walk the two columns directly. Indexing with df.iloc[idx][...] builds a full
# row object on every access, which is needlessly slow for a per-row loop.
for image_name, caption in zip(df['image'], df['description']):
    mapping.setdefault(image_name, {'desc': []})['desc'].append(caption)

In [79]:
# Attach the loaded pixels to each caption entry, and drop caption entries
# whose image file was not loaded.
# Build the name -> position lookup once instead of scanning `names` per key.
name_to_index = {}
for position, name in enumerate(names):
    # setdefault keeps the first occurrence, matching list.index().
    name_to_index.setdefault(name, position)

# Snapshot the keys because entries are removed from `mapping` while looping.
keys = list(mapping.keys())
for key in keys:
    ind = name_to_index.get(key)
    if ind is None:
        # No image with this file name was loaded; discard its captions.
        mapping.pop(key, None)
    else:
        mapping[key]['image'] = images[ind]

In [80]:
# Split the mapping into two parallel lists (same order as the mapping):
# one with each image array, one with that image's list of captions.
input_images = [entry['image'] for entry in mapping.values()]
input_descs = [entry['desc'] for entry in mapping.values()]

Creating a word-frequency **dictionary** that also assigns each word an integer index

In [81]:
# Build the vocabulary: word_map[word] = {'idx': <int>, 'freq': <count>}.
# Indices start at 0 and are handed out in first-seen order, so a new word's
# index is simply the number of words registered before it.
word_map = {}
for description in input_descs:
    for sentence in description:
        for token in sentence.split():
            entry = word_map.get(token)
            if entry is None:
                word_map[token] = {'idx': len(word_map), 'freq': 1}
            else:
                entry['freq'] += 1
# Next free index, i.e. the vocabulary size.
word_idx = len(word_map)

In [86]:
# Persist the vocabulary (index and frequency per word) as JSON. The context
# manager closes the file, which guarantees the data is flushed to disk.
with open("/content/word_freq.json", "w") as out_file:
    json.dump(word_map, out_file)

In [89]:
# Encode every caption as a list of vocabulary indices. The nesting of
# input_descs is preserved: images -> captions -> word indices.
new_input_descs = [
    [
        [word_map[token]['idx'] for token in sentence.split()]
        for sentence in description
    ]
    for description in input_descs
]

Train and test data split

In [93]:
# Split images (X) and their encoded captions (Y) into 80% train / 20% test.
# random_state is fixed so the same split is produced on every run; without it
# results change each time the notebook is re-executed.
X_train, X_test, Y_train, Y_test = train_test_split(
    input_images, new_input_descs, test_size = 0.2, random_state = 42
)

Import **Tensorflow** NN layers

In [109]:
from tensorflow.keras.layers import Embedding,Dense, Activation, MaxPool1D,Input, LSTM, Dropout, Input,Activation,add,MaxPooling2D,Conv2D, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from keras.utils.vis_utils import plot_model

Creating the embedding matrix for the word vectors. For the embeddings I have used **GloVe 6B** from Stanford NLP.

In [127]:
# Number of components kept per word vector.
embedding_dim = 50
# One more row than there are distinct words in word_map.
# NOTE(review): the extra row is presumably meant for a padding index, but
# word_map indices start at 0, so row len(word_map) is never assigned — confirm.
vocab_length = len(word_map) + 1
# All rows start as zeros; rows of known words are overwritten later.
embedding_mat = np.zeros((vocab_length, embedding_dim))

In [128]:
# Copy the pretrained vector of every vocabulary word into its row of
# embedding_mat. Words that never appear in the GloVe file keep their zero row.
# The encoding is given explicitly so that reading does not depend on the
# platform's default text encoding.
with open('/content/glove.6B.50d.txt', encoding='utf-8') as g_file:
    for line in g_file:
        parts = line.split()
        if not parts:
            # Blank line: nothing to unpack.
            continue
        word, *embedding = parts
        entry = word_map.get(word)
        if entry is not None:
            embedding_mat[entry['idx']] = np.array(embedding, dtype="float32")[:embedding_dim]

In [149]:
def Convolution(input_tensor,filters):
    """Apply a 3x3 convolution, then dropout, then ReLU to `input_tensor`.

    Args:
        input_tensor: Tensor fed into the convolution layer.
        filters: Number of convolution filters.

    Returns:
        The tensor produced by the ReLU activation.
    """
    conv_layer = Conv2D(
        filters = filters,
        kernel_size = (3, 3),
        strides = (1, 1),
        padding = 'same',
        kernel_regularizer = l2(0.003),
    )
    dropout_layer = Dropout(0.1)
    relu_layer = Activation('relu')

    # Same order as before: convolution -> dropout -> activation.
    return relu_layer(dropout_layer(conv_layer(input_tensor)))

def create_model(input_shape, vocab_length, embedding_mat, embedding_dim,
                 max_len = 25, hidden_units = 128):
    """Build and compile the two-input (image + word sequence) model.

    Image branch: two Convolution + 2x2 max-pooling stages, flattened and
    projected to `hidden_units` features.
    Language branch: a frozen embedding initialised from `embedding_mat`,
    dropout, and an LSTM with `hidden_units` units that returns its full
    sequence.
    The two branches are merged with `add`, then passed through a 256-unit
    ReLU layer and a softmax layer of size `vocab_length`.

    Args:
        input_shape: Shape of one input image, e.g. (224, 224, 3).
        vocab_length: Number of rows of the embedding matrix and size of the
            softmax output.
        embedding_mat: Pretrained embedding weights of shape
            (vocab_length, embedding_dim).
        embedding_dim: Width of each embedding vector.
        max_len: Length of the word-index input sequence. Defaults to 25, the
            value that was previously hard-coded.
        hidden_units: Width shared by the image projection and the LSTM. One
            parameter drives both because they are added element-wise and so
            must match. Defaults to 128, the previously hard-coded value.

    Returns:
        The compiled Keras Model taking [image, word_sequence] as inputs.
    """
    # Image branch
    inputs_images = Input(input_shape)
    conv_1 = Convolution(inputs_images, 32)
    maxp_1 = MaxPooling2D(pool_size = (2, 2))(conv_1)
    conv_2 = Convolution(maxp_1, 64)
    maxp_2 = MaxPooling2D(pool_size = (2, 2))(conv_2)
    flatten = Flatten()(maxp_2)
    dense_1 = Dense(hidden_units, activation = 'relu')(flatten)

    # Language branch; the embedding is frozen so the pretrained vectors stay
    # unchanged during training.
    inputs_language = Input(shape = (max_len,))
    emb1 = Embedding(input_dim = vocab_length, output_dim = embedding_dim,
                     weights = [embedding_mat], trainable = False)(inputs_language)
    dr1 = Dropout(0.2)(emb1)
    lstm1 = LSTM(hidden_units, return_sequences = True)(dr1)

    # Merge both branches and predict a distribution over the vocabulary.
    out_1 = add([dense_1, lstm1])
    out_2 = Dense(256, activation = 'relu')(out_1)
    output = Dense(vocab_length, activation = 'softmax')(out_2)

    model = Model(inputs = [inputs_images, inputs_language], outputs = [output])

    # TODO: the loss function still needs to be revisited (the original note
    # here was left unfinished).
    model.compile(loss = "categorical_crossentropy", optimizer = "Adam")

    return model

In [150]:
# Build the model for 224x224 three-channel images, matching the resized inputs.
model = create_model((224, 224, 3), vocab_length, embedding_mat, embedding_dim)

In [151]:
# Print the per-layer table of output shapes and parameter counts.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_28 (InputLayer)           [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv2d_30 (Conv2D)              (None, 224, 224, 32) 896         input_28[0][0]                   
__________________________________________________________________________________________________
dropout_43 (Dropout)            (None, 224, 224, 32) 0           conv2d_30[0][0]                  
__________________________________________________________________________________________________
activation_30 (Activation)      (None, 224, 224, 32) 0           dropout_43[0][0]                 
____________________________________________________________________________________________

In [None]:
# Plot the model architecture to a PNG file, including tensor shapes and layer names.
plot_model(model, to_file = '/content/model_plot.png', show_shapes = True, show_layer_names = True)