In [None]:
import os
import pickle
import re

import numpy as np
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [None]:
# Fetch the dataset repository into ./flickr8k. Later cells read
# /content/flickr8k/captions.txt and /content/flickr8k/Images/, so this
# presumably runs with /content as the working directory -- TODO confirm.
!git clone https://github.com/Nguyenhieu277/flickr8k.git

Cloning into 'flickr8k'...
remote: Enumerating objects: 8094, done.[K
remote: Total 8094 (delta 0), reused 0 (delta 0), pack-reused 8094 (from 1)[K
Receiving objects: 100% (8094/8094), 1.03 GiB | 17.31 MiB/s, done.
Updating files: 100% (8092/8092), done.


In [None]:
from tensorflow.keras.applications import DenseNet201


# Full DenseNet201 classifier (pretrained weights are downloaded on first use).
model = DenseNet201()

# Feature extractor: same network, but stopping at the second-to-last layer so
# that predict() returns a feature vector instead of class probabilities.
#
# `model.input` (a single tensor) is used rather than `model.inputs` (a list).
# Building the extractor from a list makes Keras expect list-structured input,
# and it then warns "Expected: ['keras_tensor'] / Received: inputs=Tensor(...)"
# whenever a bare image array is passed to predict().
densenet = Model(inputs=model.input, outputs=model.layers[-2].output)
densenet.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet201_weights_tf_dim_ordering_tf_kernels.h5
[1m82524592/82524592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step


In [None]:
def text_preprocess(text):
    """Normalise a raw caption and wrap it in sequence markers.

    Steps, in order:
      1. lower-case the text;
      2. drop every character that is neither a letter nor whitespace;
      3. collapse runs of whitespace into a single space;
      4. drop one-character words;
      5. surround the result with ``<start>`` / ``<end>``.

    Args:
        text: The raw caption string.

    Returns:
        The cleaned caption, e.g. ``"<start> girl going into wooden building <end>"``.
    """
    text = text.lower()
    # str.replace() treats its argument as a literal string, so the regex
    # patterns must go through re.sub() to have any effect. Whitespace is kept
    # in the character class so that neighbouring words are not glued together.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    # Filter on the length of each *word* (not of the whole caption).
    text = " ".join(word for word in text.split() if len(word) > 1)
    text = '<start> ' + text + ' <end>'
    return text

In [None]:
import pandas as pd

# Captions table shipped with the cloned dataset; later cells use its
# `image` and `caption` columns.
captions_path = '/content/flickr8k/captions.txt'
df = pd.read_csv(captions_path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [None]:
# Clean every caption and wrap it in <start>/<end> markers.
# Note: this overwrites the column, so running the cell a second time would
# wrap the already-wrapped captions again.
df['caption'] = df['caption'].map(text_preprocess)

# Plain Python list of the processed captions, in row order.
captions = df['caption'].tolist()

df.head(n=5)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,<start> a child in a pink dress is climbing up...
1,1000268201_693b08cb0e.jpg,<start> a girl going into a wooden building . ...
2,1000268201_693b08cb0e.jpg,<start> a little girl climbing into a wooden p...
3,1000268201_693b08cb0e.jpg,<start> a little girl climbing the stairs to h...
4,1000268201_693b08cb0e.jpg,<start> a little girl in a pink dress going in...


In [None]:
# Vocabulary is built from every caption (training and held-out rows alike).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)

# One extra slot on top of the learned words -- presumably for index 0, which
# the zero-padded sequences use.
vocab_size = 1 + len(tokenizer.word_index)

# Longest caption, counted in whitespace-separated tokens.
# NOTE(review): the Tokenizer may split/filter differently from str.split(),
# so this is not necessarily the longest *tokenised* sequence -- confirm.
max_length = max(map(lambda caption: len(caption.split()), captions))

# Split by image rather than by row, so that all captions of one image end up
# on the same side of the split: first 90% of images train, the rest validate.
images = df['image'].unique().tolist()
n_imgs = len(images)
split_idx = int(0.9 * n_imgs)
train_images, val_images = images[:split_idx], images[split_idx:]

in_train = df['image'].isin(train_images)
in_val = df['image'].isin(val_images)
train = df[in_train].reset_index(drop=True)
test = df[in_val].reset_index(drop=True)

# Sanity check: show the integer encoding of one caption.
tokenizer.texts_to_sequences([captions[1]])[0]

[3, 1, 19, 316, 64, 1, 196, 117, 2]

In [None]:
img_size = 224
image_dir = '/content/flickr8k/Images/'

# Maps image file name -> array returned by densenet.predict() for that image
# (a batch of one, so the leading axis has length 1).
features = {}
for image_name in tqdm(df['image'].unique().tolist()):
    picture = load_img(image_dir + image_name, target_size=(img_size, img_size))
    # NOTE(review): pixels are only rescaled to [0, 1]; confirm whether the
    # DenseNet-specific preprocess_input() normalisation was intended instead.
    pixels = img_to_array(picture) / 255.
    # Add the batch axis expected by predict().
    single_batch = pixels[np.newaxis, ...]
    features[image_name] = densenet.predict(single_batch, verbose=0)

  0%|          | 0/8091 [00:00<?, ?it/s]

Expected: ['keras_tensor']
Received: inputs=Tensor(shape=(1, 224, 224, 3))


In [None]:
# Persist the extracted features. The `with` block guarantees the file is
# flushed and closed before it is read back below (the previous bare open()
# handle was never closed explicitly).
with open('features.pkl', 'wb') as file:
    pickle.dump(features, file)

# Reload from disk so the rest of the notebook works on exactly what was saved.
# features.pkl is produced by this notebook; do not unpickle untrusted files.
with open('features.pkl', 'rb') as file:
    features = pickle.load(file)

In [None]:
def data_generator(df, X_col, y_col, max_length, tokenizer, vocab_size, batch_size, features, shuffle=True):
    """Yield ``((image_features, input_sequences), next_word_one_hot)`` batches forever.

    Every row of ``df`` is one (image, caption) pair. A caption whose token
    sequence is ``[t0, t1, ..., tk]`` is expanded into ``k`` samples: the
    prefix ``[t0 .. t(i-1)]`` (zero-padded on the right to ``max_length``) is
    the input and ``t(i)`` (one-hot over ``vocab_size``) is the target.

    Args:
        df: DataFrame with one row per image/caption pair.
        X_col: Name of the column holding the image key used to index ``features``.
        y_col: Name of the column holding the caption text.
        max_length: Length every input sequence is padded (or truncated) to.
        tokenizer: Object exposing ``texts_to_sequences``.
        vocab_size: Number of classes for the one-hot targets.
        batch_size: Number of DataFrame rows (not samples) consumed per batch.
        features: Mapping image key -> array whose first element is the
            feature vector for that image.
        shuffle: Reshuffle the rows at the start of every pass over ``df``.

    Yields:
        ``((X1, X2), y)`` as NumPy arrays with one row per generated sample.

    Raises:
        ValueError: If ``df`` is empty or ``batch_size`` is not positive
            (either would otherwise loop forever without yielding).
        KeyError: If an image key is missing from ``features``.
    """
    n = len(df)
    if n == 0:
        raise ValueError("data_generator received an empty DataFrame")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    while True:
        if shuffle:
            # New row order for every pass; rebinding the local name keeps the
            # caller's DataFrame untouched.
            df = df.sample(frac=1).reset_index(drop=True)

        for index in range(0, n, batch_size):
            batch = df.iloc[index:index + batch_size]
            X1, X2, y = [], [], []

            # Walk the rows directly so each caption is expanded exactly once.
            # (Looking captions up by image name repeated the work whenever an
            # image occurred more than once in the same batch.)
            for image, caption in zip(batch[X_col], batch[y_col]):
                feature = features[image][0]
                seq = tokenizer.texts_to_sequences([caption])[0]

                # Iterate over the real tokens only: looping over the padded
                # sequence produced extra samples whose target was the
                # padding index 0.
                for i in range(1, len(seq)):
                    in_seq = pad_sequences([seq[:i]], maxlen=max_length, padding='post')[0]
                    out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]

                    X1.append(feature)
                    X2.append(in_seq)
                    y.append(out_seq)

            X1, X2, y = np.array(X1), np.array(X2), np.array(y)
            yield (X1, X2), y

In [None]:
# Sizes shared by the layers below.
feature_dim = 1920   # length of the image feature vector fed to the 'image' input
hidden_units = 256
dropout_rate = 0.5

# Two inputs: the image feature vector and the partial caption (token ids).
input1 = Input(shape=(feature_dim,), name='image')
input2 = Input(shape=(max_length,), name='text')

# Image branch: regularise, then project down to the shared hidden size.
image_branch = Dropout(dropout_rate)(input1)
image_branch = Dense(hidden_units, activation='relu')(image_branch)

# Text branch: embed the token ids (index 0 is masked) and summarise the
# sequence with an LSTM.
text_branch = Embedding(vocab_size, hidden_units, mask_zero=True)(input2)
text_branch = Dropout(dropout_rate)(text_branch)
text_branch = LSTM(hidden_units)(text_branch)

# Decoder: element-wise sum of both branches, then predict the next word.
merged = add([image_branch, text_branch])
merged = Dense(hidden_units, activation='relu')(merged)
merged = Dropout(dropout_rate)(merged)
outputs = Dense(vocab_size, activation='softmax')(merged)

model = Model(inputs=[input1, input2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

In [None]:
# Number of DataFrame rows each generator consumes per step.
batch_size = 64

train_generator = data_generator(
    train, 'image', 'caption', max_length, tokenizer, vocab_size, batch_size, features
)
val_generator = data_generator(
    test, 'image', 'caption', max_length, tokenizer, vocab_size, batch_size, features
)

# The generators never stop on their own, so the number of steps per epoch is
# given explicitly (whole batches only).
model.fit(
    train_generator,
    epochs=10,
    steps_per_epoch=len(train) // batch_size,
    validation_data=val_generator,
    validation_steps=len(test) // batch_size,
)

Epoch 1/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 444ms/step - loss: 2.0263 - val_loss: 1.1575
Epoch 2/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 461ms/step - loss: 1.1570 - val_loss: 1.0502
Epoch 3/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 461ms/step - loss: 1.0518 - val_loss: 1.0076
Epoch 4/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 431ms/step - loss: 0.9950 - val_loss: 0.9738
Epoch 5/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 430ms/step - loss: 0.9601 - val_loss: 0.9632
Epoch 6/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 462ms/step - loss: 0.9343 - val_loss: 0.9565
Epoch 7/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 462ms/step - loss: 0.9079 - val_loss: 0.9531
Epoch 8/10
[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 462ms/step - loss: 0.8857 - val_loss: 0.9342
Epoch 9/

<keras.src.callbacks.history.History at 0x7a4188e099d0>

In [None]:
import pickle

# Serialise the fitted tokenizer so the same word-to-index mapping can be
# reloaded alongside the model.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Write the trained captioning model to a .keras file.
model.save('caption_imgs.keras')