In [13]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding,Dropout, add
base_model = VGG16(weights='imagenet')
vgg=Model(inputs=base_model.input, outputs=base_model.layers[-2].output)

In [14]:
BASE_DIR='Downloads/data'
i=0
IMG_DIR=os.path.join(BASE_DIR, 'Images')
features={}
for img in os.listdir(IMG_DIR):
    if i==100:
        break
    i+=1
    image=load_img(os.path.join(IMG_DIR,img), target_size=(224,224))
    image=preprocess_input(np.expand_dims(img_to_array(image),axis=0))
    features[img.split('.')[0]] = vgg.predict(image,verbose=0)

In [15]:
mapping={}
i=0
for line in open(os.path.join(BASE_DIR,'captions.txt')):
    if i==100:
        break
    i+=1
    parts=line.strip().split(',')
    if len(parts)<2:
        continue
    img,caption = parts[0].split('.')[0], 'startseq'+' '.join(parts[1:]).lower()+'endseq'
    mapping.setdefault(img,[]).append(caption)

In [17]:
all_caps = [c for caps in mapping.values() for c in caps]
tok=Tokenizer()
tok.fit_on_texts(all_caps)
vocab,maxlen = len(tok.word_index)+1, max(len(c.split()) for c in all_caps)



In [42]:
in1 = Input(shape=(4096,))
x1=Dropout(0.4)(in1)
x1=Dense(256,activation='relu')(x1)

in2=Input(shape=(maxlen,))
x2=Embedding(vocab,256,mask_zero=True)(in2)
x2=Dropout(0.4)(x2)
x2=LSTM(256)(x2)
merged=add([x1,x2])
merged=Dense(256,activation='relu')(merged)
output=Dense(vocab,activation='softmax')(merged)

model=Model(inputs=[in1,in2],outputs=output)
model.compile(loss='categorical_crossentropy',optimizer='adam')

dummy_img = np.zeros((1, 4096), dtype='float32')
dummy_seq = np.zeros((1, maxlen), dtype='int32')
model([dummy_img, dummy_seq])

<tf.Tensor: shape=(1, 290), dtype=float32, numpy=
array([[0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828,
        0.00344828, 0.00344828, 0.00344828, 0.00344828, 0.00344828

In [43]:
def gen(keys):
    while True:
        for k in keys:
            for cap in mapping[k]:
                seq = tok.texts_to_sequences([cap])[0]
                if len(seq) < 2:
                    continue
                for i in range(1, len(seq)):
                    x1 = features[k][0]
                    x2 = pad_sequences([seq[:i]], maxlen=maxlen)[0]
                    y = tf.keras.utils.to_categorical([seq[i]], num_classes=vocab)
                    x1 = np.expand_dims(x1, axis=0)
                    x2 = np.expand_dims(x2, axis=0)
                    yield [x1, x2], y

train = list(mapping.keys())[:10]

model.fit(
    gen(train),
    steps_per_epoch=len(train),
    epochs=50,
    verbose=1
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x209903e4e20>

In [44]:
def predict_caption(feat):
    text = 'startseq'
    for _ in range(maxlen):
        seq = pad_sequences([tok.texts_to_sequences([text])[0]], maxlen=maxlen)
        yhat = np.argmax(model.predict([feat, seq], verbose=0))
        word = next((w for w, i in tok.word_index.items() if i == yhat), None)
        if word is None or word == 'endseq': break
        text += ' ' + word
    return text.replace('startseq', '').replace('endseq', '').strip()

# === Test ===
img = list(features.keys())[0]
print("Generated caption:", predict_caption(features[img]))


Generated caption: in in in in
