In [None]:
import os
import pickle
import re

import numpy as np

from tqdm.notebook import tqdm

from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input

In [None]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [None]:
# Root folder of the dataset; the notebook reads `images/` and `captions.txt` from it.
BASE_DIR = 'dataset'

## Extract image features

In [None]:
# Load the VGG16 model (default constructor arguments).
model = VGG16()
# Restructure the model: keep the same inputs but stop at the second-to-last
# layer, so the network outputs the features that feed the final classification
# layer instead of the class predictions themselves.
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# Summarize. `summary()` prints the table itself and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
model.summary()

 ### outputs=model.layers[-1].output
 fc1 (Dense)                 (None, 4096)              102764544 
                                                                 
 fc2 (Dense)                 (None, 4096)              16781312  
                                                                 
 predictions (Dense)         (None, 1000)              4097000  
 We don't need the `predictions` layer, so we use `outputs=model.layers[-2].output` (the 4096-dimensional `fc2` layer) as the image features.

In [None]:
# Extract one feature array per image, keyed by the image id (file name up to
# the first dot).
# NOTE: `model` must still be the VGG16 feature extractor here; the model
# creation cell further down rebinds the name `model`.
features = {}
directory = os.path.join(BASE_DIR, 'images')

# sorted() makes the processing order deterministic across platforms.
for img_name in tqdm(sorted(os.listdir(directory))):
    img_path = os.path.join(directory, img_name)
    # Skip hidden entries and sub-directories (e.g. `.ipynb_checkpoints`,
    # `.DS_Store`): they are not images and would make load_img fail.
    if img_name.startswith('.') or not os.path.isfile(img_path):
        continue
    # Load the image from file, resized to 224x224.
    image = load_img(img_path, target_size=(224, 224))
    # Convert image pixels to a numpy array.
    image = img_to_array(image)
    # Add a leading batch axis: (H, W, C) -> (1, H, W, C).
    image = np.expand_dims(image, axis=0)
    # Apply the VGG16 input preprocessing.
    image = preprocess_input(image)
    # Extract features.
    feature = model.predict(image, verbose=0)
    # Image id: "xxxx.jpg" -> "xxxx" (same rule used when parsing captions).
    image_id = img_name.split('.')[0]
    features[image_id] = feature
    

In [None]:
# Folder for cached artifacts (e.g. features.pkl). It is created up front so
# that writing into it does not fail on a fresh checkout.
WORKING_DIR = 'working'
os.makedirs(WORKING_DIR, exist_ok=True)

In [None]:
# Store the extracted features in a pickle file. The `with` block guarantees
# the file handle is flushed and closed even if pickling raises.
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb') as f:
    pickle.dump(features, f)

In [None]:
# Reload the cached image features from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
features_path = os.path.join(WORKING_DIR, 'features.pkl')
with open(features_path, 'rb') as f:
    features = pickle.load(f)

## Load the captions data

In [None]:
# Read the whole captions file into one string, minus its first line.
captions_path = os.path.join(BASE_DIR, 'captions.txt')
with open(captions_path, 'r') as f:
    next(f)  # skip the first line (presumably a column header -- TODO confirm)
    captions_doc = f.read()

In [None]:
# Preview only the beginning of the captions text; displaying the whole string
# would flood the cell output and bloat the saved notebook.
captions_doc[:500]

In [None]:
# Create the mapping: image id -> list of captions for that image.
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # Skip blank / too-short lines (e.g. the trailing empty line).
    if len(line) < 2:
        continue
    # Split on the FIRST comma only: the caption itself may contain commas, and
    # splitting on every comma then joining with '' would silently drop them.
    image_name, separator, caption = line.partition(',')
    if not separator:
        # No comma at all -> no caption on this line; nothing to store.
        continue
    # Remove the extension from the image id ("xxxx.jpg" -> "xxxx").
    image_id = image_name.split('.')[0]
    # Create the list on first sight of this image, then store the caption.
    mapping.setdefault(image_id, []).append(caption)
    

In [None]:
# Number of distinct image ids that have captions (sanity check that all images are covered)
len(mapping)

## Preprocess text data

In [None]:
def clean(mapping):
    """Normalize every caption in ``mapping`` in place.

    For each caption the text is lower-cased, every character that is not an
    ASCII letter or whitespace is deleted, runs of whitespace are collapsed,
    one-character words are dropped, and the result is wrapped in ``<start>``
    and ``<end>`` tags separated from the words by single spaces.

    Args:
        mapping: dict of image id -> list of caption strings. The lists are
            modified in place.

    Returns:
        None.

    Note:
        Not idempotent: calling it again on already-cleaned captions strips the
        angle brackets of the tags and wraps the text a second time.
    """
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            # Convert to lowercase.
            caption = caption.lower()
            # Delete digits, punctuation and other special characters. This
            # must be a real regex substitution: str.replace() treats the
            # pattern as literal text and would change nothing.
            caption = re.sub(r'[^a-z\s]', '', caption)
            # split() collapses any run of whitespace; drop 1-letter words.
            words = [word for word in caption.split() if len(word) > 1]
            # Tags are space-separated so they count as their own words.
            captions[i] = ' '.join(['<start>', *words, '<end>'])

In [None]:
# Captions of one sample image before text preprocessing
mapping['1000268201_693b08cb0e']

In [None]:
# Preprocess the text: clean() rewrites the caption lists inside `mapping` in place
clean(mapping)

In [None]:
# Captions of the same sample image after text preprocessing
mapping['1000268201_693b08cb0e']

In [None]:
# Flatten the per-image caption lists into one list, in mapping order.
all_captions = [
    caption
    for key in mapping
    for caption in mapping[key]
]

In [None]:
# Total number of captions across all images
len(all_captions)

In [None]:
# Peek at the first ten captions
all_captions[:10]

In [None]:
# Fit a tokenizer on all captions to build the word -> integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of known words -- presumably to leave index 0 free
# for padding (the Embedding layer below is built with mask_zero=True).
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Size of the vocabulary used for the embedding input and softmax output
vocab_size

In [None]:
# Get the maximum caption length, measured in tokenizer tokens.
# The generator pads the sequences produced by the tokenizer, so the length has
# to be counted on those sequences: a whitespace split can disagree with the
# tokenizer (e.g. tags or punctuation attached to a word).
max_length = max(len(seq) for seq in tokenizer.texts_to_sequences(all_captions))
max_length

## Train/test split

In [None]:
# Split the image ids by position: first 90% for training, the rest for testing.
image_ids = list(mapping)
split = int(len(image_ids) * 0.90)
train, test = image_ids[:split], image_ids[split:]

In [None]:
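#Each caption is expanded into (input prefix X, next word y) training pairs.
#Example caption: <start>girl going into wooden building<end>
#        X (input so far)                    y (word to predict)
#<start>                                     girl
#<start>girl                                 going
#<start>girl going                           into
#.......
#<start>girl going into wooden building      <end>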

In [None]:
#create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Yield ``([X1, X2], y)`` training batches forever.

    Every caption of every image is tokenized and expanded into
    (prefix, next word) pairs. A batch is emitted each time ``batch_size``
    images have been processed, so the number of rows per batch depends on how
    many captions and words those images have. Images left over at the end of
    one pass over ``data_keys`` roll into the first batch of the next pass.

    Args:
        data_keys: sequence of image ids to iterate over (repeatedly).
        mapping: dict of image id -> list of caption strings.
        features: dict of image id -> feature array; row ``[0]`` is used.
        tokenizer: object exposing ``texts_to_sequences``.
        max_length: length every input prefix is padded to.
        vocab_size: number of classes for the one-hot encoded target word.
        batch_size: number of images per emitted batch.

    Yields:
        ``[X1, X2], y`` where X1 holds the image features, X2 the padded input
        prefixes and y the one-hot encoded next words, all as numpy arrays.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 or ``data_keys`` is
            empty (either would loop forever without yielding anything).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if len(data_keys) == 0:
        raise ValueError('data_keys must not be empty')

    X1, X2, y = [], [], []
    n = 0  # images accumulated in the current batch
    while True:
        for key in data_keys:
            n += 1
            #process each caption
            for caption in mapping[key]:
                #encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                #split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    #input is everything before position i, output is word i
                    in_seq, out_seq = seq[:i], seq[i]
                    #pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    #one-hot encode output word
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    #store the sequences
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                yield [np.array(X1), np.array(X2)], np.array(y)
                #start a fresh batch
                X1, X2, y = [], [], []
                n = 0
                
                

## Model Creation

In [None]:
from tensorflow.keras.utils import plot_model

In [None]:
# ---- Encoder ----
# Image branch: 4096-long feature vector -> dropout -> 256-unit dense layer.
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: padded word-index sequence -> embedding -> dropout -> LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# ---- Decoder ----
# Merge both 256-wide branches by element-wise addition, then predict the next
# word as a softmax over the vocabulary.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE: this rebinds `model`, replacing the VGG16 feature extractor defined
# near the top of the notebook.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Plot the model graph with tensor shapes.
plot_model(model, show_shapes=True)