In [None]:
# Mount Google Drive at /content/drive; the dataset and working
# directories configured below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pickle
import re
from concurrent.futures import ProcessPoolExecutor  # For parallel processing

import numpy as np
from tensorflow.keras.applications.vgg19 import VGG19, preprocess_input
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, plot_model
from tqdm.notebook import tqdm

In [None]:
# Dataset root: later cells read the 'Images' folder and 'captions.txt' from here.
BASE_DIR = '/content/drive/MyDrive/COLAB/Image_Caption_Generator/Dataset'  # Update this to your own dataset directory
# Working folder for saved artifacts (features.pkl, tokenizer.pkl, best_model.h5).
WORKING_DIR = '/content/drive/MyDrive/COLAB/Image_Caption_Generator/Working'  # Update this to your own working directory

In [None]:
# Extract Image Features
# Load VGG19 from the weights file kept in the working directory
# (path is derived from WORKING_DIR instead of being repeated as a literal)
model = VGG19(weights=os.path.join(WORKING_DIR, 'vgg19_weights_tf_dim_ordering_tf_kernels.h5'))

# Restructure model: expose the second-to-last layer as the output so the
# network returns that layer's activations, which serve as image features
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# Show the model summary. summary() prints the table itself and returns
# None, so it must not be wrapped in print() (that emits a stray "None").
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [None]:
# Build `features`: image ID -> VGG19 feature array for every dataset image.
# Running the network over the whole dataset is very slow, so the result is
# cached in WORKING_DIR/features.pkl and reused on later runs.
features_path = os.path.join(WORKING_DIR, 'features.pkl')
directory = os.path.join(BASE_DIR, 'Images')

if os.path.exists(features_path):
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook produced.
    with open(features_path, 'rb') as f:
        features = pickle.load(f)
else:
    features = {}
    # sorted() makes the processing order reproducible across runs
    for img_name in tqdm(sorted(os.listdir(directory))):
        img_path = os.path.join(directory, img_name)
        # Skip sub-folders and other non-file entries
        if not os.path.isfile(img_path):
            continue
        # Load the image from file, resized to the network's 224x224 input
        image = load_img(img_path, target_size=(224, 224))
        # Convert image pixels to a numpy array
        image = img_to_array(image)
        # Add a leading batch axis of size 1 for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # Preprocess image for VGG19
        image = preprocess_input(image)
        # Extract features
        feature = model.predict(image, verbose=0)
        # Image ID is the file name up to the first dot (the captions
        # mapping derives its keys the same way)
        image_id = img_name.split('.')[0]
        features[image_id] = feature

    # Persist the result so the extraction never has to be repeated
    with open(features_path, 'wb') as f:
        pickle.dump(features, f)


  0%|          | 0/8091 [00:00<?, ?it/s]

In [None]:
# Store features in pickle
# with open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb') as f:
    # pickle.dump(features, f)

In [None]:
# Load the cached image features (image ID -> feature array) from pickle.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a features.pkl that you produced yourself.
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'rb') as f:
    features = pickle.load(f)

In [None]:
# Sanity check: number of image IDs that have stored features
len(features)

8091

In [None]:
# Read the captions file, discarding its first (header) line
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    next(f)
    captions_doc = f.read()

# Build `mapping`: image ID -> list of caption strings for that image
mapping = {}
for line in captions_doc.split('\n'):
    # Ignore blank / one-character lines
    if len(line) < 2:
        continue
    # First comma-separated field is the image file name; the rest is the caption
    tokens = line.split(',')
    # Image ID is the file name up to the first dot
    image_id = tokens[0].split('.')[0]
    # Re-join caption pieces that were split apart on commas
    caption = " ".join(tokens[1:])
    # Start a new list on first sight of an image, then record the caption
    mapping.setdefault(image_id, []).append(caption)

In [None]:
# Preprocess Text Data
def clean(captions_dict):
    """Normalise every caption in ``captions_dict`` in place.

    Each caption is lower-cased, stripped of every character that is not a
    letter or whitespace (digits, punctuation, ...), reduced to its words
    longer than one character, and wrapped in ``startseq`` / ``endseq``
    markers.

    Args:
        captions_dict: dict mapping an image ID to a list of caption
            strings. The lists are modified in place.

    Returns:
        None.

    Note:
        The markers are added on every call, so calling this twice on the
        same dict wraps each caption twice.
    """
    for captions in captions_dict.values():
        for i, caption in enumerate(captions):
            # Convert to lowercase
            caption = caption.lower()
            # Delete digits, special chars, etc. This needs a real regex
            # substitution: str.replace() would look for the pattern as
            # literal text and change nothing. Whitespace is kept so that
            # words stay separated.
            caption = re.sub(r'[^a-z\s]', '', caption)
            # split() also collapses runs of whitespace; drop one-letter words
            words = [word for word in caption.split() if len(word) > 1]
            # Add start and end tags to the caption
            captions[i] = 'startseq ' + " ".join(words) + ' endseq'


In [None]:
# Show the captions of one sample image before preprocessing
print(mapping['1000268201_693b08cb0e'])

# Preprocess the text. clean() rewrites the caption lists inside `mapping`
# in place, so run this cell only once per freshly built `mapping`.
clean(mapping)

# Show the same captions after preprocessing
print(mapping['1000268201_693b08cb0e'])

['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .']
['startseq child in pink dress is climbing up set of stairs in an entry way endseq', 'startseq girl going into wooden building endseq', 'startseq little girl climbing into wooden playhouse endseq', 'startseq little girl climbing the stairs to her playhouse endseq', 'startseq little girl in pink dress going into wooden cabin endseq']


In [None]:
# Flatten the per-image caption lists into a single list of captions
all_captions = [caption for captions in mapping.values() for caption in captions]

# Processing of Text Data
# Fit the tokenizer on every preprocessed caption
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of distinct words the tokenizer indexed
vocab_size = 1 + len(tokenizer.word_index)
print("Vocabulary Size:", vocab_size)

# Longest caption, measured in whitespace-separated words
max_length = max(map(lambda text: len(text.split()), all_captions))
print("Maximum Caption Length:", max_length)

Vocabulary Size: 8485
Maximum Caption Length: 35


In [None]:
# Store the fitted tokenizer in a pickle file so it can be reloaded later
# together with the trained model
with open(os.path.join(WORKING_DIR, 'tokenizer.pkl'), 'wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
# Train Test Split
# Positional 90/10 split of the image IDs, in the order they appear in
# `mapping` (no shuffling): first 90% for training, the rest for testing
image_ids = [*mapping]
split = int(0.90 * len(image_ids))
train, test = image_ids[:split], image_ids[split:]

In [None]:
# Report the size of the training and test sets, one per line
print(len(train), len(test), sep='\n')

7281
810


In [None]:
# Create a data generator to get data in batches (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches for the captioning model.

    Every caption of every image is expanded into one sample per predicted
    word: the inputs are the image feature and the caption prefix (padded
    to ``max_length``), the target is the one-hot encoded next word.

    Args:
        data_keys: image IDs to iterate over (re-iterated on every pass).
        mapping: dict of image ID -> list of preprocessed captions.
        features: dict of image ID -> feature array; ``features[key][0]``
            is used as the image input.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        max_length: length every input word sequence is padded to.
        vocab_size: number of classes for the one-hot targets.
        batch_size: number of *images* (not samples) per yielded batch.

    Yields:
        ``((X1, X2), y)`` where ``X1`` holds image features, ``X2`` the
        padded prefixes and ``y`` the one-hot next words, all numpy arrays
        with one row per sample.

    Raises:
        ValueError: if a full pass over ``data_keys`` yields no sample at
            all (otherwise the generator would spin forever).
    """
    while True:
        X1, X2, y = [], [], []
        n = 0  # images accumulated in the current batch
        produced = False  # did this pass yield anything?
        for key in data_keys:
            n += 1
            captions = mapping[key]
            # Process each caption
            for caption in captions:
                # Encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # Split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    # Prefix of i words -> word i
                    in_seq, out_seq = seq[:i], seq[i]
                    # Pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # Encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # Store the sequences
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            # Flush only after ALL captions of the image are processed, so
            # an image's samples are never split across two batches
            if n == batch_size:
                if X1:
                    # Tuples are the (inputs, targets) form Keras documents
                    yield (np.array(X1), np.array(X2)), np.array(y)
                    produced = True
                X1, X2, y = [], [], []
                n = 0
        # Emit the trailing partial batch instead of silently dropping it
        if X1:
            yield (np.array(X1), np.array(X2)), np.array(y)
            produced = True
        if not produced:
            raise ValueError("data_generator produced no samples: "
                             "data_keys is empty or none of its captions yield training pairs")

In [None]:
# Model Creation
# Two input branches (image feature, partial caption) are merged and used
# to predict the next word of the caption.
# Encoder model
# Image feature layers: one 4096-wide vector per sample
# NOTE(review): 4096 presumably matches the width of the stored VGG19 features — confirm
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Sequence feature layers: a max_length-long sequence of word indices
inputs2 = Input(shape=(max_length,))
# mask_zero=True treats index 0 as a masked value
# (presumably the padding added by pad_sequences — confirm)
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder model: element-wise sum of the two 256-wide branches, followed by
# a softmax over the whole vocabulary
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE: this rebinds `model`, which earlier held the VGG19 feature extractor
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Plot the model
#plot_model(model, show_shapes=True)

In [None]:
# Train Model
epochs = 20
batch_size = 32  # images per batch (each image expands into many samples)
steps = len(train) // batch_size

for i in range(epochs):
    # Create a fresh data generator so every epoch starts from the first image
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    # Fit for one epoch
    history = model.fit(generator, steps_per_epoch=steps, verbose=1)

    # Print the epoch loss (the model is compiled without extra metrics)
    print("Epoch", i + 1," Loss:", history.history['loss'][0])


# Save the trained model in the working directory: the following cell
# reloads it from this exact path, so it must be written on every run
model.save(os.path.join(WORKING_DIR, 'best_model.h5'))

Epoch 1  Loss: 4.897119998931885
Epoch 2  Loss: 3.923624038696289
Epoch 3  Loss: 3.543384075164795
Epoch 4  Loss: 3.296121597290039
Epoch 5  Loss: 3.1069700717926025
Epoch 6  Loss: 2.963958740234375
Epoch 7  Loss: 2.8525118827819824
Epoch 8  Loss: 2.759397029876709
Epoch 9  Loss: 2.6825435161590576
Epoch 10  Loss: 2.614645481109619
Epoch 11  Loss: 2.5580403804779053
Epoch 12  Loss: 2.509533643722534
Epoch 13  Loss: 2.46307110786438
Epoch 14  Loss: 2.424828052520752
Epoch 15  Loss: 2.3888487815856934
Epoch 16  Loss: 2.3546574115753174
Epoch 17  Loss: 2.319345474243164
Epoch 18  Loss: 2.286574602127075
Epoch 19  Loss: 2.258127450942993
Epoch 20  Loss: 2.2309975624084473


  saving_api.save_model(


final_time : 24376.641498088837


In [None]:
# Reload the trained captioning model from the working directory.
# load_model is taken from tensorflow.keras, the same namespace every other
# Keras import in this notebook uses, so saving and loading go through one
# Keras implementation.
from tensorflow.keras.models import load_model
model = load_model(os.path.join(WORKING_DIR, 'best_model.h5'))

# Load tokenizer from pickle
# NOTE: pickle.load can execute arbitrary code; only load a tokenizer.pkl
# written by this notebook.
with open(os.path.join(WORKING_DIR, 'tokenizer.pkl'), 'rb') as f:
    tokenizer = pickle.load(f)