#### all imports

In [2]:
import tensorflow as tf
import os
import numpy as np
import json
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, concatenate
from keras.models import Model
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re
import pickle

2024-06-26 15:22:27.676602: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-26 15:22:27.804729: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-26 15:22:28.391884: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-26 15:22:28.391965: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-26 15:22:28.517709: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

#### read all dataset questions and answers from json files

In [3]:
train_file_questions = 'datasets/v2_OpenEnded_mscoco_train2014_questions.json'
train_file_annotations = 'datasets/v2_mscoco_train2014_annotations.json'
val_file_questions = 'datasets/v2_OpenEnded_mscoco_val2014_questions.json'
val_file_annotations = 'datasets/v2_mscoco_val2014_annotations.json'
test_file_questions = 'datasets/v2_OpenEnded_mscoco_test2015_questions.json'


with open(train_file_questions, 'r') as f:
    train_questions = json.load(f)['questions']
    f.close()

with open(train_file_annotations, 'r') as f:
    train_annotations = json.load(f)['annotations']
    f.close()

with open(val_file_questions, 'r') as f:
    val_questions = json.load(f)['questions']
    f.close()

with open(val_file_annotations, 'r') as f:
    val_annotations = json.load(f)['annotations']
    f.close()

#with open(test_file_questions, 'r') as f:
#    test_questions = json.load(f)['questions']
#    f.close()

#### read all train and validate image features extracted by VGG19 with IDs from the pkl files

In [4]:
# Read dictionary pkl file
with open('vgg19_train_data.pkl', 'rb') as fp:
    train_imgs_features = pickle.load(fp)
    print('successful')
# Read dictionary pkl file
with open('vgg19_val_data.pkl', 'rb') as fp:
    val_imgs_features = pickle.load(fp)
    print('successful')

successful
successful


#### append validate to train features

In [5]:
train_imgs_features.update(val_imgs_features)

In [6]:
len(train_imgs_features)

123287

#### append validate questions and answers to train questions and answers

In [7]:
# # Combine the training and validation questions and annotations
train_questions = val_questions
train_annotations = val_annotations

#### encode questions and answers, and create image features list

In [8]:
del val_questions
del val_annotations
del val_imgs_features

In [9]:
# Extract the questions and answers
questions = []
answers = []
features_id = []

for i in range(len(train_questions)):
    questions.append(train_questions[i]['question'])
    answers.append(train_annotations[i]['multiple_choice_answer'])
    features_id.append(train_questions[i]["image_id"])


del train_questions
del train_annotations

# Tokenize the questions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(questions)

del questions
del tokenizer

max_question_length = 30
padded_sequences = pad_sequences(sequences, maxlen=max_question_length)

del sequences

# Convert the answers to one-hot vectors
answers_tokenizer = Tokenizer()
answers_tokenizer.fit_on_texts(answers)
answer_word_index = answers_tokenizer.word_index
num_classes = len(answer_word_index)
answer_sequences = answers_tokenizer.texts_to_sequences(answers)


del answers_tokenizer

# Pad the answer sequences to ensure they all have the same length
max_answer_length = max(len(seq) for seq in answer_sequences)
padded_answers = pad_sequences(answer_sequences, maxlen=max_answer_length)

# Get the unique answers in the dataset and create a dictionary to map them to integer labels
unique_answers = list(set(answers))
label_map = {answer: i for i, answer in enumerate(unique_answers)}

del answer_sequences

# Convert the answers to integer labels and then to one-hot vectors
labels = [label_map[answer] for answer in answers]
one_hot_answers = to_categorical(labels, num_classes=len(unique_answers))

In [10]:
unique_answers

['under elephant',
 'air freshener',
 'public market',
 'looking at cell phone',
 'keep calm',
 'content',
 '70 inch',
 'in case',
 'pan is hot',
 'he is tall',
 'man in orange',
 'female impersonators',
 'paperwork',
 'no dress',
 'maine',
 'huge',
 'elmira',
 'seed',
 'cleveland',
 'couple',
 'butt cheek',
 'kicking',
 'grace',
 'not ripe',
 'east 34th st',
 'holding bat',
 'family',
 'brownie',
 'haight and ashbury',
 'towel hook',
 'destin, fl',
 'greenhouse',
 'black and white flowers',
 'denim',
 'catch frisbee',
 'mallet',
 'skateboard man',
 'fettuccine',
 'nightstand',
 'esda',
 '530264',
 'black/silver',
 'button',
 'not there',
 '200',
 'doughnut shop',
 'badminton',
 'helping her up',
 'hot dog menu',
 'monkeys',
 'narrowing road',
 'casserole',
 'behind meters',
 'easton',
 'doug schaefer',
 'mk home',
 'semi',
 'front of computer',
 'one playing game; other holding hands together',
 'cut them up',
 'father and daughter',
 'hurricane',
 'baseball glove',
 'grass',
 'rollin

In [13]:
label_map

{'under elephant': 0,
 'air freshener': 1,
 'public market': 2,
 'looking at cell phone': 3,
 'keep calm': 4,
 'content': 5,
 '70 inch': 6,
 'in case': 7,
 'pan is hot': 8,
 'he is tall': 9,
 'man in orange': 10,
 'female impersonators': 11,
 'paperwork': 12,
 'no dress': 13,
 'maine': 14,
 'huge': 15,
 'elmira': 16,
 'seed': 17,
 'cleveland': 18,
 'couple': 19,
 'butt cheek': 20,
 'kicking': 21,
 'grace': 22,
 'not ripe': 23,
 'east 34th st': 24,
 'holding bat': 25,
 'family': 26,
 'brownie': 27,
 'haight and ashbury': 28,
 'towel hook': 29,
 'destin, fl': 30,
 'greenhouse': 31,
 'black and white flowers': 32,
 'denim': 33,
 'catch frisbee': 34,
 'mallet': 35,
 'skateboard man': 36,
 'fettuccine': 37,
 'nightstand': 38,
 'esda': 39,
 '530264': 40,
 'black/silver': 41,
 'button': 42,
 'not there': 43,
 '200': 44,
 'doughnut shop': 45,
 'badminton': 46,
 'helping her up': 47,
 'hot dog menu': 48,
 'monkeys': 49,
 'narrowing road': 50,
 'casserole': 51,
 'behind meters': 52,
 'easton': 5

#### empty some memory to prevent memory overflow

In [9]:
del labels
del label_map
# del answers_tokenizer
del answer_word_index
del padded_answers
del answers

#### shape of the dataset

In [10]:
print(len(features_id))
print(padded_sequences.shape)
print(one_hot_answers.shape)

214354
(214354, 30)
(214354, 14008)


### split train into 70% train and 30% test
#### (inplace to prevent memory overflow)

In [11]:
split_indices = np.random.randint(low=0, high=len(features_id), size=int(len(features_id)*0.3))
split_indices = sorted(split_indices,reverse=True)

In [12]:
test_padded_sequences = []
padded_sequences = list(padded_sequences)
for i in split_indices:
    test_padded_sequences.append(padded_sequences.pop(i))

In [13]:
test_one_hot_answers = []
one_hot_answers = list(one_hot_answers)
for i in split_indices:
    test_one_hot_answers.append(one_hot_answers.pop(i))

In [14]:
test_features_id = []
for i in split_indices:
    test_features_id.append(features_id.pop(i))

### split 30% test into 20% test and 10% validate
#### (inplace to prevent memory overflow)

In [15]:
split_indices = np.random.randint(low=0, high=len(test_features_id), size=int(len(test_features_id)*0.3))
split_indices = sorted(split_indices,reverse=True)

In [16]:
val_padded_sequences = []
for i in split_indices:
    val_padded_sequences.append(test_padded_sequences.pop(i))

In [17]:
val_one_hot_answers = []
for i in split_indices:
    val_one_hot_answers.append(test_one_hot_answers.pop(i))

In [18]:
val_features_id = []
for i in split_indices:
    val_features_id.append(test_features_id.pop(i))

#### first model: LSTM for Questions and concat with inception v3 output
##### uses SGD with momentum optimizer

In [19]:
# Define the input layers
question_input = Input(shape=(max_question_length, ), name='question_input')
image_input = Input(shape=(25088, ), name='image_input')

# Define the embedding layer for the questions
question_embedding = Embedding(input_dim=len(word_index)+1, output_dim=300, input_length=max_question_length,
                               name='question_embedding')(question_input)

# Define the LSTM layer for the questions
question_lstm = LSTM(units=512, name='question_lstm', return_sequences=True)(question_embedding)
question_lstm = Dropout(0.3, name='question_dropout')(question_lstm)

question_lstm2 = LSTM(units=256, name='question_lstm2')(question_lstm)
question_lstm2 = Dropout(0.2, name='question_dropout2')(question_lstm2)

# Define the dense layer for the image features
image_dense = Dense(units=256, activation='relu', name='image_dense')(image_input)
image_dense = Dropout(0.2, name='image_dropout')(image_dense)

# Concatenate the output from the LSTM and dense layers
concatenated = concatenate([question_lstm2, image_dense], name='concatenated')

dense_cnc = Dense(units=512, activation='relu', name='dens_conc')(concatenated)
dense_cnc2 = Dense(units=512, activation='relu', name='dens_conc2')(dense_cnc)
# Define the output layer for the classification
output = Dense(units=len(unique_answers), activation='softmax', name='output')(dense_cnc2)

# Define the model
model = Model(inputs=[question_input, image_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(lr=0.001, momentum=0.9), metrics=['accuracy'])



In [20]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 question_input (InputLayer  [(None, 30)]                 0         []                            
 )                                                                                                
                                                                                                  
 question_embedding (Embedd  (None, 30, 300)              3182100   ['question_input[0][0]']      
 ing)                                                                                             
                                                                                                  
 question_lstm (LSTM)        (None, 30, 512)              1665024   ['question_embedding[0][0]']  
                                                                                              

#### creating a custom generator

In [21]:
def data_generator(image_features, padded_questions, labels, batch_size):
    num_samples = len(labels)
    steps_per_epoch = num_samples // batch_size
    while True:
        for i in range(steps_per_epoch):
            batch_image_features = []
            for j in  image_features[i*batch_size:(i+1)*batch_size]:
                batch_image_features.append(train_imgs_features[j])
            batch_padded_questions = padded_questions[i*batch_size:(i+1)*batch_size]
            batch_labels = labels[i*batch_size:(i+1)*batch_size]
            yield [np.asarray(batch_padded_questions), np.asarray(batch_image_features)], np.asarray(batch_labels)

#### training the model using 70% train and 10% validate

In [22]:
batch_size = 128
steps_per_epoch = len(one_hot_answers) // batch_size
model.fit(data_generator(features_id,padded_sequences,one_hot_answers,batch_size),
          steps_per_epoch=steps_per_epoch,
          epochs=10,
         validation_data = data_generator(val_features_id,val_padded_sequences,val_one_hot_answers,batch_size),
         validation_steps = int(len(val_features_id)/batch_size))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f9ab20a5410>

#### saving the model

In [23]:
model.save("VGG19_LSTM_SGD.h5")

  saving_api.save_model(


#### end