#### all imports

In [1]:
import tensorflow as tf
import os
import numpy as np
import json
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, concatenate
from keras.models import Model
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re
import pickle

2024-06-26 16:15:55.462462: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-26 16:15:55.465270: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-26 16:15:55.502716: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-26 16:15:55.502749: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-26 16:15:55.503776: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

#### read all dataset quetions and answers from json files

In [2]:
train_file_questions = 'datasets/v2_OpenEnded_mscoco_train2014_questions.json'
train_file_annotations = 'datasets/v2_mscoco_train2014_annotations.json'
val_file_questions = 'datasets/v2_OpenEnded_mscoco_val2014_questions.json'
val_file_annotations = 'datasets/v2_mscoco_val2014_annotations.json'
test_file_questions = 'datasets/v2_OpenEnded_mscoco_test2015_questions.json'

with open(train_file_questions, 'r') as f:
    train_questions = json.load(f)['questions']
    f.close()

with open(train_file_annotations, 'r') as f:
    train_annotations = json.load(f)['annotations']
    f.close()

with open(val_file_questions, 'r') as f:
    val_questions = json.load(f)['questions']
    f.close()

with open(val_file_annotations, 'r') as f:
    val_annotations = json.load(f)['annotations']
    f.close()

#with open(test_file_questions, 'r') as f:
#    test_questions = json.load(f)['questions']
#    f.close()

#### read all train and validate image features with IDs from the pkl files

In [3]:
# Read dictionary pkl file
with open('train_image_features_inception.pkl', 'rb') as fp:
    train_imgs_features = pickle.load(fp)
    print('successful')
# Read dictionary pkl file
with open('val_image_features_inception.pkl', 'rb') as fp:
    val_imgs_features = pickle.load(fp)
    print('successful')

successful
successful


#### append validate to train features

In [4]:
train_imgs_features.update(val_imgs_features)

In [5]:
len(train_imgs_features)

123287

#### append validate questions and answers to train questions and answers

In [6]:
# Combine the training and validation questions and annotations
train_questions = val_questions
train_annotations = val_annotations

#### encode questions and answers, and create image features list 

In [7]:
# Extract the questions and answers
questions = []
answers = []
features_id = []

for i in range(len(train_questions)):
    questions.append(train_questions[i]['question'])
    answers.append(train_annotations[i]['multiple_choice_answer'])
    features_id.append(train_questions[i]["image_id"])

# Tokenize the questions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(questions)

max_question_length = 30
padded_sequences = pad_sequences(sequences, maxlen=max_question_length)

# Convert the answers to one-hot vectors
answers_tokenizer = Tokenizer()
answers_tokenizer.fit_on_texts(answers)
answer_word_index = answers_tokenizer.word_index
num_classes = len(answer_word_index)
answer_sequences = answers_tokenizer.texts_to_sequences(answers)

# Pad the answer sequences to ensure they all have the same length
max_answer_length = max(len(seq) for seq in answer_sequences)
padded_answers = pad_sequences(answer_sequences, maxlen=max_answer_length)

# Get the unique answers in the dataset and create a dictionary to map them to integer labels
unique_answers = list(set(answers))
label_map = {answer: i for i, answer in enumerate(unique_answers)}

# Convert the answers to integer labels and then to one-hot vectors
labels = [label_map[answer] for answer in answers]
one_hot_answers = to_categorical(labels, num_classes=len(unique_answers))

#### empty some memory to prevent memory overflow

In [8]:
del train_file_questions
del train_file_annotations
del val_file_questions
del val_file_annotations
del test_file_questions
del train_questions
del train_annotations
del val_questions
del val_annotations
del val_imgs_features

In [9]:
del tokenizer
del sequences
del answers_tokenizer
del answer_word_index
del max_answer_length
del padded_answers
del label_map
del labels

In [10]:
del questions
del answers

#### shape of the dataset

In [11]:
print(len(features_id))
print(padded_sequences.shape)
print(one_hot_answers.shape)

214354
(214354, 30)
(214354, 14008)


### split train into 70% train and 30% test 
#### (inplace to prevent memory overflow)

In [12]:
split_indices = np.random.randint(low=0, high=len(features_id), size=int(len(features_id)*0.3))
split_indices = sorted(split_indices,reverse=True) 

In [13]:
test_padded_sequences = []
padded_sequences = list(padded_sequences)
for i in split_indices:
    test_padded_sequences.append(padded_sequences.pop(i))

In [14]:
test_one_hot_answers = []
one_hot_answers = list(one_hot_answers)
for i in split_indices:
    test_one_hot_answers.append(one_hot_answers.pop(i))

In [15]:
test_features_id = []
for i in split_indices:
    test_features_id.append(features_id.pop(i))

In [16]:
len(unique_answers)

14008

### split 30% test into 20% test and 10% validate
#### (inplace to prevent memory overflow)

In [17]:
split_indices = np.random.randint(low=0, high=len(test_features_id), size=int(len(test_features_id)*0.3))
split_indices = sorted(split_indices,reverse=True)

In [18]:
val_padded_sequences = []
for i in split_indices:
    val_padded_sequences.append(test_padded_sequences.pop(i))

In [19]:
val_one_hot_answers = []
for i in split_indices:
    val_one_hot_answers.append(test_one_hot_answers.pop(i))

In [20]:
val_features_id = []
for i in split_indices:
    val_features_id.append(test_features_id.pop(i))

#### Second Model: GRU for questions and concat with inception v3 output
##### uses nadam optimizer

In [21]:
# Define the input layers
question_input = Input(shape=(max_question_length, ), name='question_input')
image_input = Input(shape=(2048, ), name='image_input')

# Define the embedding layer for the questions
question_embedding = Embedding(input_dim=len(word_index)+1, output_dim=300, input_length=max_question_length,
                               name='question_embedding')(question_input)

# Define the LSTM layer for the questions
question_lstm = tf.keras.layers.GRU(units=256, name='question_lstm',return_sequences = True)(question_embedding)
question_lstm = Dropout(0.2, name='question_dropout')(question_lstm)
question_lstm2 = tf.keras.layers.GRU(units=256, name='question_lstm2')(question_lstm)
question_lstm2 = Dropout(0.2, name='question_dropout2')(question_lstm2)

# Define the dense layer for the image features
image_dense = Dense(units=256, activation='relu', name='image_dense')(image_input)
image_dense = Dropout(0.2, name='image_dropout')(image_dense)

# Concatenate the output from the LSTM and dense layers
dense_1 = concatenate([question_lstm2, image_dense], name='concatenated')
dense_2 = Dense(512, activation='relu')(dense_1)
dense_3 = Dense(256, activation='relu')(dense_2)
# Define the output layer for the classification

output = Dense(units=len(unique_answers), activation='softmax', name='output')(dense_3)

# Define the model
model = Model(inputs=[question_input, image_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])

In [22]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 question_input (InputLayer  [(None, 30)]                 0         []                            
 )                                                                                                
                                                                                                  
 question_embedding (Embedd  (None, 30, 300)              3182100   ['question_input[0][0]']      
 ing)                                                                                             
                                                                                                  
 question_lstm (GRU)         (None, 30, 256)              428544    ['question_embedding[0][0]']  
                                                                                              

#### creating a custom generator

In [23]:
def data_generator(image_features, padded_questions, labels, batch_size):
    num_samples = len(labels)
    steps_per_epoch = num_samples // batch_size
    while True:
        for i in range(steps_per_epoch):
            batch_image_features = []
            for j in  image_features[i*batch_size:(i+1)*batch_size]:
                batch_image_features.append(train_imgs_features[j])
            batch_padded_questions = padded_questions[i*batch_size:(i+1)*batch_size]
            batch_labels = labels[i*batch_size:(i+1)*batch_size]
            yield [np.asarray(batch_padded_questions), np.asarray(batch_image_features)], np.asarray(batch_labels)

#### training the model using 70% train and 10% validate

In [24]:
batch_size = 128
steps_per_epoch = len(one_hot_answers) // batch_size
model.fit(data_generator(features_id,padded_sequences,one_hot_answers,batch_size),
          steps_per_epoch=steps_per_epoch,
          epochs=50,
         validation_data = data_generator(val_features_id,val_padded_sequences,val_one_hot_answers,batch_size),
         validation_steps = int(len(val_features_id)/batch_size))

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7ff5d86c73d0>

#### saving the model

In [25]:
model.save("inc_GRU_Nadam.h5")

  saving_api.save_model(


#### end