# Visual Question Answering

This code is designed to build and train a Visual Question Answering (VQA) model using Keras and TensorFlow. The model takes an image and a question as input and predicts the answer to the question based on the image content.

In [None]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input, VGG16
from tqdm import tqdm  # Import tqdm for the progress bar
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Flatten, Concatenate,GlobalAveragePooling2D, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [9]:
!pip show keras

Name: keras
Version: 3.1.1
Summary: Multi-backend Keras.
Home-page: https://github.com/keras-team/keras
Author: Keras team
Author-email: keras-users@googlegroups.com
License: Apache License 2.0
Location: /opt/conda/lib/python3.10/site-packages
Requires: absl-py, h5py, ml-dtypes, namex, numpy, optree, rich
Required-by: keras-tuner, tensorflow


# Load Data

it defines a function load_data that reads the CSV files containing the training and evaluation data, including questions, answers, and image IDs. This function loads the images, preprocesses them using the VGG16 model, and returns the questions, answers, and preprocessed images.

In [4]:
def load_data(csv_file, image_dir):
    data = pd.read_csv(csv_file)
    questions = data['question']
    answers = data['answer']

    images = []
    for fname in tqdm(data['image_id'], desc="Loading images"):
        img_path = os.path.join(image_dir, f"{fname}.png")
        img = load_img(img_path, target_size=(224, 224))
        img_array = img_to_array(img)
        images.append(img_array)

    images = preprocess_input(np.array(images))
    return questions, answers, images

# Example usage
data_path = "/kaggle/input/visual-question-answering-dataset"
train_data = os.path.join(data_path, "data_train.csv")
eval_data = os.path.join(data_path, "data_eval.csv")
image_dir = os.path.join(data_path, "images")

train_questions, train_answers, train_images = load_data(train_data, image_dir)
eval_questions, eval_answers, eval_images = load_data(eval_data, image_dir)

Loading images: 100%|██████████| 9974/9974 [02:00<00:00, 82.70it/s] 
Loading images: 100%|██████████| 2494/2494 [00:25<00:00, 98.44it/s] 


# Tokenization

the code preprocesses the text data (questions) by tokenizing the words, converting them to sequences of integers, and padding the sequences to a fixed length.

After that, it encodes the answer labels by creating a mapping dictionary from labels to integers, replacing the text labels with their corresponding integers, and converting the integer-encoded labels to a one-hot encoded format suitable for training the neural network.

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_questions)

# Convert text to sequences
train_seq = tokenizer.texts_to_sequences(train_questions)
eval_seq = tokenizer.texts_to_sequences(eval_questions)

# Pad sequences
max_len = max(len(x) for x in train_seq)
train_seq = pad_sequences(train_seq, maxlen=max_len)
eval_seq = pad_sequences(eval_seq, maxlen=max_len)

In [11]:
# Get the unique answer labels from both training and evaluation sets
all_answers = train_answers.tolist() + eval_answers.tolist()
unique_answers = list(set(all_answers))

# Create a mapping dictionary from labels to integers
label_to_int = {label: i for i, label in enumerate(unique_answers)}

# Replace text labels with integer labels
train_answers_encoded = [label_to_int[label] for label in train_answers]
eval_answers_encoded = [label_to_int[label] for label in eval_answers]

# Update the value of num_classes based on the new unique_answers list
num_classes = len(unique_answers)

# Convert to categorical
train_answers_categorical = to_categorical(train_answers_encoded, num_classes=num_classes)
eval_answers_categorical = to_categorical(eval_answers_encoded, num_classes=num_classes)

# Model Architecture

The code then defines the architecture of the VQA model, which consists of two branches: one for processing the image input and another for processing the text input (questions). The image branch uses the pre-trained VGG16 model to extract features, while the text branch uses an Embedding layer, Bidirectional LSTM, and LSTM layers to process the question sequences. The outputs from both branches are concatenated and passed through dense layers to produce the final output probabilities for each answer class.

The model is compiled with the Adam optimizer and categorical cross-entropy loss function, and the accuracy metric is specified for evaluation.

After defining the model, the code trains it using the preprocessed training data and saves the trained model to a file named 'vaq.h5'.

In [12]:
# Image model
image_input = Input(shape=(224, 224, 3))
image_model = VGG16(include_top=False, weights='imagenet')(image_input)
image_model = Flatten()(image_model)

# Text Model
text_input = Input(shape=(max_len,))
text_model = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100)(text_input)
text_model = Bidirectional(LSTM(256, return_sequences=True))(text_model)
text_model = LSTM(512)(text_model)

# Concatenate models
combined = Concatenate()([image_model, text_model])
combined = Dense(256, activation='relu')(combined)
output = Dense(num_classes, activation='softmax')(combined)

model = Model(inputs=[image_input, text_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [14]:
# Train the model
model.fit(
    [train_images, train_seq],
    train_answers_categorical,
    epochs=1,
    batch_size=8,
    verbose=1
)
model.save('vaq.h5')

[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 160ms/step - accuracy: 0.0403 - loss: 5.9966


ValueError: Unable to synchronously create dataset (name already exists)

In [None]:
print("num_classes:", num_classes)
print("len(unique_answers):", len(unique_answers))

In [31]:
all_answers = list(train_answers.tolist() + eval_answers.tolist())
unique_answers_check = list(set(all_answers))
print("Unique answers (check):", len(unique_answers_check))

if len(unique_answers_check) != len(unique_answers):
    print("Warning: unique_answers list may be incomplete!")
    
    
for label, idx in label_to_int.items():
    if idx < 0 or idx >= len(unique_answers):
        print(f"Warning: Label '{label}' has incorrect index {idx}")

Unique answers (check): 1443


In [8]:
import joblib

# Save preprocess_input
joblib.dump(preprocess_input, 'preprocesse_input.joblib')

# Save unique_answers
joblib.dump(unique_answers, 'unique_answers.joblib')
joblib.dump(tokenizer, 'tokenizer.joblib')

['tokenizer.joblib']

# Model Inference

In [12]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input, VGG16
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Flatten, Concatenate
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.utils import to_categorical


# model = load_model('/kaggle/working/model.h5')
# Function to preprocess the image
def preprocess_image(image_path):
    img = load_img(image_path, target_size=(224, 224))
    img_array = img_to_array(img)
    img_array = preprocess_input(np.array([img_array]))
    return img_array

# Function to preprocess the question
def preprocess_question(question):
    seq = tokenizer.texts_to_sequences([question])
    seq = pad_sequences(seq, maxlen=max_len)
    return seq

# Function to predict the answer
def predict_answer(image_path, question):
    preprocessed_image = preprocess_image(image_path)
    preprocessed_question = preprocess_question(question)

    prediction = model.predict([preprocessed_image, preprocessed_question])
    predicted_class_idx = np.argmax(prediction)
    predicted_answer = unique_answers[predicted_class_idx]

    return predicted_answer

# Example usage
while True:
    image_path = input("Enter the image path: ")
    question = input("Enter the question: ")

    predicted_answer = predict_answer(image_path, question)
    print(f"Predicted answer: {predicted_answer}")

    continue_prompt = input("Do you want to continue? (y/n) ")
    if continue_prompt.lower() != 'y':
        break

Enter the image path:  /kaggle/input/visual-question-answering-dataset/images/image100.png
Enter the question:  what is the object on the shelves


Predicted answer: cup


Do you want to continue? (y/n)  n


In [None]:
model.save('vaq.h5')

In [10]:
max_len

24