In [None]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from keras.applications.inception_v3 import InceptionV3
from keras.layers import GlobalAveragePooling2D

In [None]:
# Colab-only: mount Google Drive so the dataset paths under
# /content/drive/MyDrive used by the cells below can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the JSON files
# The questions/answers are Bangla text, so the encoding is pinned to UTF-8
# instead of relying on the platform default.
with open('/content/drive/MyDrive/bangla dataset/train_questions_save (1).json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# NOTE(review): this opens the SAME file as train_data above, so every "test"
# metric computed later is measured on training samples. Point this at the
# held-out question file once its name is confirmed.
with open('/content/drive/MyDrive/bangla dataset/train_questions_save (1).json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)


In [None]:
# Parallel per-sample containers: position i of each list describes the same
# (image id, question, answer) triple. One group for each data split.
train_image_ids, train_questions, train_answers = [], [], []
test_image_ids, test_questions, test_answers = [], [], []

In [None]:
# Rebuild the per-sample lists from scratch rather than appending to the
# existing ones, so re-running this cell cannot duplicate every sample.
# Iteration order follows the JSON objects' key order, exactly as before.
train_records = list(train_data.values())
train_image_ids = [record['image_id'] for record in train_records]
train_questions = [record['question'] for record in train_records]
train_answers = [record['answer'] for record in train_records]

test_records = list(test_data.values())
test_image_ids = [record['image_id'] for record in test_records]
test_questions = [record['question'] for record in test_records]
test_answers = [record['answer'] for record in test_records]

In [None]:
# Load and preprocess images
# Directory holding the '<image_id>.png' files. The trailing slash is required:
# the image paths below are built by plain string concatenation.
image_dir = '/content/drive/MyDrive/bangla dataset/Images/'

In [None]:
from PIL import Image

def preprocess_image(image_path, target_size=(224, 224)):
    """Load one image from disk and turn it into a model-ready array.

    Args:
        image_path: Path of the image file to read.
        target_size: (width, height) the image is resized to. Defaults to
            (224, 224), the input size the CNN in this notebook is built with.

    Returns:
        Array of the resized RGB image with pixel values divided by 255.

    Raises:
        OSError: If the file is missing or cannot be opened as an image.
    """
    # The context manager releases the underlying file handle as soon as the
    # pixels have been copied out, instead of leaving it to garbage collection
    # (this function is called once per dataset sample).
    with Image.open(image_path) as source:
        # convert() drops alpha / palette / grayscale modes so every sample
        # ends up with exactly three channels.
        resized = source.convert('RGB').resize(target_size)
        pixels = img_to_array(resized)
    # NOTE(review): the ImageNet InceptionV3 weights used later are documented
    # to expect inception_v3.preprocess_input scaling ([-1, 1]); this [0, 1]
    # scaling is kept to preserve existing behaviour -- confirm it is intended.
    return pixels / 255.0

In [None]:
# Resolve each image id to its PNG path, decode every file with
# preprocess_image, and stack the results into one array per split.
train_image_paths = [image_dir + image_id + '.png' for image_id in train_image_ids]
train_images = np.array(list(map(preprocess_image, train_image_paths)))

test_image_paths = [image_dir + image_id + '.png' for image_id in test_image_ids]
test_images = np.array(list(map(preprocess_image, test_image_paths)))

In [None]:
# Process questions
# The vocabulary is fitted on the training questions only; the same tokenizer
# is reused for the test questions and for single-question inference later.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_questions)
# One extra slot beyond the learned words -- presumably for index 0, which the
# Embedding layer below treats as padding (mask_zero=True).
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Encode the training questions as integer id sequences and pad them at the
# end so they all share the length of the longest training question.
train_question_sequences = pad_sequences(
    tokenizer.texts_to_sequences(train_questions),
    padding='post',
)

In [None]:
# Same encoding for the test questions; the padded length is derived from the
# longest test question, independently of the training split.
test_question_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_questions),
    padding='post',
)

In [None]:
# Image branch: an ImageNet-pretrained InceptionV3 without its classification
# head, wired directly onto a 224x224 RGB input tensor.
image_input = Input(shape=(224, 224, 3))
cnn_model = InceptionV3(
    include_top=False,
    weights='imagenet',
    input_tensor=image_input,
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Collapse the spatial dimensions of the InceptionV3 output with global
# average pooling, leaving one feature vector per image.
image_features = GlobalAveragePooling2D()(cnn_model.output)


In [None]:
# Freeze the layers of the CNN model
# Every InceptionV3 layer is marked non-trainable here, before the combined
# model is compiled further down, so only the question branch and the dense
# head are meant to be updated during fit().
for layer in cnn_model.layers:
    layer.trainable = False

In [None]:
# Extract image features
# NOTE(review): these arrays are not passed to model.fit / model.evaluate in
# this notebook (both receive the raw image arrays). The only visible consumer
# is the interactive cell at the end, which indexes test_image_features.
# cnn_model was built with include_top=False, so what is returned here is the
# un-pooled convolutional output, not the pooled `image_features` tensor.
train_image_features = cnn_model.predict(train_images)
test_image_features = cnn_model.predict(test_images)



In [None]:
# Define the LSTM model for processing questions
# shape=(None,) accepts integer id sequences of any length.
question_input = Input(shape=(None,))
# mask_zero=True makes downstream layers skip id 0, i.e. the padded positions.
embedding_layer = Embedding(vocab_size, 300, mask_zero=True)(question_input)
# Only the final LSTM state is kept: one 256-d vector per question.
lstm_layer = LSTM(256)(embedding_layer)

# Combine image and question features
combined_features = Concatenate()([image_features, lstm_layer])
output = Dense(512, activation='relu')(combined_features)
# NOTE(review): the classifier has vocab_size units (the size of the QUESTION
# vocabulary), while the targets are answer ids taken from label_mapping.
# Presumably this should be the number of answer classes -- confirm; as written
# the model can predict ids that have no entry in label_mapping.
output = Dense(vocab_size, activation='softmax')(output)

In [None]:
# Assemble the two-input VQA network: (image, question ids) -> answer
# probabilities.
model = Model(
    inputs=[cnn_model.input, question_input],
    outputs=output,
)

In [None]:
# Answer string -> class id used as the training target.
# Fix: 'সাদা' previously shared id 4 with 'ঠিক' (id 5 was unused), which made the
# two answers indistinguishable and dropped one of them from the reverse lookup.
label_mapping = {'বানর বার': 0, 'বসা': 1, 'না': 2, 'মল': 3, 'ঠিক': 4, 'সাদা': 5, 'হ্যাঁ': 6, 'কুকুর': 7, 'বাদামী': 8, 'দুই': 9, 'চার': 10, 'পাঁচ': 11, 'সবুজ': 12, 'বাজানো': 13, 'বিড়াল': 14, 'নীল': 15, 'টেবিল': 16, 'ছেলে': 17, 'রৌদ্রোজ্জ্বল': 18, 'তিন': 19, 'কালো': 20, 'গাছ': 21, 'এক': 22, 'বেঞ্চ': 23, 'ধূসর': 24, 'হলুদ': 25, 'পাখি': 26, 'মেয়ে': 27, 'মদ': 28, 'মানুষ': 29, 'লাল': 30, 'বই': 31, 'হাড়': 32, 'সকার': 33, 'পাই': 34, 'লগ': 35, 'বেসবল': 36, 'ফুটবল': 37, 'কমলা': 38, 'পালঙ্ক': 39, 'দাঁড়িয়ে': 40, '0': 41, 'মহিলা': 42, 'খাবার': 43, 'মেঝে': 44, 'কাঠবিড়াল': 45, 'পাটি': 46, 'ঘুমাচ্ছেন': 47, 'আপেল': 48, 'বাইক': 49, 'উদ্ভিদ': 50, 'কম্বল': 51, 'ঘাস': 52, 'চেয়ার': 53, 'বাম': 54, 'কিছুই না': 55}

# Guard against the same mistake recurring: ids must be exactly 0..N-1, each used once.
assert sorted(label_mapping.values()) == list(range(len(label_mapping))), "label ids must be unique and contiguous"

In [None]:
# Translate each training answer string into its integer class id; an answer
# missing from label_mapping raises KeyError.
train_label_encoded = np.array(
    [label_mapping[answer] for answer in train_answers],
    dtype=np.int32,
)

In [None]:
# Integer class-id targets -> sparse categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on (image, question) pairs against the encoded answers.
history = model.fit(
    x=[train_images, train_question_sequences],
    y=np.array(train_label_encoded),
    epochs=15,
    batch_size=16,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# model.save("/content/drive/MyDrive/h5files/inception.h5")

In [None]:
# Persist the trained model under a relative path, i.e. in the runtime's
# current working directory rather than on the mounted Drive.
# NOTE(review): the '.h5' name presumably selects the legacy HDF5 format, which
# would explain the save_model warning in this cell's output -- confirm.
model.save('inception_model.h5')

  saving_api.save_model(


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Re-read the reference answers from the test records and translate them into
# integer class ids with the same mapping used for training.
test_answers = [record['answer'] for record in test_data.values()]

test_label_encoded = np.array(
    [label_mapping[answer] for answer in test_answers],
    dtype=np.int32,
)

In [None]:
# Score the trained model on the test split.
# NOTE(review): test_data is read from the same JSON path as train_data, so
# this number is not a held-out accuracy.
test_inputs = [test_images, test_question_sequences]
test_loss, test_accuracy = model.evaluate(test_inputs, test_label_encoded)

# Report the accuracy metric returned by evaluate().
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9928278923034668


In [None]:
# Provide an image and question
input_image_path = '/content/drive/MyDrive/bangla dataset/Images/2.png'
input_question = 'কেউ কি সোফায় বসে আছে?'

# Set the maximum sequence length
max_question_length = 100

# Preprocess the input image
input_image = preprocess_image(input_image_path)
input_image = np.expand_dims(input_image, axis=0)  # Add batch dimension

# Tokenize and preprocess the input question
# padding='post' matches how the training and test questions were padded.
input_question_sequence = tokenizer.texts_to_sequences([input_question])
input_question_sequence = pad_sequences(input_question_sequence, maxlen=max_question_length, padding='post')

# Make prediction on the input
prediction = model.predict([input_image, input_question_sequence])
predicted_label = int(np.argmax(prediction))

# Decode the predicted label
# The output layer has vocab_size units, so the arg-max can be an id that no
# answer is mapped to; fall back to "Unknown" (as the interactive cell below
# does) instead of raising KeyError.
label_mapping_reverse = {v: k for k, v in label_mapping.items()}
predicted_answer = label_mapping_reverse.get(predicted_label, "Unknown")

# Print the predicted answer
print("Predicted Answer:", predicted_answer)


Predicted Answer: না


In [None]:
# Print the layer-by-layer architecture and parameter counts of the combined model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 111, 111, 32)         864       ['input_1[0][0]']             
                                                                                                  
 batch_normalization (Batch  (None, 111, 111, 32)         96        ['conv2d[0][0]']              
 Normalization)                                                                                   
                                                                                                  
 activation (Activation)     (None, 111, 111, 32)         0         ['batch_normalization[0][0

In [None]:
# model.save('/content/drive/MyDrive/incep_model.h5')

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming you have a function preprocess_image() and a tokenizer
# Also, you need to define the label_mapping dictionary and the model

def is_empty_or_spaces(s: str) -> bool:
    """Return True when `s` is empty or contains nothing but whitespace."""
    # An all-whitespace string strips down to '', which is falsy.
    return not s.strip()

# Interactive single-sample inference: ask for an image path and a question,
# then print the model's predicted answer.
input_image_path = input("Enter image path: ").strip()
input_question = input("Enter question: ").strip()

# Check if either image path or question is empty
if is_empty_or_spaces(input_image_path) or is_empty_or_spaces(input_question):
    print("Please enter both an image path and a question.")
else:
    try:
        # `model` takes raw 224x224x3 images as its first input, so the image is
        # preprocessed straight from the given path. The previous lookup in
        # test_image_features fed CNN feature maps into that input (a shape the
        # model does not accept) and only worked for ids present in the test set.
        input_image = np.expand_dims(preprocess_image(input_image_path), axis=0)
    except OSError as exc:
        # Missing or unreadable file: report it instead of aborting the cell.
        input_image = None
        print("Could not read image:", exc)

    if input_image is not None:
        # Tokenize and pad the question the same way the training questions were padded.
        input_question_sequence = tokenizer.texts_to_sequences([input_question])
        input_question_sequence = pad_sequences(
            input_question_sequence, maxlen=max_question_length, padding='post'
        )

        # Make prediction on the input
        prediction = model.predict([input_image, input_question_sequence])
        predicted_label = int(np.argmax(prediction))

        # Decode the predicted label; ids without a mapped answer become "Unknown".
        predicted_answer = label_mapping_reverse.get(predicted_label, "Unknown")

        # Only report answers that occur among the known test answers. The old
        # follow-up comparison re-read the same list element and was always
        # true, so its "wrong question" branch could never run and is removed.
        if predicted_answer in test_answers:
            print("Predicted Answer:", predicted_answer)
        else:
            print("Question is unrelated to the image. Please enter a valid question.")