In [None]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from keras.applications.inception_v3 import InceptionV3
from keras.layers import GlobalAveragePooling2D

In [None]:
# Mount Google Drive at /content/drive; every dataset and model path used in
# this notebook lives under that mount point. Requires the google.colab module.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the question/answer annotations (JSON) for the train and test splits.
# The annotations are expected to hold Bangla text, so the encoding is pinned
# to UTF-8 instead of relying on the platform-dependent default of open().
train_questions_path = '/content/drive/MyDrive/bangla dataset/train_questions_save (1).json'
# NOTE(review): the test split is read from the *same* file as the training
# split, so any "test" metric computed later is really a training metric.
# Point this at the held-out questions file once its name is confirmed.
test_questions_path = '/content/drive/MyDrive/bangla dataset/train_questions_save (1).json'

with open(train_questions_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(test_questions_path, 'r', encoding='utf-8') as f:
    test_data = json.load(f)


In [None]:
# Empty accumulators for the per-sample fields of each split; they are filled
# from the loaded JSON records in the following cell.
train_image_ids, train_questions, train_answers = [], [], []
test_image_ids, test_questions, test_answers = [], [], []

In [None]:
# Walk the records of each split (in the JSON's key order) and copy the three
# fields of interest into the parallel lists.
for record in train_data.values():
    train_image_ids.append(record['image_id'])
    train_questions.append(record['question'])
    train_answers.append(record['answer'])

for record in test_data.values():
    test_image_ids.append(record['image_id'])
    test_questions.append(record['question'])
    test_answers.append(record['answer'])

In [None]:
import pandas as pd

# Tabulate the training samples (one row per question) and export them as CSV.
# NOTE: this rebinds `train_data` from the loaded JSON dict to a DataFrame.
train_data = pd.DataFrame(
    dict(
        Image_ID=train_image_ids,
        Questions=train_questions,
        Answers=train_answers,
    )
)

# Written to the current working directory, without the index column.
train_data.to_csv('train_data.csv', index=False)

In [None]:
# Tabulate the test samples (one row per question) and export them as CSV.
# NOTE: this rebinds `test_data` from the loaded JSON dict to a DataFrame.
test_data = pd.DataFrame(
    dict(
        Image_ID=test_image_ids,
        Questions=test_questions,
        Answers=test_answers,
    )
)

# Written to the current working directory, without the index column.
test_data.to_csv('test_data.csv', index=False)

In [None]:
# Directory holding the dataset images. Image files are addressed later as
# image_dir + <image_id> + '.png', so the trailing slash is required.
image_dir = '/content/drive/MyDrive/bangla dataset/Images/'

In [None]:
from PIL import Image

def preprocess_image(image_path, target_size=(224, 224)):
    """Load an image file and return it as a scaled float array.

    Args:
        image_path: Path of the image file to open with PIL.
        target_size: ``(width, height)`` the image is resized to. Defaults to
            ``(224, 224)``, the input size declared for the CNN in this
            notebook.

    Returns:
        The array produced by ``img_to_array`` for the resized RGB image,
        divided by 255.0.

    Raises:
        Whatever ``PIL.Image.open`` raises for a missing or unreadable file.
    """
    # Image.open is lazy and holds the file open; the context manager releases
    # the handle as soon as convert() has produced an independent RGB copy.
    with Image.open(image_path) as source:
        img = source.convert('RGB').resize(target_size)
    img = img_to_array(img)
    return img / 255.0

In [None]:
# Load every referenced image into memory and stack each split into a single
# array (one entry per sample, in the same order as the id lists).
train_images = np.array([
    preprocess_image(image_dir + image_id + '.png')
    for image_id in train_image_ids
])

test_images = np.array([
    preprocess_image(image_dir + image_id + '.png')
    for image_id in test_image_ids
])

In [None]:
# Build the question vocabulary from the training questions only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_questions)
# One slot more than the number of known words: id 0 is not assigned to any
# word and is the value the question sequences are padded with.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Questions -> lists of word ids -> one matrix, zero-padded at the end of each
# row up to the longest training question.
train_question_sequences = pad_sequences(
    tokenizer.texts_to_sequences(train_questions),
    padding='post',
)

In [None]:
# Same encoding for the test questions. The width is the longest *test*
# question, which may differ from the training matrix width.
test_question_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_questions),
    padding='post',
)

In [None]:
from keras.applications.vgg16 import VGG16
from keras.layers import GlobalAveragePooling2D

# Define the VGG16 model
# ImageNet-pretrained VGG16 without its classifier head (include_top=False),
# built on an explicit 224x224 RGB input tensor so the same input can be
# reused when the full two-input model is assembled.
image_input = Input(shape=(224, 224, 3))
cnn_model = VGG16(weights='imagenet', include_top=False, input_tensor=image_input)

In [None]:
# Collapse the VGG16 output feature map into a single vector per image by
# averaging over the spatial dimensions. The result is a symbolic tensor in
# the model graph, not a NumPy array.
image_features = GlobalAveragePooling2D()(cnn_model.output)

In [None]:
# Freeze the layers of the CNN model
# Marks every VGG16 layer as non-trainable; this runs before model.compile(),
# so the pretrained weights are excluded from training.
for layer in cnn_model.layers:
    layer.trainable = False

In [None]:
# Extract image features
# Runs the frozen VGG16 over both splits. These arrays are the raw output of
# cnn_model (the pooling layer is applied to cnn_model.output separately and
# is not part of cnn_model). They are not passed to model.fit(), which takes
# the images themselves; they are only read again by the image-matching cell.
train_image_features = cnn_model.predict(train_images)
test_image_features = cnn_model.predict(test_images)



In [None]:
# Define the LSTM model for processing questions
# Variable-length sequences of token ids are embedded into 300-d vectors;
# mask_zero=True marks id 0 as padding, and the LSTM condenses each question
# into a single 256-d vector.
question_input = Input(shape=(None,))
embedding_layer = Embedding(vocab_size, 300, mask_zero=True)(question_input)
lstm_layer = LSTM(256)(embedding_layer)

# Combine image and question features
# The pooled image vector and the question vector are concatenated and passed
# through a 512-unit ReLU layer followed by a softmax layer.
# NOTE(review): the softmax has vocab_size units, i.e. the size of the
# *question* vocabulary, while the training targets are answer class ids taken
# from label_mapping. The layer presumably should be sized by the number of
# answer classes instead — confirm and resize.
combined_features = Concatenate()([image_features, lstm_layer])
output = Dense(512, activation='relu')(combined_features)
output = Dense(vocab_size, activation='softmax')(output)

In [None]:
# Create the final model
# Two inputs, in this order: the image tensor feeding VGG16 and the padded
# question id sequence; one softmax output.
model = Model(inputs=[cnn_model.input, question_input], outputs=output)

In [None]:
# Answer vocabulary, one entry per distinct answer string.
#
# The previous dict literals repeated several keys with different ids. Python
# keeps only the last value of a repeated key, so 'না' and 'হ্যাঁ' both ended
# up as class 5, 'ঠিক' and 'সাদা' shared class 3, and ids 0 and 4 were not
# assigned to any answer — which also made the id -> answer reverse mapping
# lossy. Each answer now gets its own id, assigned by position in this list,
# giving contiguous ids 0..55 (the same id range as before).
#
# NOTE: ids differ from the old mapping, so a model trained and saved with the
# old ids must be retrained before its predictions are decoded with this one.
_answer_vocabulary = [
    'বানর বার', 'বসা', 'না', 'মল', 'ঠিক', 'সাদা', 'হ্যাঁ', 'কুকুর',
    'বাদামী', 'দুই', 'চার', 'পাঁচ', 'সবুজ', 'বাজানো', 'বিড়াল', 'নীল',
    'টেবিল', 'ছেলে', 'রৌদ্রোজ্জ্বল', 'তিন', 'কালো', 'গাছ', 'এক', 'বেঞ্চ',
    'ধূসর', 'হলুদ', 'পাখি', 'মেয়ে', 'মদ', 'মানুষ', 'লাল', 'বই',
    'হাড়', 'সকার', 'পাই', 'লগ', 'বেসবল', 'ফুটবল', 'কমলা', 'পালঙ্ক',
    'দাঁড়িয়ে', '0', 'মহিলা', 'খাবার', 'মেঝে', 'কাঠবিড়াল', 'পাটি', 'ঘুমাচ্ছেন',
    'আপেল', 'বাইক', 'উদ্ভিদ', 'কম্বল', 'ঘাস', 'চেয়ার', 'বাম', 'কিছুই না',
]

# answer string -> integer class id
label_mapping = {answer: class_id for class_id, answer in enumerate(_answer_vocabulary)}
# Kept as a separate (equal) dict because later cells reference both names.
image_label_mapping = dict(label_mapping)

In [None]:
# Encode every training answer as its integer class id (int32). Both arrays
# are derived from the same answers list, once through each mapping dict.
train_image_labels = np.array(
    [image_label_mapping[answer] for answer in train_answers],
    dtype=np.int32,
)

train_question_labels = np.array(
    [label_mapping[answer] for answer in train_answers],
    dtype=np.int32,
)

In [None]:
# The sparse categorical loss takes integer class ids as targets (no one-hot
# encoding), which matches the int32 label arrays built from label_mapping.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on (image, question) pairs against the integer answer ids.
# The targets previously referenced `train_label_encoded`, a name that is never
# defined in this notebook (NameError on a fresh kernel); the encoded answers
# built from label_mapping live in `train_question_labels`.
history = model.fit(
    [train_images, train_question_sequences],
    train_question_labels,
    epochs=25,
    batch_size=16,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
# model.save("/content/drive/MyDrive/h5files/vgg16o.h5")

In [None]:
# Persist the trained two-input model to Drive; the .h5 suffix selects the
# HDF5 file format.
model.save('/content/drive/MyDrive/vgg16_model.h5')

  saving_api.save_model(


In [None]:
from tensorflow import keras
# Load the model
# Reads back the file written by the save cell above, so evaluation and
# inference can run against the persisted copy rather than the in-memory one.
loaded_model = keras.models.load_model('/content/drive/MyDrive/vgg16_model.h5')

In [None]:
# Encode the test answers as integer class ids.
# `test_data` has been rebound to a pandas DataFrame by this point, so
# iterating `test_data.items()` yields columns and `item['answer']` raises
# KeyError. The answers were already collected into `test_answers` when the
# JSON records were read, so that list is used directly.
unknown_answers = sorted(set(test_answers) - set(label_mapping))
if unknown_answers:
    # Fail with the full list instead of the first offending key only.
    raise KeyError(f"Test answers missing from label_mapping: {unknown_answers}")

test_label_encoded = np.array(
    [label_mapping[label] for label in test_answers],
    dtype=np.int32,
)

KeyError: ignored

In [None]:
# # Evaluate the model on the test dataset
# test_loss, test_accuracy = model.evaluate([test_images, test_question_sequences], test_label_encoded)

# # Print the test accuracy
# print("Test Accuracy:", test_accuracy)

In [None]:
# Evaluate the model on the test dataset
# evaluate() returns the loss followed by the compiled metrics; the model was
# compiled with a single 'accuracy' metric, hence the two-value unpacking.
test_loss, test_accuracy = loaded_model.evaluate([test_images, test_question_sequences], test_label_encoded)

# Print the test accuracy
print("Test Accuracy:", test_accuracy)

In [None]:
# Provide an image and question
input_image_path = '/content/drive/MyDrive/bangla dataset/Images/7.png'
input_question = 'ছবিতে কোন প্রাণী আছে?'

# Set the maximum sequence length
max_question_length = 100

# Preprocess the input image and add the batch dimension the model expects
input_image = preprocess_image(input_image_path)
input_image = np.expand_dims(input_image, axis=0)

# Tokenize the question and pad it on the same side ('post') that was used for
# the training sequences; the default would pad at the front instead.
input_question_sequence = tokenizer.texts_to_sequences([input_question])
input_question_sequence = pad_sequences(
    input_question_sequence, maxlen=max_question_length, padding='post'
)

# Make prediction on the input; argmax picks the highest-scoring class index
prediction = loaded_model.predict([input_image, input_question_sequence])
predicted_label = int(np.argmax(prediction))

# Decode the predicted label. The model's output layer can have more units
# than there are ids in label_mapping, so an index without an answer is
# reported instead of raising KeyError.
label_mapping_reverse = {v: k for k, v in label_mapping.items()}
predicted_answer = label_mapping_reverse.get(predicted_label)

if predicted_answer is None:
    print("Predicted class", predicted_label, "has no answer in label_mapping.")
else:
    print("Predicted Answer:", predicted_answer)

In [None]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate
from keras.applications.inception_v3 import InceptionV3
from keras.layers import GlobalAveragePooling2D
from PIL import Image

# ... (rest of your code)

# Load a pre-trained image classification model (e.g., ResNet50)
image_classifier = ResNet50(weights='imagenet', include_top=True)

# Define a threshold for image classification confidence
image_classification_threshold = 0.5

# Provide an image and question
input_image_path = '/content/drive/MyDrive/bangla dataset/Images/2.png'
input_question = 'কেউ কি সোফায় বসে আছে?'

# Preprocess the input image for the VQA model ([0, 1] scaling, batch dim)
input_image = preprocess_image(input_image_path)
input_image = np.expand_dims(input_image, axis=0)

# The ImageNet ResNet50 expects its own preprocess_input applied to 0-255
# pixel values, not the [0, 1]-scaled array used by the VQA model. Feeding the
# scaled array directly makes the confidence score meaningless. The
# multiplication creates a fresh array, so `input_image` itself is untouched.
classifier_input = tf.keras.applications.resnet50.preprocess_input(input_image * 255.0)

# Classify the input image; the top class probability is the "confidence"
image_classification_results = image_classifier.predict(classifier_input)
image_confidence = np.max(image_classification_results)

# Check if the image is valid and the question is not empty
if image_confidence >= image_classification_threshold and input_question:
    # Tokenize the question and pad on the same side ('post') as in training.
    # `max_question_length` is defined by the earlier single-prediction cell.
    input_question_sequence = tokenizer.texts_to_sequences([input_question])
    input_question_sequence = pad_sequences(
        input_question_sequence, maxlen=max_question_length, padding='post'
    )

    # Make prediction on the input
    prediction = loaded_model.predict([input_image, input_question_sequence])
    predicted_label = int(np.argmax(prediction))

    # Decode the predicted label; report indices that have no answer instead
    # of raising KeyError.
    label_mapping_reverse = {v: k for k, v in label_mapping.items()}
    predicted_answer = label_mapping_reverse.get(predicted_label)

    if predicted_answer is None:
        print("Predicted class", predicted_label, "has no answer in label_mapping.")
    else:
        print("Predicted Answer:", predicted_answer)
else:
    print("Invalid Input! Unable to predict.")




Invalid Input! Unable to predict.


In [None]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, GlobalAveragePooling2D
from PIL import Image
# Full ImageNet VGG16 (with classifier head), used only as a confidence gate.
image_classifier = VGG16(weights='imagenet', include_top=True)

# Define a threshold for image classification confidence
image_classification_threshold = 0.5
# Provide an image and question
input_image_path = '/content/drive/MyDrive/bangla dataset/Images/7.png'
input_question = 'ছবিতে কোন প্রাণী আছে?'

# Preprocess the input image for the VQA model ([0, 1] scaling, batch dim)
input_image = preprocess_image(input_image_path)
input_image = np.expand_dims(input_image, axis=0)

# The ImageNet VGG16 classifier expects its own preprocess_input applied to
# 0-255 pixel values, not the [0, 1]-scaled array. The multiplication creates
# a fresh array, so `input_image` itself is untouched.
classifier_input = tf.keras.applications.vgg16.preprocess_input(input_image * 255.0)

# Classify the input image; the top class probability is the "confidence"
image_classification_results = image_classifier.predict(classifier_input)
image_confidence = np.max(image_classification_results)

# Check if the image is valid and the question is not empty
if image_confidence >= image_classification_threshold and input_question:
    # Tokenize the question and pad on the same side ('post') as in training.
    # `max_question_length` is defined by the earlier single-prediction cell.
    input_question_sequence = tokenizer.texts_to_sequences([input_question])
    input_question_sequence = pad_sequences(
        input_question_sequence, maxlen=max_question_length, padding='post'
    )

    # Features of the *input* image, from the same extractor and the same
    # scaling that produced train_image_features. (The comparison previously
    # used `image_features`, which is the symbolic pooling tensor of the model
    # graph, not data for this image.)
    input_image_feature = cnn_model.predict(input_image)[0]

    # Set a threshold for similarity
    feature_threshold = 0.2  # You can adjust this threshold

    # Find the first training image whose features lie within the threshold
    matching_image_id = None
    for i, train_image_feature in enumerate(train_image_features):
        feature_distance = np.linalg.norm(train_image_feature - input_image_feature)
        if feature_distance < feature_threshold:
            matching_image_id = i
            break

    if matching_image_id is not None:
        # Predict the answer using the matched training image and the question
        prediction = loaded_model.predict(
            [train_images[matching_image_id:matching_image_id + 1], input_question_sequence]
        )
        predicted_label = int(np.argmax(prediction))

        # Decode the predicted label; report indices that have no answer
        # instead of raising KeyError.
        label_mapping_reverse = {v: k for k, v in label_mapping.items()}
        predicted_answer = label_mapping_reverse.get(predicted_label)

        # NOTE(review): the former "question check" compared a feature vector
        # with itself (distance always 0), so its rejection branch could never
        # run. It has been removed; question/image consistency is NOT
        # validated here and needs a real criterion if required.
        if predicted_answer is None:
            print("Predicted class", predicted_label, "has no answer in label_mapping.")
        else:
            print("Predicted Answer:", predicted_answer)
    else:
        print("Input wrong image.")
else:
    print("Invalid Input! Unable to predict.")


Invalid Input! Unable to predict.


In [None]:
# Provide an image and question
input_image_path = '/content/drive/MyDrive/bangla dataset/Images/2.png'
input_question = 'কেউ কি সোফায় বসে আছে?'

# Set the maximum sequence length
max_question_length = 100

# Preprocess the input image and add the batch dimension the model expects
input_image = preprocess_image(input_image_path)
input_image = np.expand_dims(input_image, axis=0)

# Tokenize the question and pad it on the same side ('post') that was used for
# the training sequences; the default would pad at the front instead.
input_question_sequence = tokenizer.texts_to_sequences([input_question])
input_question_sequence = pad_sequences(
    input_question_sequence, maxlen=max_question_length, padding='post'
)

# Make prediction on the input with the in-memory (not reloaded) model
prediction = model.predict([input_image, input_question_sequence])
predicted_label = int(np.argmax(prediction))

# Decode the predicted label. The model's output layer can have more units
# than there are ids in label_mapping, so an index without an answer is
# reported instead of raising KeyError.
label_mapping_reverse = {v: k for k, v in label_mapping.items()}
predicted_answer = label_mapping_reverse.get(predicted_label)

if predicted_answer is None:
    print("Predicted class", predicted_label, "has no answer in label_mapping.")
else:
    print("Predicted Answer:", predicted_answer)


In [None]:
# Save the in-memory model a second time under a different file name; this
# does not overwrite vgg16_model.h5 written earlier.
model.save("/content/drive/MyDrive/h5files/vgg16f.h5")

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming you have a function preprocess_image() and a tokenizer
# Also, you need to define the label_mapping dictionary and the model
# Function to check if a string is empty or consists of only spaces
def is_empty_or_spaces(s):
    """Return True when the string ``s`` has no non-whitespace characters."""
    # Stripping whitespace from a blank string leaves '', which is falsy.
    return not s.strip()
# Read both inputs interactively; surrounding whitespace is stripped up front.
input_image_path = input("Enter image path: ").strip()
input_question = input("Enter question: ").strip()

# The model takes an image AND a question, so a prediction is only possible
# when both are present. The guard previously rejected the input only when
# both were empty, and the messages asked for "an image path or a question",
# which contradicts what the prediction needs.
if is_empty_or_spaces(input_image_path) or is_empty_or_spaces(input_question):
    print("Please enter both an image path and a question.")
else:
    # Set the maximum sequence length
    max_question_length = 100

    # Preprocess the input image and add the batch dimension
    input_image = preprocess_image(input_image_path)
    input_image = np.expand_dims(input_image, axis=0)

    # Tokenize the question and pad on the same side ('post') as in training
    input_question_sequence = tokenizer.texts_to_sequences([input_question])
    input_question_sequence = pad_sequences(
        input_question_sequence, maxlen=max_question_length, padding='post'
    )

    # Make prediction on the input
    prediction = model.predict([input_image, input_question_sequence])
    predicted_label = int(np.argmax(prediction))

    # Decode the predicted label; report indices that have no answer instead
    # of raising KeyError.
    label_mapping_reverse = {v: k for k, v in label_mapping.items()}
    predicted_answer = label_mapping_reverse.get(predicted_label)

    if predicted_answer is None:
        print("Predicted class", predicted_label, "has no answer in label_mapping.")
    else:
        print("Predicted Answer:", predicted_answer)