In [10]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image





In [11]:
def _parse_vqa_line(line):
    """Parse one annotation line into a sample dict.

    The line is split on tabs; the first field gives the image reference and
    the second field holds the question and the answer separated by '?'.

    Args:
        line: One raw line from an annotation file.

    Returns:
        dict with keys 'image_path', 'question' and 'answer'.

    Raises:
        ValueError: If the line has no tab-separated second field, or that
            field contains no '?'.
    """
    fields = line.split('\t')
    if len(fields) < 2:
        raise ValueError("expected at least two tab-separated fields")

    qa = fields[1].split('?')
    if len(qa) < 2:
        raise ValueError("expected a '?' separating question and answer")

    # When the text contains exactly two '?', the answer is taken from the
    # part after the second one; otherwise from the part after the first.
    answer = qa[2].strip() if len(qa) == 3 else qa[1].strip()

    return {
        # Drop the last two characters of the first field
        # (presumably a '#<n>' suffix after the file name -- TODO confirm
        # against the raw annotation files).
        'image_path': fields[0][:-2],
        'question': qa[0] + '?',
        'answer': answer,
    }


def load_vqa_split(split_path):
    """Load every sample of one dataset split.

    Args:
        split_path: Path to a tab-separated annotation text file.

    Returns:
        List of sample dicts (see ``_parse_vqa_line``), in file order.

    Raises:
        ValueError: If a non-blank line cannot be parsed; the message carries
            the file path and the 1-based line number.
    """
    samples = []
    with open(split_path, "r") as f:
        for line_no, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing newline at end of file) carry no sample.
            if not line.strip():
                continue
            try:
                samples.append(_parse_vqa_line(line))
            except ValueError as exc:
                raise ValueError(f"{split_path}:{line_no}: {exc}") from exc
    return samples


# Annotation files for each split
train_set_path = 'data/vaq2.0.TrainImages.txt'
val_set_path = 'data/vaq2.0.DevImages.txt'
test_set_path = 'data/vaq2.0.TestImages.txt'

# Load train / val / test data
train_data = load_vqa_split(train_set_path)
val_data = load_vqa_split(val_set_path)
test_data = load_vqa_split(test_set_path)

# Answer classes are defined by the training split only.
classes = set([sample['answer'] for sample in train_data])

# Enumerate in sorted order: iteration order of a set of strings can differ
# between interpreter sessions, which would silently change the label indices.
classes_to_idx = {cls_name: idx for idx, cls_name in enumerate(sorted(classes))}

idx_to_classes = {idx: cls_name for cls_name, idx in classes_to_idx.items()}


In [16]:
# Display the parsed validation samples (a list of dicts with
# 'image_path', 'question' and 'answer' keys) as the cell output.
val_data

[{'image_path': 'COCO_val2014_000000262175.jpg',
  'question': 'Is this a designer tie ?',
  'answer': 'no'},
 {'image_path': 'COCO_val2014_000000393284.jpg',
  'question': 'Is this man snowboarding ?',
  'answer': 'yes'},
 {'image_path': 'COCO_val2014_000000000133.jpg',
  'question': 'Is this a child room ?',
  'answer': 'yes'},
 {'image_path': 'COCO_val2014_000000000133.jpg',
  'question': 'Could this be child room ?',
  'answer': 'yes'},
 {'image_path': 'COCO_val2014_000000240323.jpg',
  'question': 'Is this a vegetarian meal ?',
  'answer': 'no'},
 {'image_path': 'COCO_val2014_000000524450.jpg',
  'question': 'Could this be tour bus ?',
  'answer': 'yes'},
 {'image_path': 'COCO_val2014_000000262386.jpg',
  'question': 'Is this a metal toilet ?',
  'answer': 'no'},
 {'image_path': 'COCO_val2014_000000262386.jpg',
  'question': 'Is this a private bathroom ?',
  'answer': 'yes'},
 {'image_path': 'COCO_val2014_000000000294.jpg',
  'question': 'Is this a home kitchen ?',
  'answer': 'ye

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.applications.vgg16 import VGG16, preprocess_input
# from keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np

# Initialize VGG16 model for image feature extraction.
# include_top=False leaves out the fully-connected classification layers, so
# the model outputs convolutional feature maps instead of class scores.
vgg_model = VGG16(weights='imagenet', include_top=False)

# Folder that the image feature extraction function joins each sample's
# 'image_path' onto. It can be overridden with the VQA_IMAGE_DIR environment
# variable so the notebook is not tied to one machine; the default is the
# original location.
image_folder_path = os.environ.get(
    'VQA_IMAGE_DIR',
    '/space/hotel/bachn/VQA/data/val2014-resised',
)

def extract_image_features(image_path):
    """Return a flat VGG16 feature vector for a single image.

    Args:
        image_path: Image file name/path, joined onto ``image_folder_path``.

    Returns:
        1-D numpy array: the flattened output of ``vgg_model`` for the image
        after loading it at 224x224 and applying ``preprocess_input``.
    """
    full_image_path = os.path.join(image_folder_path, image_path)
    image = load_img(full_image_path, target_size=(224, 224))

    # The model expects a batch dimension, so wrap the single image in one.
    batch = np.expand_dims(img_to_array(image), axis=0)
    batch = preprocess_input(batch)

    # This function is called once per sample; verbose=0 keeps Keras versions
    # that show a progress bar by default from printing one on every call.
    features = vgg_model.predict(batch, verbose=0)
    return features.flatten()


# Text features: a TF-IDF vectorizer capped at 1000 features, fitted on the
# questions of the training split only.
tfidf_vectorizer = TfidfVectorizer(max_features=1000).fit(
    [sample['question'] for sample in train_data]
)

def extract_text_features(text):
    """Return the TF-IDF vector of one question as a flat dense array.

    Args:
        text: The question string to vectorize.

    Returns:
        1-D numpy array produced by the fitted ``tfidf_vectorizer``.
    """
    # transform() takes a collection of documents, so pass a one-item list.
    tfidf_matrix = tfidf_vectorizer.transform([text])
    dense_row = tfidf_matrix.toarray()
    return dense_row.flatten()


In [13]:
def fuse_features(image_path, question):
    """Build one feature vector from an image and its question.

    Args:
        image_path: Image reference passed to ``extract_image_features``.
        question: Question text passed to ``extract_text_features``.

    Returns:
        1-D numpy array: image features followed by text features.
    """
    parts = (
        extract_image_features(image_path),
        extract_text_features(question),
    )
    return np.concatenate(parts)


In [14]:
from sklearn.ensemble import RandomForestClassifier

# Prepare training data: one fused image+question vector per sample, and the
# integer class index of its answer.
X_train = [fuse_features(sample['image_path'], sample['question']) for sample in train_data]
y_train = [classes_to_idx[sample['answer']] for sample in train_data]

# Initialize and train the model.
# A fixed random_state makes the forest, and therefore the reported
# accuracies, reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)




In [15]:
from sklearn.metrics import accuracy_score

# Build the validation feature matrix and label list sample by sample.
X_val = []
y_val = []
for sample in val_data:
    X_val.append(fuse_features(sample['image_path'], sample['question']))
    y_val.append(classes_to_idx[sample['answer']])

# Score the fitted model on the validation split.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Validation Accuracy:", accuracy)


Validation Accuracy: 0.7233606557377049


In [18]:
# Score the fitted model on the data it was trained on, for comparison with
# the validation accuracy.
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print("Training Accuracy:", train_accuracy)


Training Accuracy: 0.99987254652052
