In [1]:
import clip
import torch
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
import torchvision.transforms as transforms
from sklearn.preprocessing import LabelEncoder

In [2]:
# Answer vocabularies used to split the test set into its three question types
counts = list(map(str, range(1, 6)))  # '1' .. '5', stored as strings
colors = ['black', 'brown', 'red', 'white']
positions = [
    "bed", "blinds", "books", "bookshelf", "cabinet", "chair", "clothes",
    "curtain", "door", "garbage_bin", "lamp", "mirror", "photo", "picture",
    "pillow", "shelves", "sofa", "table", "television", "towel", "window",
]

In [3]:
# Project-local definitions needed to recreate the model architectures
from ResNet_SBERT import ResNet_sBERT
from CLIP_encoder_decoder import CLIP, VQADataset

# Run on the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  from tqdm.autonotebook import tqdm, trange


# Prepare test data

In [5]:
# Load the test split
test_data = pd.read_csv('new_data_test.csv')

# Turn each answer string into an integer class id.
# NOTE(review): the encoder is fitted on the test answers only, so these ids match the
# models' output indices only if training used the same answer set — confirm.
le = LabelEncoder()
test_data['label'] = le.fit_transform(test_data['answer'])

# Image preprocessing: resize to 224x224, convert to a tensor, then normalize per channel
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Build the dataset from the frame, the image folder and the transforms
img_dir = '../data/images'
test_dataset = VQADataset(test_data, img_dir, image_transforms)

# shuffle is left at its default (off): predictions are later written back to
# test_data by position, so batches must follow row order.
test_loader = DataLoader(test_dataset, batch_size=64, num_workers=4, pin_memory=True)

In [11]:
def run_inference(model, data_loader, device):
    """Predict a class index for every sample yielded by ``data_loader``.

    The model is switched to eval mode and run without gradient tracking. Each batch
    is expected to unpack as ``(images, questions, labels)``; the labels are not
    needed for prediction and are ignored.

    Args:
        model: Module called as ``model(images, questions)``.
            Assumes it returns per-class scores with classes on dim 1 — TODO confirm.
        data_loader: Iterable of ``(images, questions, labels)`` batches.
        device: Device the image batch is moved to before the forward pass.

    Returns:
        A flat list with one predicted class index per sample, in loader order.
    """
    model.eval()
    all_predictions = []
    with torch.no_grad():
        # Labels are unpacked but never copied to the device: nothing here reads them.
        for images, questions, _labels in data_loader:
            images = images.to(device)
            # The model receives the questions as a plain list
            output = model(images, list(questions))
            predicted = output.argmax(dim=1)
            all_predictions.extend(predicted.cpu().numpy())
    return all_predictions


def compute_accuracies(data):
    """Print accuracy separately for count, color and position answers.

    Rows are grouped by whether their ``answer`` value appears in the module-level
    ``counts``, ``colors`` or ``positions`` lists. Within each group the ``label``
    column is scored against ``predicted_label``.

    Args:
        data: DataFrame with ``answer``, ``label`` and ``predicted_label`` columns.

    Returns:
        None. The three accuracies are printed on a single line.
    """
    def _subset_accuracy(vocabulary):
        # Keep only rows whose answer belongs to this vocabulary, then score them
        subset = data[data['answer'].isin(vocabulary)].reset_index(drop=True)
        return accuracy_score(subset['label'], subset['predicted_label'])

    counts_acc = _subset_accuracy(counts)
    color_acc = _subset_accuracy(colors)
    positions_acc = _subset_accuracy(positions)
    print(f'Counts accuracy: {counts_acc}, Color accuracy: {color_acc}, Positions accuracy: {positions_acc}')

# Test on ResNet_sBERT model


In [12]:
# Run inference to compute test accuracy
model = ResNet_sBERT().to(device)
# weights_only belongs to torch.load (load_state_dict does not accept it).
# map_location remaps the checkpoint's tensors onto the selected device, so a file
# saved from a GPU run still loads on a CPU-only machine.
model.load_state_dict(
    torch.load('ResNet_SBERT.pth', map_location=device, weights_only=True)
)

# Test
model.eval()
predictions = run_inference(model, test_loader, device)
# Store both the raw class ids and the answer strings they decode to
test_data['predicted_label'] = predictions
test_data['predicted_answer'] = le.inverse_transform(predictions)
compute_accuracies(test_data)

  model.load_state_dict(torch.load('ResNet_SBERT.pth'))


Counts accuracy: 0.37037037037037035, Color accuracy: 0.49382716049382713, Positions accuracy: 0.3723756906077348


# Test on CLIP model

In [13]:
# Run inference to compute test accuracy
clip_model, clip_preprocess = clip.load('ViT-B/32', device=device)
# Cast the CLIP backbone to float32 before wrapping it
clip_model = clip_model.float().to(device)
clip_model.eval()
model = CLIP(clip_model).to(device)
# map_location remaps the checkpoint's tensors onto the selected device, so a file
# saved from a GPU run still loads on a CPU-only machine. weights_only=True restricts
# unpickling to tensor data, which is all a state dict needs.
model.load_state_dict(
    torch.load('CLIP_encoder_decoder.pth', map_location=device, weights_only=True)
)

# Test
model.eval()
predictions = run_inference(model, test_loader, device)
# Overwrites the prediction columns written by any earlier model evaluation
test_data['predicted_label'] = predictions
test_data['predicted_answer'] = le.inverse_transform(predictions)
compute_accuracies(test_data)

  model.load_state_dict(torch.load('CLIP_encoder_decoder.pth'))


Counts accuracy: 0.38271604938271603, Color accuracy: 0.4567901234567901, Positions accuracy: 0.4220994475138122
