In [2]:
import warnings
import torch.nn as nn
import torch
from transformers import (
    # Preprocessing / Common
    AutoTokenizer, AutoFeatureExtractor,
    # Text & Image Models (Now, image transformers like ViTModel, DeiTModel, BEiT can also be loaded using AutoModel)
    AutoModel,
    # Training / Evaluation
    TrainingArguments, Trainer,
    # Misc
    logging
)
from typing import Dict, List, Optional, Tuple
import os
import json
# Directory holding the preprocessed VQAv2 annotation files.
VQAV2_FILEPATH = "./VQAv2 Annotations Preprocessed"

# Answer vocabulary: maps every answer string to its integer class id.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(f"{VQAV2_FILEPATH}/VQAv2_answer_mapping.json", "r", encoding="utf-8") as json_file:
    answer_to_id = json.load(json_file)

# Reverse lookup (class id -> answer string) used to decode model predictions.
id_to_answer = {v: k for k, v in answer_to_id.items()}
# Answer strings in file order; its length is the default number of output classes.
answer_space = list(answer_to_id.keys())

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# NOTE: this silences *every* Python warning for the rest of the session.
warnings.filterwarnings('ignore')

class MultimodalVQAModel_modified_GELU_noDropout(nn.Module):
    """Late-fusion VQA classifier built from a text encoder and an image encoder.

    The ``pooler_output`` of both pretrained encoders is concatenated along the
    feature axis and scored by a ``Linear -> LayerNorm -> GELU -> Linear`` head
    (no dropout) with one logit per answer class.
    """

    def __init__(
            self,
            num_labels: int = len(answer_space),
            intermediate_dim: int = 512,
            pretrained_text_name: str = 'roberta-base',
            pretrained_image_name: str = 'google/vit-base-patch16-224-in21k'):
        """Load both pretrained encoders and build the classification head.

        Args:
            num_labels: Number of answer classes (size of the output layer).
            intermediate_dim: Accepted for signature compatibility but never
                read; the head's hidden width always equals the fused feature
                size (text hidden size + image hidden size).
            pretrained_text_name: Name passed to ``AutoModel.from_pretrained``
                for the text encoder.
            pretrained_image_name: Name passed to ``AutoModel.from_pretrained``
                for the image encoder.
        """
        super().__init__()
        self.num_labels = num_labels
        self.pretrained_text_name = pretrained_text_name
        self.pretrained_image_name = pretrained_image_name

        self.text_encoder = AutoModel.from_pretrained(pretrained_text_name)
        self.image_encoder = AutoModel.from_pretrained(pretrained_image_name)

        # Width of the concatenated (text + image) pooled representation.
        text_width = self.text_encoder.config.hidden_size
        image_width = self.image_encoder.config.hidden_size
        self.encoders_hidden_size = text_width + image_width

        # Module order inside the Sequential fixes the state-dict keys
        # (classifier.0, .1, .3), so it must not be rearranged.
        fused_width = self.encoders_hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(fused_width, fused_width),
            nn.LayerNorm(fused_width),
            nn.GELU(),
            nn.Linear(fused_width, self.num_labels),
        )

        self.criterion = nn.CrossEntropyLoss()

    def forward(
            self,
            input_ids: torch.LongTensor,
            pixel_values: torch.FloatTensor,
            attention_mask: Optional[torch.LongTensor] = None,
            token_type_ids: Optional[torch.LongTensor] = None,
            labels: Optional[torch.LongTensor] = None):
        """Score every answer class for a batch of (question, image) pairs.

        Args:
            input_ids: Token ids forwarded to the text encoder.
            pixel_values: Image tensor forwarded to the image encoder.
            attention_mask: Optional mask forwarded to the text encoder.
            token_type_ids: Optional segment ids forwarded to the text encoder.
            labels: Optional target class ids; when given, the cross-entropy
                loss is computed as well.

        Returns:
            A dict with ``"logits"`` and, only when ``labels`` is provided,
            ``"loss"``.
        """
        text_out = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True,
        )
        image_out = self.image_encoder(
            pixel_values=pixel_values,
            return_dict=True,
        )

        # Fuse the two pooled vectors along the feature axis (dim=1).
        fused = torch.cat(
            [text_out['pooler_output'], image_out['pooler_output']],
            dim=1,
        )
        logits = self.classifier(fused)

        if labels is None:
            return {"logits": logits}
        return {"logits": logits, "loss": self.criterion(logits, labels)}


def createMultimodalVQAModel(text='roberta-base', image='google/vit-base-patch16-224-in21k'):
    """Build the fusion VQA model and move it to the configured ``device``.

    The tokenizer and feature extractor that used to be loaded here were never
    used or returned, so they are no longer instantiated; only the model is
    created.

    Args:
        text: Pretrained name of the text encoder.
        image: Pretrained name of the image encoder.

    Returns:
        A ``MultimodalVQAModel_modified_GELU_noDropout`` instance on ``device``.
    """
    multi_model = MultimodalVQAModel_modified_GELU_noDropout(
        pretrained_text_name=text,
        pretrained_image_name=image,
    ).to(device)

    return multi_model


def get_step(checkpoint):
    """Extract the training step from a ``checkpoint-<step>`` name.

    The step is parsed relative to the last ``checkpoint-`` marker rather than
    a fixed character offset, so a leading directory or other prefix no longer
    corrupts the result.

    Args:
        checkpoint: A checkpoint directory name or path such as
            ``"checkpoint-1500"``.

    Returns:
        The step number as an ``int``.

    Raises:
        ValueError: If the name has no ``checkpoint-`` marker or the text after
            it is not an integer.
    """
    prefix = 'checkpoint-'
    _, marker, step = checkpoint.rpartition(prefix)
    if not marker:
        raise ValueError(
            f"{checkpoint!r} is not a 'checkpoint-<step>' name")
    return int(step)

# Where the Trainer wrote its checkpoints for this experiment.
model_dir = 'Checkpoint_VQA_3129_Dropped_NaN'
model_folder = 'roberta_base_g_vit_hidden_x1_GELU_NoDrop_yes_no_untouched'
checkpoint_root = os.path.join(model_dir, model_folder)

# Only entries that *start* with 'checkpoint-' are considered, so unrelated
# names that merely contain the substring cannot break step parsing.
checkpoint_list = [
    cp for cp in os.listdir(checkpoint_root) if cp.startswith('checkpoint-')
]
if not checkpoint_list:
    raise FileNotFoundError(
        f"No 'checkpoint-<step>' entries found in {checkpoint_root!r}")

# Pick the checkpoint with the highest training step.
latest_cp = max(map(get_step, checkpoint_list))
trained_path = os.path.join(
    checkpoint_root, f'checkpoint-{latest_cp}', 'pytorch_model.bin')

model = createMultimodalVQAModel()
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
model.load_state_dict(torch.load(trained_path, map_location=device))
model.to(device)
model.eval()
print("model initialised!")

2023-04-15 23:00:55.827703: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-15 23:00:56.657578: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-15 23:00:57.603144: W tensorflow/tsl/platform/default/dso_loader.cc:66] Could not load dynamic library 'libnvinfer.so.8'; dlerror: libnvinfer.so.8: cannot open shared object file: No such file or directory
2023-04-15 23:00:57.603210: W tensorflow/tsl/platform/default/dso_loader.cc:66] Could not load dynamic library 'libnvinfer_plugin.so.8'; dler

model initialised!


In [3]:
import gradio as gr
import os
from PIL import Image



# Default tokenizer, loaded lazily and kept for the lifetime of the kernel so
# that each request does not re-read it from disk.
_DEFAULT_TOKENIZER_NAME = "roberta-base"
_tokenizer_cache = {}


def tokenize_text(texts: List[str], tokenizer=None):
    """Tokenize questions into the tensors expected by the VQA model.

    Args:
        texts: Question strings to encode.
        tokenizer: Optional callable with the Hugging Face tokenizer call
            signature. When omitted, the ``roberta-base`` tokenizer is loaded
            once and reused on later calls.

    Returns:
        A dict with ``"input_ids"``, ``"token_type_ids"`` and
        ``"attention_mask"`` tensors. Only a leading batch axis of size 1 is
        removed (a single question yields 1-D tensors); the sequence axis is
        always kept.
    """
    if tokenizer is None:
        if _DEFAULT_TOKENIZER_NAME not in _tokenizer_cache:
            _tokenizer_cache[_DEFAULT_TOKENIZER_NAME] = \
                AutoTokenizer.from_pretrained(_DEFAULT_TOKENIZER_NAME)
        tokenizer = _tokenizer_cache[_DEFAULT_TOKENIZER_NAME]

    encoded_text = tokenizer(
        text=texts,
        padding='longest',
        max_length=24,  # presumably the training-time limit; verify before changing
        truncation=True,
        return_tensors='pt',
        return_token_type_ids=True,
        return_attention_mask=True,
    )
    # squeeze(0) drops only the batch axis; a bare squeeze() would also
    # collapse a length-1 sequence axis.
    return {
        "input_ids": encoded_text['input_ids'].squeeze(0),
        "token_type_ids": encoded_text['token_type_ids'].squeeze(0),
        "attention_mask": encoded_text['attention_mask'].squeeze(0),
    }


# Default image preprocessor, loaded lazily and reused across calls.
# NOTE(review): this is the BEiT preprocessor, while the model's image encoder
# is built from 'google/vit-base-patch16-224-in21k'. The default is kept to
# preserve current behavior; confirm which preprocessor was used in training.
_DEFAULT_IMAGE_PREPROCESSOR_NAME = "microsoft/beit-base-patch16-224-pt22k-ft22k"
_image_preprocessor_cache = {}


def preprocess_images(image, preprocessor=None):
    """Convert one image into the ``pixel_values`` tensor for the VQA model.

    Args:
        image: A sequence whose first element is an image array accepted by
            ``PIL.Image.fromarray``; only ``image[0]`` is used.
        preprocessor: Optional callable with the Hugging Face feature-extractor
            call signature. When omitted, the default preprocessor is loaded
            once and reused on later calls.

    Returns:
        A dict with a single ``"pixel_values"`` tensor whose leading batch axis
        of size 1 has been removed.
    """
    pil_image = Image.fromarray(image[0]).convert('RGB')

    if preprocessor is None:
        if _DEFAULT_IMAGE_PREPROCESSOR_NAME not in _image_preprocessor_cache:
            _image_preprocessor_cache[_DEFAULT_IMAGE_PREPROCESSOR_NAME] = \
                AutoFeatureExtractor.from_pretrained(
                    _DEFAULT_IMAGE_PREPROCESSOR_NAME)
        preprocessor = _image_preprocessor_cache[_DEFAULT_IMAGE_PREPROCESSOR_NAME]

    processed_images = preprocessor(
        images=[pil_image],
        return_tensors="pt",
    )
    # squeeze(0) removes only the batch axis, never a spatial or channel axis.
    return {
        "pixel_values": processed_images['pixel_values'].squeeze(0),
    }
    




def answer(input_image, input_question):
    """Predict the answer to a question about an image.

    Args:
        input_image: Image array as delivered by the Gradio ``"image"`` input;
            ``None`` when the user did not supply an image.
        input_question: The question text.

    Returns:
        The answer string whose class received the highest logit.

    Raises:
        ValueError: If no image was provided.
    """
    if input_image is None:
        # Fail with a readable message instead of an opaque PIL error.
        raise ValueError("An image is required to answer the question.")

    textDict = tokenize_text([input_question])
    imgDict = preprocess_images([input_image])

    # The helpers return un-batched tensors; restore the batch axis and move
    # everything to the model's device.
    input_ids = textDict["input_ids"].unsqueeze(0).to(device)
    token_type_ids = textDict["token_type_ids"].unsqueeze(0).to(device)
    attention_mask = textDict["attention_mask"].unsqueeze(0).to(device)
    pixel_values = imgDict["pixel_values"].unsqueeze(0).to(device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        output = model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

    # Convert to a plain int so the dictionary lookup does not depend on a
    # numpy scalar hashing like a Python int.
    pred_id = int(output["logits"].argmax(dim=-1)[0].item())
    return id_to_answer[pred_id]

# Minimal Gradio UI: an image and a question go in, the predicted answer
# comes out as text.
demo = gr.Interface(fn=answer, inputs=["image", "text"], outputs="text")

# Start the local web server for the demo.
demo.launch()


Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


