In [3]:
## TEST USING OSCAR CHECKPOINT

# load examples with object features (features.tsv file)
import pandas as pd
import os
import ast
import json
import sys
sys.path.append("/fsx/harman/Oscar")
from oscar.modeling.modeling_bert import BertImgForPreTraining
from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertTokenizer)
from transformers import BertTokenizer
import numpy as np
import base64
from tqdm import tqdm
import torch



def get_img_feats(data_dir, file_name='features.tsv'):
    """Load a headerless, tab-separated feature file into a dict keyed by image name.

    Column 0 of the file becomes the key, column 1 is parsed with
    ``ast.literal_eval`` and stored as ``'num_boxes'``, and column 2 is
    stored unchanged as ``'features'``.

    Args:
        data_dir: directory containing the TSV file.
        file_name: name of the TSV file inside ``data_dir``.

    Returns:
        dict of the form ``{image_name: {'num_boxes': ..., 'features': ...}}``.
    """
    tsv_path = os.path.join(data_dir, file_name)
    img_feats_df = pd.read_csv(
        tsv_path,
        sep='\t',
        header=None,
        converters={1: ast.literal_eval},
        index_col=0,
    )

    # Materialise each row once; row.name is the index label (column 0 of the file).
    rows = (img_feats_df.iloc[pos] for pos in range(len(img_feats_df)))
    return {
        row.name: {'num_boxes': row[1], 'features': row[2]}
        for row in rows
    }

def get_img_predictions(data_dir, file_name='predictions.tsv'):
    """Load a headerless, tab-separated predictions file into a dict keyed by image name.

    Column 0 of the file becomes the key; column 1 is parsed as JSON and its
    ``'objects'`` entry is stored as the value.

    Args:
        data_dir: directory containing the TSV file.
        file_name: name of the TSV file inside ``data_dir``.

    Returns:
        dict of the form ``{image_name: <value of the 'objects' key>}``.
    """
    tsv_path = os.path.join(data_dir, file_name)
    img_predictions_df = pd.read_csv(
        tsv_path,
        sep='\t',
        header=None,
        converters={1: json.loads},
        index_col=0,
    )

    # Materialise each row once; row.name is the index label (column 0 of the file).
    rows = (img_predictions_df.iloc[pos] for pos in range(len(img_predictions_df)))
    return {row.name: row[1]['objects'] for row in rows}

In [4]:
# get_img_predictions(winoroot)

In [5]:
# data_dir = "/fsx/harman/data/VinVL_img_features/winoground"
# file_name = "winoground_test.feature.tsv"
# img_feats_df = pd.read_csv(os.path.join(data_dir, file_name),sep='\t',header=None,converters={1:ast.literal_eval},index_col=0)

In [6]:
# img_feats_df.iloc[0][1]

In [7]:
winoroot = "/fsx/harman/data/VinVL_img_features/winoground"

# img_feats: dict of the form {imgname: {'num_boxes': 37, 'features': 'AAAAAAAAAAAAAAAA...'}}
img_feats = get_img_feats(
    winoroot,
    file_name='winoground_test.feature.tsv',
)

# img_preds: dict of the form
# {imgname: [{'rect': [0.0, 280.77, 1127.89, 1326.78], 'bbox_id': 0, 'class': 'man',
#             'conf': 0.9323, 'feature': '+rRRPwAA...'}, ...]}
img_preds = get_img_predictions(
    winoroot,
    file_name='predictions.tsv',
)



In [8]:
# img_preds['ex_38_img_1']

In [9]:
# Image names in the order they were inserted into img_feats.
imgkeys = list(img_feats)

In [10]:
# img_feats[imgkeys[0]]
# imgkeys[3]

In [11]:
# Load the raw Winoground examples: the file is read line by line and each
# line is parsed as one JSON document.
with open('/fsx/harman/data/raw_data/winoground/examples.jsonl', 'r') as json_file:
    json_list = json_file.readlines()

winoground_raw = [json.loads(json_str) for json_str in json_list]

In [12]:
# winoground_raw

In [82]:
## load oscar model
# Registry of (config class, model class, tokenizer class) per model type.
MODEL_CLASSES = {
    'bert': (BertConfig, BertImgForPreTraining, BertTokenizer),
}

# Single-checkpoint selection; note that the evaluation loop below reuses the
# name `modelpath` as its loop variable, so this value is overwritten there.
# modelpath = '/fsx/harman/Oscar/pretrained_models/vqa/base/checkpoint-2000000'
modelpath = '/fsx/harman/Oscar/pretrained_models/pretrained_base/checkpoint-2000000'
# modelpath = "/checkpoints/harman/oscar/88/checkpoint-0240000"
# modelpath = '/checkpoints/harman/oscar/87/checkpoint-0240000'
# modelpath = '/checkpoints/harman/oscar/120/checkpoint-0300000'
# modelpath = '/checkpoints/harman/oscar/121/checkpoint-0300000'

# Checkpoints evaluated by the loop below.
# (A stray "," that used to sit on its own line here was a SyntaxError and has been removed.)
# model_paths = ['/fsx/harman/Oscar/pretrained_models/pretrained_base/checkpoint-2000000', '/checkpoints/harman/oscar/120/checkpoint-0300000', '/checkpoints/harman/oscar/121/checkpoint-0300000']
model_paths = ['/checkpoints/harman/oscar/87/checkpoint-0240000', '/checkpoints/harman/oscar/88/checkpoint-0240000']

# Indices into the model's 3-value contrastive output that are used as the
# pair score; each listed index is evaluated separately for every checkpoint.
use_probs = [0, 2]
# use_prob = 0
# use_prob = 2

for modelpath in model_paths:
    for use_prob in use_probs:
        print(f'modelpath = {modelpath}')
        print(f'use_prob = {use_prob}')

        # Resolve the classes through the registry so that changing
        # MODEL_CLASSES actually changes what gets instantiated.
        config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
        config = config_class.from_pretrained(modelpath)

        # Overrides applied on top of the checkpoint's stored config before
        # the model is instantiated.
        config.img_layer_norm_eps = 1e-12
        config.use_img_layernorm = 1
        config.img_feature_dim = 2054
        config.img_feature_type = "faster_r-cnn"
        config.hidden_dropout_prob = 0.3
        config.num_contrast_classes = 3
        config.output_hidden_states = True
        config.obj_relation_vocab_size = 51
        config.use_sg = False

        model = model_class.from_pretrained(
            modelpath,
            config=config,
            ignore_mismatched_sizes=True,
        )
        model = model.cuda()
        model.eval()  # evaluation mode for scoring

        tokenizer = tokenizer_class.from_pretrained(
            modelpath,
            do_lower_case=True,
        )

        # winoground_raw[0]

        def _truncate_seq_pair(tokens_a, tokens_b, max_length):
            """Truncates a sequence pair in place to the maximum length."""

            # This is a simple heuristic which will always truncate the longer sequence
            # one token at a time. This makes more sense than truncating an equal percent
            # of tokens from each, since if one sequence is very short then each token
            # that's truncated likely contains more information than a longer sequence.
            while True:
                total_length = len(tokens_a) + len(tokens_b)
                if total_length <= max_length:
                    break
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()

        def oscar_processor(texta, textb, img_feat, max_seq_length=35, max_img_seq_length=50):
            """Turn one (text, tags, region-features) triple into batched model inputs.

            Args:
                texta: first text segment (the caption at the call sites).
                textb: second text segment (space-joined object tags at the
                    call sites); must tokenize to at least one token.
                img_feat: 2-D torch tensor of region features, one row per region.
                max_seq_length: fixed length of the text part, including the
                    [CLS] token and the two [SEP] tokens.
                max_img_seq_length: number of region rows kept; longer inputs
                    are truncated and shorter ones zero-padded.

            Returns:
                Tuple ``(input_ids, segment_ids, input_mask, images)``. Each has a
                leading batch dimension of 1 and lives on the CUDA device.
                ``input_ids`` and ``segment_ids`` cover the text positions only;
                ``input_mask`` covers the text positions followed by the image
                positions.

            Raises:
                AssertionError: if ``textb`` yields no tokens.
            """
            tokens_a = tokenizer.tokenize(texta)
            tokens_b = tokenizer.tokenize(textb)

            # Three positions are reserved for [CLS], [SEP] and the final [SEP].
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
            assert len(tokens_a) + len(tokens_b) <= max_seq_length - 3
            assert len(tokens_b) > 0

            # Segment 0: [CLS] + tokens_a + [SEP]; segment 1: tokens_b + [SEP].
            tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
            segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            # Zero-pad the text part up to the fixed sequence length.
            text_pad = max_seq_length - len(input_ids)
            input_ids = input_ids + [0] * text_pad
            input_mask = input_mask + [0] * text_pad
            segment_ids = segment_ids + [0] * text_pad

            # Bring the region features to exactly max_img_seq_length rows.
            if img_feat.shape[0] >= max_img_seq_length:
                img_feat = img_feat[0:max_img_seq_length, ]
                img_feat_len = img_feat.shape[0]
            else:
                img_feat_len = img_feat.shape[0]
                padding_matrix = torch.zeros((max_img_seq_length - img_feat_len, img_feat.shape[1]))
                img_feat = torch.cat((img_feat, padding_matrix), 0)

            # Attend to real regions (1) and ignore padded rows (0). After the
            # truncation above img_feat_len can never exceed max_img_seq_length,
            # so a single expression covers every case.
            if max_img_seq_length > 0:
                input_mask = input_mask + [1] * img_feat_len + [0] * (max_img_seq_length - img_feat_len)

            device = torch.device("cuda")

            def _as_long_batch(values):
                # Python list -> LongTensor of shape (1, len(values)) on the device.
                return torch.tensor(values, dtype=torch.long).unsqueeze(0).to(device=device, non_blocking=True)

            images = img_feat.unsqueeze(0).to(device=device, non_blocking=True)
            input_ids = _as_long_batch(input_ids)
            input_mask = _as_long_batch(input_mask)
            segment_ids = _as_long_batch(segment_ids)

            return input_ids, segment_ids, input_mask, images


        # Score all four caption/image pairings of every Winoground example using
        # the model's contrastive output (see how it is computed in Oscar's BERT model).
        winoground_oscar_contrastive_scores = []
        img_feature_dim = 2054  # number of values per stored region-feature row

        # Inference only: no_grad() avoids recording autograd graphs for the
        # forward passes; the resulting scores are unchanged.
        with torch.no_grad():
            for example in tqdm(winoground_raw):
                # For each of the two images: the space-joined detected object
                # classes and the decoded region-feature tensor.
                image_inputs = {}
                for img_idx in (0, 1):
                    img_key = example[f"image_{img_idx}"]
                    obj_tags = ' '.join([cur_d['class'] for cur_d in img_preds[img_key]])
                    feat_entry = img_feats[img_key]
                    feat = np.frombuffer(base64.b64decode(feat_entry['features']), dtype=np.float32)
                    feat = feat.reshape((feat_entry['num_boxes'], img_feature_dim))
                    # np.array() copies the buffer-backed array before it is wrapped in a tensor.
                    image_inputs[img_idx] = (obj_tags, torch.from_numpy(np.array(feat)))

                # Keys are inserted in the order id, c0_i0, c0_i1, c1_i0, c1_i1.
                scores = {"id": example["id"]}
                for cap_idx in (0, 1):
                    for img_idx in (0, 1):
                        obj_tags, feat = image_inputs[img_idx]
                        input_ids, segment_ids, input_mask, images = oscar_processor(
                            texta=example[f"caption_{cap_idx}"],
                            textb=obj_tags,
                            img_feat=feat,
                        )
                        outputs = model(input_ids, segment_ids, input_mask, img_feats=images)
                        # outputs[1] is viewed as 3 values (presumably the logits of the
                        # contrastive classes); use_prob selects which one is the score.
                        scores[f"c{cap_idx}_i{img_idx}"] = outputs[1].view(3)[use_prob].item()

                winoground_oscar_contrastive_scores.append(scores)

        # calculate scores
        # The direction of "better" depends on which output index was used as the
        # score: for index 2 the correct pairing must score lower than the swapped
        # one, for index 0 it must score higher.
        if use_prob == 2:
            def _prefers(correct_pair, swapped_pair):
                return correct_pair < swapped_pair
        elif use_prob == 0:
            def _prefers(correct_pair, swapped_pair):
                return correct_pair > swapped_pair
        else:
            raise NotImplementedError

        def text_correct(result):
            """True when, for each image, its own caption is preferred over the other caption."""
            return _prefers(result["c0_i0"], result["c1_i0"]) and _prefers(result["c1_i1"], result["c0_i1"])

        def image_correct(result):
            """True when, for each caption, its own image is preferred over the other image."""
            return _prefers(result["c0_i0"], result["c0_i1"]) and _prefers(result["c1_i1"], result["c1_i0"])

        def group_correct(result):
            """True when both the image and the text criteria hold."""
            return image_correct(result) and text_correct(result)

        contrastive_text_correct_count = sum(
            1 for result in winoground_oscar_contrastive_scores if text_correct(result))
        contrastive_image_correct_count = sum(
            1 for result in winoground_oscar_contrastive_scores if image_correct(result))
        contrastive_group_correct_count = sum(
            1 for result in winoground_oscar_contrastive_scores if group_correct(result))

        # Each metric is reported as the fraction of examples that satisfy it.
        denominator = len(winoground_oscar_contrastive_scores)
        print("contrastive text score:", contrastive_text_correct_count/denominator)
        print("contrastive image score:", contrastive_image_correct_count/denominator)
        print("contrastive group score:", contrastive_group_correct_count/denominator)


modelpath = /checkpoints/harman/oscar/87/checkpoint-0240000
use_prob = 0


Some weights of the model checkpoint at /checkpoints/harman/oscar/87/checkpoint-0240000 were not used when initializing BertImgForPreTraining: ['cls.obj_relation.weight', 'cls.obj_relation.bias']
- This IS expected if you are initializing BertImgForPreTraining from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertImgForPreTraining from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:18<00:00, 21.74it/s]


contrastive text score: 0.23
contrastive image score: 0.0725
contrastive group score: 0.0525
modelpath = /checkpoints/harman/oscar/87/checkpoint-0240000
use_prob = 2


Some weights of the model checkpoint at /checkpoints/harman/oscar/87/checkpoint-0240000 were not used when initializing BertImgForPreTraining: ['cls.obj_relation.weight', 'cls.obj_relation.bias']
- This IS expected if you are initializing BertImgForPreTraining from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertImgForPreTraining from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:18<00:00, 21.81it/s]


contrastive text score: 0.235
contrastive image score: 0.065
contrastive group score: 0.0475
modelpath = /checkpoints/harman/oscar/88/checkpoint-0240000
use_prob = 0


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:18<00:00, 22.05it/s]


contrastive text score: 0.2775
contrastive image score: 0.0925
contrastive group score: 0.045
modelpath = /checkpoints/harman/oscar/88/checkpoint-0240000
use_prob = 2


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:18<00:00, 21.91it/s]

contrastive text score: 0.28
contrastive image score: 0.0975
contrastive group score: 0.0675



