In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.4 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 20.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempti

In [2]:
import argparse
from dataclasses import dataclass
import json
import logging
import os
import random
import sys
import time
import warnings
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import transformers
#import wandb
from tqdm.auto import tqdm, trange


logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s: %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

In [3]:
def get_data(file_path, sample, num_choices):
    """ Reads jsonl file (download links in readme) """
    data_file = open(file_path, "r")
    logger.info("Reading QA instances from jsonl dataset at: %s", file_path)
    item_jsons = []
    item_ids = []
    questions = []
    choice_lists = []
    answer_ids = []
    for line in data_file:
        item_jsons.append(json.loads(line.strip()))

    if sample != -1:
        item_jsons = random.sample(item_jsons, sample)
        logger.info("Sampling %d examples", sample)

    for item_json in tqdm(item_jsons,total=len(item_jsons)):
        item_id = item_json["id"]

        question_text = item_json["question"]["stem"]

        choice_label_to_id = {}
        choice_text_list = []
        choice_context_list = []
        choice_label_list = []
        choice_annotations_list = []

        any_correct = False
        choice_id_correction = 0

        for choice_id, choice_item in enumerate(item_json["question"]["choices"]):
            choice_label = choice_item["label"]
            choice_label_to_id[choice_label] = choice_id - choice_id_correction
            choice_text = choice_item["text"]

            choice_text_list.append(choice_text)
            choice_label_list.append(choice_label)

            if item_json.get('answerKey') == choice_label:
                if any_correct:
                    raise ValueError("More than one correct answer found for {item_json}!")
                any_correct = True


        if not any_correct and 'answerKey' in item_json:
            raise ValueError("No correct answer found for {item_json}!")


        answer_id = choice_label_to_id.get(item_json.get("answerKey"))
        # Pad choices with empty strings if not right number
        if len(choice_text_list) != num_choices:
            choice_text_list = (choice_text_list + num_choices * [''])[:num_choices]
            choice_context_list = (choice_context_list + num_choices * [None])[:num_choices]
            if answer_id is not None and answer_id >= num_choices:
                logging.warning(f"Skipping question with more than {num_choices} answers: {item_json}")
                continue

        item_ids.append(item_id)
        questions.append(question_text)
        choice_lists.append(choice_text_list)
        answer_ids.append(answer_id)

    data_file.close()
    return questions, choice_lists, answer_ids

In [4]:
@dataclass
class CustomArguments(transformers.TrainingArguments):
    sample_train: int = 0
    sample_eval: int = 0
    num_choices: int = 0
    model_name_or_path: str = "asdf"  # this is no longer a TrainingArgument attribute
        
    # python dataclasses cannot have positional attributes in subclass,
    # so give all attributes defaults and then make sure they are changed
    def __post_init__(self):
        if not (self.sample_train * self.sample_eval * self.num_choices) or \
               self.model_name_or_path == "asdf":  # make sure none are still default value
            raise TypeError("__init__ missing required argument(s)")

def get_args():
    """ Set hyperparameters """
    args = CustomArguments(
        output_dir="checkpoint",
        model_name_or_path="gpt2",
        overwrite_output_dir=True,
        do_train=False,  # Zero shot
        do_eval=True,
        per_device_eval_batch_size=2,
        learning_rate=1e-5,  # Should not matter because not training
        weight_decay=0.1,
        save_total_limit=2,
        seed=123,
        sample_train=200,
        sample_eval=-1,
        num_choices=2,
    )
    
    return args

In [5]:
class BERTDataset(Dataset):  # Only difference is that BERTDataset has token_type_ids while RoBERTaDataset doesn't
    
    def __init__(self, questions, choices, answer_ids, tokenizer):
        out = tokenizer(questions)
        self.input_ids = out["input_ids"]
        self.token_type_ids = out["token_type_ids"]
        self.attention_mask = out["attention_mask"]
        self.questions = questions
        self.choices = choices
        self.answer_ids = answer_ids
        
    def __len__(self):
        return len(self.questions)

    def __getitem__(self, i):
        return {
            "input_ids": self.input_ids[i], 
            "attention_mask": self.attention_mask[i], 
            "token_type_ids": self.token_type_ids[i],
            "choice_list": self.choices[i], 
            "answer_id": self.answer_ids[i],
        }
    

class RoBERTaDataset(Dataset):
    
    def __init__(self, questions, choices, answer_ids, tokenizer):
        if any(prefix in args.model_name_or_path.lower() for prefix in ("roberta", "bart")):
            questions = [question.replace('[MASK]','<mask>') for question in questions]
        out = tokenizer(questions, max_length=25, padding="max_length")
        self.input_ids = out["input_ids"]
        self.attention_mask = out["attention_mask"]
        self.questions = questions
        self.choices = choices
        self.answer_ids = answer_ids
        
    def __len__(self):
        return len(self.questions)

    def __getitem__(self, i):
        return {
            "input_ids": self.input_ids[i], 
            "attention_mask": self.attention_mask[i], 
            "choice_list": self.choices[i], 
            "answer_id": self.answer_ids[i],
        }

In [28]:
def get_sentence_prob(input_ids, logits):
    # Multiplies together individual probabilities to get overall sentence probability
    logits = torch.nn.functional.softmax(logits, dim=2)
    probs = torch.gather(logits, 2, input_ids.unsqueeze(-1))
    probs = probs.squeeze(-1)
    probs = probs * 1e4  # product is zero otherwise
    probs = torch.sum(torch.log(probs), dim=1)
    return probs

In [29]:
def evaluate_taskwithmask(args, model, tokenizer, eval_dataset, data_path):
    """ 
    Evaluates gpt2 on the masked task dataset 
    """
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.per_device_eval_batch_size)

    logger.info(f"***** Running evaluation  *****")
    logger.info(f"  Num examples = {len(eval_dataset)}")
    logger.info(f"  Batch size = {args.eval_batch_size}")
    eval_dataloader = tqdm(eval_dataloader, desc="Evaluating")
    
    #encoding all the labels
    label_dict = {}
    label_encodings = {}
    list_of_labels = eval_dataset[0]['choice_list']
    i=0
    for label in list_of_labels:
      label_dict[label] = i
      label1 = " "+ label
      label_encodings[label1] = tokenizer.encode(label1, add_special_tokens=False)[0]
      i+=1

    label_id_encoding_map = dict(zip(label_dict.values(),label_encodings.values()))

    all_answers = []
    all_preds = []
    first_age = []
    second_age = []
    first_object = []
    second_object = []
    
    #create list of true answers =  all_answers 
    for batch in eval_dataloader:
        original_batch = batch          
        model.eval()
        for i in range(len(batch["answer_id"])):
            true_label_id = batch["answer_id"][i]
            actual_label = batch["choice_list"][true_label_id][i]
            label_id_to_append = label_dict[actual_label]
            all_answers.append(label_id_to_append)
           

        del batch["choice_list"] 
        for key in batch:
            if key != "answer_id":
                batch[key] = torch.stack(batch[key], dim=-1)

            batch[key] = batch[key].cuda()
      
        
        answer_ids = batch.pop("answer_id")
        label_encoding_list = list(label_encodings.values())
        no_of_labels = len(label_encoding_list)

        #create the list for age1, age2 for age task and list of objects for object comparison task
        if data_path == "data/number_comparison_age_compare_masked_dev.jsonl":
            age1 = tokenizer.decode(batch["input_ids"][:, 1]).split(" ")
            age2 = tokenizer.decode(batch["input_ids"][:, 11]).split(" ")
            age1 = age1[1:]
            age2 = age2[1:]
            first_age.extend(age1)
            second_age.extend(age2)


        with torch.no_grad():
            #generate probablities for all the labels
            
            list_of_mask_index = []
           
            for i in range(len(batch["input_ids"])):
                  question = batch["input_ids"][i]
                  MASK_INDEX = (question==tokenizer.mask_token_id).nonzero().item()
                  batch["input_ids"][i, MASK_INDEX] =  label_id_encoding_map[0]
                  list_of_mask_index.append(MASK_INDEX)
            
            outputs = model(**batch)
            logits = outputs.logits
            id0_prob = get_sentence_prob(batch["input_ids"], logits)
           
            m=0
            for i in range(len(batch["input_ids"])):
                question = batch["input_ids"][i]
                MASK_INDEX = list_of_mask_index[m]
                batch["input_ids"][i, MASK_INDEX] =  label_id_encoding_map[1]
                m +=1
      
            outputs = model(**batch)
            logits = outputs.logits
            id1_prob = get_sentence_prob(batch["input_ids"], logits)
            
            if no_of_labels==3:
              m=0
              for i in range(len(batch["input_ids"])):
                  question = batch["input_ids"][i]
                  MASK_INDEX = list_of_mask_index[m]
                  batch["input_ids"][i, MASK_INDEX] =  label_id_encoding_map[2]
                  m +=1
                  
              outputs = model(**batch)
              logits = outputs.logits
              id2_prob = get_sentence_prob(batch["input_ids"], logits)

            if no_of_labels==5:
                m=0
                for i in range(len(batch["input_ids"])):
                    question = batch["input_ids"][i]
                    MASK_INDEX = list_of_mask_index[m]
                    batch["input_ids"][i, MASK_INDEX] =  label_id_encoding_map[2]
                    m +=1
                
                outputs = model(**batch)
                logits = outputs.logits
                id2_prob = get_sentence_prob(batch["input_ids"], logits)

                m=0
                for i in range(len(batch["input_ids"])):
                    question = batch["input_ids"][i]
                    MASK_INDEX = list_of_mask_index[m]
                    batch["input_ids"][i, MASK_INDEX] =  label_id_encoding_map[3]
                    m +=1
                outputs = model(**batch)
                logits = outputs.logits
                id3_prob = get_sentence_prob(batch["input_ids"], logits)

                m=0
                for i in range(len(batch["input_ids"])):
                    question = batch["input_ids"][i]
                    MASK_INDEX = list_of_mask_index[m]
                    batch["input_ids"][i, MASK_INDEX] =  label_id_encoding_map[4]
                    m +=1
                  
                outputs = model(**batch)
                logits = outputs.logits
                id4_prob = get_sentence_prob(batch["input_ids"], logits)
  
        batch_size = len(batch["input_ids"])
        #create all_preds
        if no_of_labels ==2:
          test_pred = torch.gt(id0_prob, id0_prob)
          id0_prob = torch.reshape(id0_prob, (batch_size, 1))
          id1_prob = torch.reshape(id1_prob, (batch_size, 1))
          combine_prob = torch.cat((id0_prob, id1_prob), dim=1)
          preds = list(torch.argmax(combine_prob, dim=1))
          all_preds.extend(preds)
        elif no_of_labels ==3:
          id0_prob = torch.reshape(id0_prob, (batch_size, 1))
          id1_prob = torch.reshape(id1_prob, (batch_size, 1))
          id2_prob = torch.reshape(id2_prob, (batch_size, 1))
          combine_prob = torch.cat((id0_prob, id1_prob, id2_prob), dim=1)
          preds = list(torch.argmax(combine_prob, dim=1))
          all_preds.extend(preds)
        elif no_of_labels ==5:
          id0_prob = torch.reshape(id0_prob, (batch_size, 1))
          id1_prob = torch.reshape(id1_prob,(batch_size, 1))
          id2_prob = torch.reshape(id2_prob, (batch_size, 1))
          id3_prob = torch.reshape(id3_prob, (batch_size, 1))
          id4_prob = torch.reshape(id4_prob, (batch_size, 1))
          combine_prob = torch.cat((id0_prob, id1_prob, id2_prob, id3_prob, id4_prob), dim=1)
          preds = list(torch.argmax(combine_prob, dim=1))
          all_preds.extend(preds)
    if data_path == "data/number_comparison_age_compare_masked_dev.jsonl":
      first_age = [int(age) for age in first_age]
      second_age = [int(age) for age in second_age]
      return all_answers, all_preds, first_age, second_age
    return all_answers, all_preds

In [20]:
!wget https://olmpics.s3.us-east-2.amazonaws.com/challenge/number_comparison/number_comparison_age_compare_masked_dev.jsonl.gz
!wget https://olmpics.s3.us-east-2.amazonaws.com/challenge/antonym_synonym_negation/antonym_synonym_negation_dev.jsonl.gz 
!wget https://olmpics.s3.us-east-2.amazonaws.com/challenge/size_comparison/size_comparison_dev.jsonl.gz
!wget https://olmpics.s3.us-east-2.amazonaws.com/challenge/compositional_comparison/compositional_comparison_dev.jsonl.gz
!wget https://olmpics.s3.us-east-2.amazonaws.com/challenge/coffee_cats_quantifiers/coffee_cats_quantifiers_dev.jsonl.gz


--2021-10-18 13:27:30--  https://olmpics.s3.us-east-2.amazonaws.com/challenge/number_comparison/number_comparison_age_compare_masked_dev.jsonl.gz
Resolving olmpics.s3.us-east-2.amazonaws.com (olmpics.s3.us-east-2.amazonaws.com)... 52.219.142.58
Connecting to olmpics.s3.us-east-2.amazonaws.com (olmpics.s3.us-east-2.amazonaws.com)|52.219.142.58|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12544 (12K) [binary/octet-stream]
Saving to: ‘number_comparison_age_compare_masked_dev.jsonl.gz’


2021-10-18 13:27:30 (129 MB/s) - ‘number_comparison_age_compare_masked_dev.jsonl.gz’ saved [12544/12544]

--2021-10-18 13:27:31--  https://olmpics.s3.us-east-2.amazonaws.com/challenge/antonym_synonym_negation/antonym_synonym_negation_dev.jsonl.gz
Resolving olmpics.s3.us-east-2.amazonaws.com (olmpics.s3.us-east-2.amazonaws.com)... 52.219.142.58
Connecting to olmpics.s3.us-east-2.amazonaws.com (olmpics.s3.us-east-2.amazonaws.com)|52.219.142.58|:443... connected.
HTTP request sent

In [21]:
!gunzip number_comparison_age_compare_masked_dev.jsonl.gz
!gunzip antonym_synonym_negation_dev.jsonl.gz
!gunzip size_comparison_dev.jsonl.gz
!gunzip compositional_comparison_dev.jsonl.gz
!gunzip coffee_cats_quantifiers_dev.jsonl.gz

gzip: number_comparison_age_compare_masked_dev.jsonl already exists; do you wish to overwrite (y or n)? ^C
gzip: antonym_synonym_negation_dev.jsonl already exists; do you wish to overwrite (y or n)? ^C
gzip: size_comparison_dev.jsonl already exists; do you wish to overwrite (y or n)? ^C
gzip: compositional_comparison_dev.jsonl already exists; do you wish to overwrite (y or n)? n
	not overwritten
gzip: coffee_cats_quantifiers_dev.jsonl already exists; do you wish to overwrite (y or n)? n
	not overwritten


In [22]:
args = get_args()
transformers.set_seed(args.seed)

'''
models = "gpt" , "gpt-medium", "gpt2-large"
data  = "number_comparison_age_compare_masked_dev.jsonl" , args.num_choices = 2
data  = "antonym_synonym_negation_dev.jsonl" , args.num_choices = 2
data  = "size_comparison_dev.jsonl" , args.num_choices = 2
data  = "compositional_comparison_dev.jsonl" , args.num_choices = 3
data  = "coffee_cats_quantifiers_dev.jsonl" , args.num_choices = 5
'''

args.num_choices = 2
args.model_name_or_path = 'gpt2'
data_path = "number_comparison_age_compare_masked_dev.jsonl"
data=data_path
#train_questions, train_choices, train_answer_ids = get_data(, args.sample_train, args.num_choices)
eval_questions, eval_choices, eval_answer_ids = get_data(data, args.sample_eval, args.num_choices)


10/18/2021 13:27:49: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

In [13]:
model = transformers.AutoModelWithLMHead.from_pretrained(args.model_name_or_path).cuda()
tokenizer = transformers.AutoTokenizer.from_pretrained(args.model_name_or_path , mask_token = '[MASK]')
tokenizer.pad_token = tokenizer.eos_token

10/18/2021 13:23:47: Attempting to acquire lock 140503643681744 on /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51.lock
10/18/2021 13:23:47: Lock 140503643681744 acquired on /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51.lock


Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

10/18/2021 13:23:48: Attempting to release lock 140503643681744 on /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51.lock
10/18/2021 13:23:48: Lock 140503643681744 released on /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51.lock
10/18/2021 13:23:48: Attempting to acquire lock 140503618496144 on /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925.lock
10/18/2021 13:23:48: Lock 140503618496144 acquired on /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925.lock


Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

10/18/2021 13:24:07: Attempting to release lock 140503618496144 on /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925.lock
10/18/2021 13:24:07: Lock 140503618496144 released on /root/.cache/huggingface/transformers/752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925.lock
10/18/2021 13:24:29: Attempting to acquire lock 140503584900048 on /root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock
10/18/2021 13:24:29: Lock 140503584900048 acquired on /root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock


Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

10/18/2021 13:24:30: Attempting to release lock 140503584900048 on /root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock
10/18/2021 13:24:30: Lock 140503584900048 released on /root/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock
10/18/2021 13:24:30: Attempting to acquire lock 140503584878224 on /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
10/18/2021 13:24:30: Lock 140503584878224 acquired on /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

10/18/2021 13:24:30: Attempting to release lock 140503584878224 on /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
10/18/2021 13:24:30: Lock 140503584878224 released on /root/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock
10/18/2021 13:24:30: Attempting to acquire lock 140503584876240 on /root/.cache/huggingface/transformers/16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0.lock
10/18/2021 13:24:30: Lock 140503584876240 acquired on /root/.cache/huggingface/transformers/16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0.lock


Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

10/18/2021 13:24:31: Attempting to release lock 140503584876240 on /root/.cache/huggingface/transformers/16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0.lock
10/18/2021 13:24:31: Lock 140503584876240 released on /root/.cache/huggingface/transformers/16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0.lock
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [30]:
AgeDataset = RoBERTaDataset if any(prefix in args.model_name_or_path.lower() for prefix in ("roberta", "bart", "distil", "gpt")) else BERTDataset
#train_dataset = AgeDataset(train_questions, train_choices, train_answer_ids, tokenizer)
eval_dataset = AgeDataset(eval_questions, eval_choices, eval_answer_ids, tokenizer)

In [31]:
all_answers, all_preds = evaluate_taskwithmask(args, model, tokenizer, eval_dataset, data_path)

10/18/2021 13:39:30: ***** Running evaluation  *****
10/18/2021 13:39:30:   Num examples = 500
10/18/2021 13:39:30:   Batch size = 2


Evaluating:   0%|          | 0/250 [00:00<?, ?it/s]

In [32]:
#eval_dataset[12]

In [33]:
#choose random 80% of the dataset and run evaluation for 5 cycles
accuracy = []
for i in range(5):
  eval_questions, eval_choices, eval_answer_ids = get_data(data, args.sample_eval, args.num_choices)
  combined_dataset = {'que': eval_questions, 'choices': eval_choices, 'ids': eval_answer_ids, }
  combined_dataset = pd.DataFrame(data=combined_dataset)
  sampled_dataset = combined_dataset.sample(frac = 0.8)
  eval_questions = list(sampled_dataset['que'])
  eval_choices = list(sampled_dataset['choices'])
  eval_answer_ids = list(sampled_dataset['ids'])

  eval_dataset = AgeDataset(eval_questions, eval_choices, eval_answer_ids, tokenizer)
  all_answers, all_preds = evaluate_taskwithmask(args, model, tokenizer, eval_dataset, data_path)
  a = 0
  b = 0
  for i in range(len(all_answers)):
    if all_preds[i] != -1:
        b += 1
        if all_preds[i] == all_answers[i]:
            a += 1
  current_acc = a/b
  accuracy.append(current_acc)


10/18/2021 13:39:41: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

10/18/2021 13:39:41: ***** Running evaluation  *****
10/18/2021 13:39:41:   Num examples = 400
10/18/2021 13:39:41:   Batch size = 2


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

10/18/2021 13:39:49: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

10/18/2021 13:39:49: ***** Running evaluation  *****
10/18/2021 13:39:49:   Num examples = 400
10/18/2021 13:39:49:   Batch size = 2


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

10/18/2021 13:39:57: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

10/18/2021 13:39:57: ***** Running evaluation  *****
10/18/2021 13:39:57:   Num examples = 400
10/18/2021 13:39:57:   Batch size = 2


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

10/18/2021 13:40:06: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

10/18/2021 13:40:06: ***** Running evaluation  *****
10/18/2021 13:40:06:   Num examples = 400
10/18/2021 13:40:06:   Batch size = 2


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

10/18/2021 13:40:14: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

10/18/2021 13:40:14: ***** Running evaluation  *****
10/18/2021 13:40:14:   Num examples = 400
10/18/2021 13:40:14:   Batch size = 2


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

In [34]:
accuracy

[0.5125, 0.5125, 0.52, 0.4925, 0.51]

In [35]:
import numpy as np
import scipy.stats as st
#Number of samples = 5
#create 95% confidence interval for population mean weight
mini, maxi = st.t.interval(alpha=0.95, df=len(accuracy)-1, loc=np.mean(accuracy), scale=st.sem(accuracy))
accuracy = np.array(accuracy)

In [36]:
print("The accuracy is of {} for {} task is {} +- {}".format(args.model_name_or_path, data_path, accuracy.mean()*100 ,-1*accuracy.mean()*100+maxi*100))

The accuracy is of gpt2 for number_comparison_age_compare_masked_dev.jsonl task is 50.95 +- 1.2685346465033618


In [37]:
#sequentially partitioned the dataset into 5 parts and run evaluation
accuracy = []
for i in range(5):
  eval_questions, eval_choices, eval_answer_ids = get_data(data, args.sample_eval, args.num_choices)
  total_items = len(eval_questions)
  n = int(total_items/5)
  if i==0:
    eval_questions = eval_questions[n:]
    eval_choices = eval_choices[n:]
    eval_answer_ids = eval_answer_ids[n:] 
  elif i==4:
    eval_questions = eval_questions[:4*n]
    eval_choices = eval_choices[:4*n]
    eval_answer_ids = eval_answer_ids[:4*n]
    
  else:
    eval_questions = eval_questions[:i*n] + eval_questions[(i+1)*n:]
    eval_choices = eval_choices[:i*n] + eval_choices[(i+1)*n:]
    eval_answer_ids = eval_answer_ids[:i*n] + eval_answer_ids[(i+1)*n:]   

  eval_dataset = AgeDataset(eval_questions, eval_choices, eval_answer_ids, tokenizer)
  print(len(eval_dataset))
  all_answers, all_preds = evaluate_taskwithmask(args, model, tokenizer, eval_dataset, data_path)
  a = 0
  b = 0
  for i in range(len(all_answers)):
    if all_preds[i] != -1:
        b += 1
        if all_preds[i] == all_answers[i]:
            a += 1
  current_acc = a/b
  accuracy.append(current_acc)

10/18/2021 13:40:23: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

10/18/2021 13:40:23: ***** Running evaluation  *****
10/18/2021 13:40:23:   Num examples = 400
10/18/2021 13:40:23:   Batch size = 2


400


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

10/18/2021 13:40:31: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

10/18/2021 13:40:31: ***** Running evaluation  *****
10/18/2021 13:40:31:   Num examples = 400
10/18/2021 13:40:31:   Batch size = 2


400


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

10/18/2021 13:40:40: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

10/18/2021 13:40:40: ***** Running evaluation  *****
10/18/2021 13:40:40:   Num examples = 400
10/18/2021 13:40:40:   Batch size = 2


400


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

10/18/2021 13:40:48: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

10/18/2021 13:40:48: ***** Running evaluation  *****
10/18/2021 13:40:48:   Num examples = 400
10/18/2021 13:40:48:   Batch size = 2


400


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

10/18/2021 13:40:56: Reading QA instances from jsonl dataset at: number_comparison_age_compare_masked_dev.jsonl


  0%|          | 0/500 [00:00<?, ?it/s]

10/18/2021 13:40:57: ***** Running evaluation  *****
10/18/2021 13:40:57:   Num examples = 400
10/18/2021 13:40:57:   Batch size = 2


400


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

In [38]:
import numpy as np
import scipy.stats as st

#define sample data
data = accuracy
print(accuracy)

#create 95% confidence interval for population mean weight
mini, maxi = st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data))
accuracy = np.array(accuracy)

[0.4075, 0.4525, 0.515, 0.5475, 0.6075]


In [39]:
print("The accuracy is of {} for {} task is {} +- {}".format(args.model_name_or_path, data_path, accuracy.mean()*100 ,-1*accuracy.mean()*100+maxi*100))

The accuracy is of gpt2 for number_comparison_age_compare_masked_dev.jsonl task is 50.6 +- 9.751213563067473


In [40]:
#gpt2

#Always-never
#coffee_cats_quantifiers_dev.jsonl
#80% - [0.48660714285714285, 0.4955357142857143, 0.5089285714285714, 0.5178571428571429, 0.4955357142857143]
#80% - 50.08928571428572 +- 1.5381597165854188
#seq - [0.5, 0.5223214285714286, 0.49107142857142855, 0.4732142857142857, 0.5133928571428571]
#seq - 50.0 +- 2.3841958894922826

#objectcomparison
#size_comparison_dev.jsonl
#80% - [0.4975, 0.4975, 0.51, 0.4925, 0.51]
#80% - 50.14999999999999 +- 0.9962370980872279
#seq - [0.49, 0.5, 0.5075, 0.505, 0.5275]
#seq - 50.6 +- 1.7086983967773435

#antonym_synonym_negation_dev.jsonl
#80% - [0.54, 0.515, 0.5325, 0.5075, 0.5425]
#80% - 52.75000000000001 +- 1.9260806244606172
#seq - [0.5375, 0.5475, 0.5225, 0.515, 0.5275]
#seq - 53.0 +- 1.5828172390416597


#compositional_comparison_dev.jsonl
#80% - [0.335, 0.3, 0.325, 0.3425, 0.3025]
#80% - 32.1 +- 2.3701638780707484
#seq - [0.3225, 0.335, 0.3, 0.3275, 0.325]
#seq - 32.2 +- 1.6337439119790815


#number_comparison_age_compare_masked_dev.jsonl
#80% - [0.5125, 0.5125, 0.52, 0.4925, 0.51]
#80% - 50.95 +- 1.2685346465033618
#seq - [0.4075, 0.4525, 0.515, 0.5475, 0.6075]
#seq - 50.6 +- 9.751213563067473
50.6




50.6

In [41]:
#gpt2-medium
#coffee_cats_quantifiers_dev.jsonl
#80% - [0.41964285714285715, 0.3794642857142857, 0.42410714285714285, 0.4017857142857143, 0.41517857142857145]
#80% - 40.80357142857143 +- 2.237947390191387
#seq - [0.41964285714285715, 0.4330357142857143, 0.40625, 0.4017857142857143, 0.4107142857142857]
#seq - 41.42857142857142 +- 1.538159716585426

#size_comparison_dev.jsonl
#80% - [0.5, 0.5, 0.4875, 0.505, 0.49]
#80% -  49.65 +- 0.9208426664960001
#seq - [0.51, 0.4975, 0.49, 0.4925, 0.47]
#seq - 49.2 +- 1.8020176820974783


#antonym_synonym_negation_dev.jsonl
#80% - [0.5475, 0.565, 0.5375, 0.565, 0.52]
#80% - 54.699999999999996 +- 2.378280901412239
#seq - [0.5225, 0.5325, 0.5575, 0.57, 0.5675]
#seq - 54.99999999999999 +- 2.6521954627697326

#compositional_comparison_dev.jsonl
#80% - [0.3025, 0.275, 0.3025, 0.3175, 0.2825]
#80% -  29.599999999999998 +- 2.1235721654734157
#seq - [0.3, 0.305, 0.285, 0.2975, 0.2925]
#seq - 29.599999999999998 +- 0.9466415704098665

#number_comparison_age_compare_masked_dev.jsonl
#80% - [0.49, 0.4775, 0.49, 0.495, 0.48]
#80% - 48.650000000000006 +- 0.920842666495993
#seq - [0.5275, 0.485, 0.48, 0.485, 0.4625]
#seq - 48.8 +- 2.972547242294091





In [42]:
#gpt2-large

#coffee_cats_quantifiers_dev.jsonl
#80% - [0.19196428571428573,  0.19642857142857142,  0.21875,  0.21428571428571427,  0.1875]
#80% - 20.17857142857143 +- 1.7264020292626014
#seq - [0.19196428571428573, 0.20535714285714285, 0.22321428571428573, 0.20535714285714285, 0.20982142857142858]
#seq - 20.714285714285715 +- 1.391317812969067

#size_comparison_dev.jsonl
#80% - [0.5025, 0.5125, 0.5025, 0.51, 0.4925]
#80% - 50.4 +- 0.9717557868192301
#seq - 
#seq - 

#antonym_synonym_negation_dev.jsonl
#80% - [0.4875, 0.52, 0.4925, 0.5275, 0.4775]
#80% - 50.1 +- 2.6846946455296035
#seq - 
#seq - 

#compositional_comparison_dev.jsonl
#80% - [0.3325, 0.34, 0.345, 0.3175, 0.34]
#80% - 33.5 +- 1.3351496981156785
#seq - 
#seq -

#number_comparison_age_compare_masked_dev.jsonl
#80% - [0.4875, 0.4875, 0.48, 0.5075, 0.49]
#80% - 49.05 +- 1.2685346465033547

#seq - 49.4±9.75
#seq - [0.59 , 0.55 , 0.49 , 0.45 , 0.39]

 


