In [28]:

import argparse
import logging
import os
import random
import json

import datasets
import torch
from datasets import load_dataset, load_from_disk, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator
from transformers import (
    AutoConfig,
    AutoTokenizer,
    default_data_collator,
)
from promptsource.templates import DatasetTemplates

import sys

# Root of the local t-zero checkout. The original hard-coded location stays the default;
# set the T0_ROOT_DIR environment variable to point the notebook at a different checkout.
ROOT_DIR = os.environ.get("T0_ROOT_DIR", "/cephfs/user/mikeeewang/summer_22/code/t-zero")
T0_DIR = os.path.join(ROOT_DIR, 't0')
# Put T0_DIR on the import path for the local-module imports that follow.
# Guarded so that re-running this cell does not stack duplicate entries.
if T0_DIR not in sys.path:
    sys.path.insert(1, T0_DIR)
from data_collator import DataCollatorForMultipleChoice
from model import ModelBase, ModelBase_with_confidence


from template_list import template_list
from retrieval import setup_retriever, retrieve, setup_retriever_shard

logger = logging.getLogger(__name__)

from collections import defaultdict

In [29]:
# configs
output_dir = "./tmp_output"          # result JSON files are written here
dataset_name = "super_glue"
dataset_config_name = "wic"
# Checkpoint to evaluate. "bigscience/T0" used to be assigned on the line above and was
# immediately overwritten; swap the value here to evaluate that checkpoint instead.
model_name_or_path = "bigscience/T0_3B"
config_name = None                   # falls back to model_name_or_path when unset
template_name = None                 # only read when eval_all_templates is False


debug = False                        # when True, evaluation is trimmed to the first 100 examples
use_slow_tokenizer = False           # passed to the tokenizer as use_fast=not use_slow_tokenizer
tokenizer_name = None                # unset -> tokenizer is loaded from model_name_or_path
per_device_eval_batch_size = 8
if_parallelize = True                # forwarded as `parallelize=`; when False the model is moved to accelerator.device
pad_to_max_length = False            # False -> no padding at tokenization time; the collator pads
eval_all_templates = True            # True -> evaluate every template listed for the dataset
max_length = 1024                    # truncation length for tokenized inputs
target_max_length = 256              # truncation length for tokenized answer choices


In [30]:
accelerator = Accelerator()

# Every process emits one log line with its accelerator state; this runs before the
# per-process log level is lowered below.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger.info(accelerator.state)

# Only one process per machine (the local main process) keeps logging at INFO;
# all other processes are reduced to errors only.
if accelerator.is_local_main_process:
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    logger.setLevel(logging.ERROR)
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()


# The main process creates the output directory; everyone waits until that is done.
if accelerator.is_main_process:
    os.makedirs(output_dir, exist_ok=True)
accelerator.wait_for_everyone()

# In distributed evaluation, the load_dataset function guarantees that only one local process can
# concurrently download the dataset.

if dataset_name is None:
    # Everything below reads `raw_datasets`; fail here with a clear message instead of a
    # NameError further down.
    raise ValueError("`dataset_name` must be set to the name of a dataset on the hub.")

# Downloading and loading a dataset from the hub.
if dataset_name == "anli":
    # For anli the value in `dataset_config_name` is passed as the split.
    raw_datasets = load_dataset(dataset_name, split=dataset_config_name)
else:
    raw_datasets = load_dataset(dataset_name, dataset_config_name, split="validation")

# Trim a number of evaluation examples
if debug:
    # Cap at the dataset size so splits with fewer than 100 rows do not fail.
    raw_datasets = raw_datasets.select(range(min(100, len(raw_datasets))))

# Load pretrained model and tokenizer
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

# Model config: an explicit `config_name` wins, otherwise use the model checkpoint's config.
if config_name:
    config = AutoConfig.from_pretrained(config_name)
elif model_name_or_path:
    config = AutoConfig.from_pretrained(model_name_or_path)
else:
    raise ValueError(
        "Either `args.config_name` or `args.model_name_or_path` should be provided."
    )

# Tokenizer: an explicit `tokenizer_name` wins, otherwise use the model checkpoint's tokenizer.
if tokenizer_name:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=not use_slow_tokenizer)
elif model_name_or_path:
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=not use_slow_tokenizer)
else:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
        "You can do it from another script, save it, and load it from here, using --tokenizer_name."
    )

if tokenizer.pad_token is None:
    # Fall back to the first special token that exists, in priority order eos, bos, sep.
    # Without the `break` a later candidate would silently overwrite an earlier one.
    for token in [tokenizer.eos_token, tokenizer.bos_token, tokenizer.sep_token]:
        if token is not None:
            tokenizer.pad_token = token
            break
    if tokenizer.pad_token is None:
        raise ValueError("Please define a pad token id.")


# Build the model wrapper from the loaded config; `if_parallelize` is forwarded as `parallelize`.
model = ModelBase_with_confidence.from_config(
    config=config,
    model_name_or_path=model_name_or_path,
    parallelize=if_parallelize
)
print('done loading model')

# Preprocessing the datasets.
# First we tokenize all the texts.
# Tokenizer-side padding mode: pad to `max_length` only when requested, otherwise no padding here.
padding = "max_length" if pad_to_max_length else False


# Get the prompt to apply and the possible targets.
prompts = DatasetTemplates(
    f"{dataset_name}"
    if dataset_config_name is None
    else f"{dataset_name}/{dataset_config_name}"
)

# The (dataset, config) pair must be listed in `template_list`.
assert (dataset_name, dataset_config_name) in template_list

if eval_all_templates:
    template_names = template_list[(dataset_name, dataset_config_name)]
    print(f'evaluating all possible templates, total number:{len(template_names)}')
else:
    if template_name is None:
        # Otherwise the template loop would receive `[None]` and fail when looking the prompt up.
        raise ValueError("`template_name` must be set when `eval_all_templates` is False.")
    template_names = [template_name]

06/07/2022 17:03:39 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Mixed precision type: no

loading configuration file https://huggingface.co/bigscience/T0_3B/resolve/main/config.json from cache at /data2/mikeeewang/.cache/huggingface/7b128e6b48089ae556964fea17b39635abd0124e77f8fa30267896af500a4d6d.a54ecffc6881ea8ae0af8a0dca40a7bcd51ccf51d434d2f7d0569844f6fb1c60
Model config T5Config {
  "_name_or_path": "bigscience/T0_3B",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 5120,
  "d_kv": 64,
  "d_model": 2048,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 24,
  "num_heads": 32,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_

done loading model
evaluating all possible templates, total number:10


In [32]:

# Restrict the run to the first template only on debug runs. This truncation used to be
# unconditional, so only one template was evaluated even with eval_all_templates = True.
if debug:
    template_names = template_names[:1]
# main loop over templates
all_results_step1 = []  # one result dict per template for step 1
all_results_step2 = []  # one result dict per template for step 2
for template_name in template_names:

    # Per-template store keyed by str(example index); step-1 preprocessing adds
    # input/target/answer_choices and the step-1 eval loop adds prediction/confidence.
    idx_to_data = {}

    print(f'evaluating tempalte {template_name} ...')
    template = prompts[template_name]
    column_names = raw_datasets.column_names
    
    #####################################
    # step 1 finding confident examples #
    #####################################

    ### preprocess dataset functions step 1 ###
    def preprocess_function_step1(examples):
        """Render the current template on a batch and tokenize it for multiple-choice scoring.

        Side effect: every example is recorded in `idx_to_data` under the string form of a
        running index (the size of `idx_to_data` when the example is processed), together
        with its rendered input, its target text and its answer choices.

        Args:
            examples: batch dict mapping each raw column name to a list of values.

        Returns:
            dict of per-example features: each tokenized-input field repeated once per answer
            choice, `labels` / `labels_attention_mask` (tokenized answer choices), `targets`
            (position of the gold answer among the choices) and `indices` (the running index).
        """
        batch_size = len(examples[column_names[0]])

        prompt_texts, gold_texts, choice_lists, indices = [], [], [], []
        for row in range(batch_size):
            ex = {col: examples[col][row] for col in column_names}
            prompt_text, gold_text = template.apply(ex)
            choices = template.get_answer_choices_list(ex)
            assert gold_text in choices

            prompt_texts.append(prompt_text)
            gold_texts.append(gold_text)
            choice_lists.append(choices)

            # Register the example so the eval loop can attach predictions to it later.
            example_idx = len(idx_to_data)
            idx_to_data[str(example_idx)] = {
                "input": prompt_text,
                "target": gold_text,
                "answer_choices": choices,
            }
            indices.append(example_idx)

        tokenized_inputs = tokenizer(
            prompt_texts,
            padding=padding,
            max_length=max_length,
            truncation=True,
            add_special_tokens=False,
        )
        # Each example's answer choices are tokenized together and padded to a common length.
        tokenized_targets = [
            tokenizer(
                choices,
                padding=True,
                max_length=target_max_length,
                truncation=True,
            )
            for choices in choice_lists
        ]

        # Repeat every input field once per answer choice of the same example.
        features = {}
        for field, per_example in tokenized_inputs.items():
            features[field] = [
                [encoded] * len(tokenized_targets[pos]["input_ids"])
                for pos, encoded in enumerate(per_example)
            ]

        features["labels"] = [encoded["input_ids"] for encoded in tokenized_targets]
        features["labels_attention_mask"] = [encoded["attention_mask"] for encoded in tokenized_targets]
        features["targets"] = [
            choices.index(gold) for choices, gold in zip(choice_lists, gold_texts)
        ]
        features['indices'] = list(indices)

        return features


    ### preprocess dataset ###
    with accelerator.main_process_first():
        print('preparing dataset ...')
        # `preprocess_function_step1` fills `idx_to_data` as a side effect, so it has to really run
        # in this process. A cache hit (re-run of the cell, or a non-main process arriving after the
        # main one) would skip the function and leave `idx_to_data` empty, hence the cache is bypassed.
        eval_dataset = raw_datasets.map(
            preprocess_function_step1,
            batched=True,
            remove_columns=column_names,
            load_from_cache_file=False,
        )

    # Show the first recorded example as a sanity check (skipped for an empty dataset).
    if idx_to_data:
        print(idx_to_data['0'])
    # Log a few random samples from the eval set (at most 3, fewer if the set is smaller):
    for index in random.sample(range(len(eval_dataset)), min(3, len(eval_dataset))):
        logger.info(f"Sample {index} of the eval set: {eval_dataset[index]}.")

    # DataLoaders creation:
    # When inputs were already padded to max length, the default collator (which just converts
    # everything to tensors) is used. Otherwise `DataCollatorForMultipleChoice` is used; under fp16
    # it is asked to pad to a multiple of 8, which enables Tensor Cores on NVIDIA hardware with
    # compute capability >= 7.5 (Volta).
    data_collator = (
        default_data_collator
        if pad_to_max_length
        else DataCollatorForMultipleChoice(
            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
        )
    )

    eval_dataloader = DataLoader(
        eval_dataset,
        batch_size=per_device_eval_batch_size,
        collate_fn=data_collator,
    )

    # Without model parallelism, place the model on the device chosen by the `accelerator`.
    if not if_parallelize:
        model.to(accelerator.device)

    # Let the `accelerator` wrap the dataloader.
    eval_dataloader = accelerator.prepare(eval_dataloader)

    # Metrics
    metric = load_metric("accuracy")

    # Eval!
    total_batch_size = accelerator.num_processes * per_device_eval_batch_size

    for message in (
        "***** Running step one for mining confident examples *****",
        f"  Num examples = {len(eval_dataset)}",
        f"  Instantaneous batch size per device = {per_device_eval_batch_size}",
        f"  Total eval batch size (w. parallel, distributed) = {total_batch_size}",
    ):
        logger.info(message)
    # The progress bar is shown by the local main process only.
    progress_bar = tqdm(
        range(len(eval_dataloader)),
        disable=not accelerator.is_local_main_process,
    )

    model.eval()
    for batch in eval_dataloader:
        # The model call yields a (predictions, confidences) pair for the batch.
        with torch.no_grad():
            predictions, confidences = model(batch)

        # Attach each example's prediction and confidence (as numpy values) to its record.
        for row, raw_idx in enumerate(batch['indices']):
            record = idx_to_data[str(int(raw_idx))]
            record['prediction'] = predictions[row].detach().cpu().numpy()
            record['confidence'] = confidences[row].detach().cpu().numpy()

        metric.add_batch(
            predictions=accelerator.gather(predictions),
            references=accelerator.gather(batch["targets"]),
        )
        progress_bar.update(1)

    eval_metric = metric.compute()
    accelerator.print(f"Result step one: {eval_metric}")

    # Summary of step 1 for this template.
    results = dict(
        dataset_name=dataset_name,
        dataset_config_name=dataset_config_name,
        template_name=template_name,
        evaluation=eval_metric,
        retrieval_database=None,
    )
    all_results_step1.append(results)

    ###############################################
    # step 2 augment with most confident examples #
    ###############################################

    # find confident examples
    # Only records that were scored in step 1 carry a 'confidence'. Records this process never
    # scored (e.g. examples handled by another process) are skipped instead of raising KeyError.
    cands = [
        (key, value['confidence'])
        for key, value in idx_to_data.items()
        if 'confidence' in value
    ]
    cands = sorted(cands, key = lambda x: x[1], reverse = True)
    cands = cands[:10] # take top 10
    aug_data_points = [idx_to_data[key] for key, _ in cands]
    if not aug_data_points:
        # Step 2 samples demonstrations from this pool; an empty pool cannot be sampled from.
        raise ValueError("No step-1 predictions with a confidence were recorded; cannot build step-2 prompts.")
    # TODO use retrieval for this
    print(aug_data_points)
    # TODO
    # Number of confident examples prepended to each step-2 input.
    concat_num = 1

    ### preprocess dataset functions step 2 ###
    def preprocess_function_step2(examples, indices=None):
        """Tokenize a batch after prepending confident step-1 examples to every prompt.

        For each example, `concat_num` items are drawn (with replacement) from
        `aug_data_points`; each contributes its rendered input followed by the answer choice
        the model predicted for it in step 1, and the pieces are separated by a blank line.

        Args:
            examples: batch dict mapping each raw column name to a list of values.
            indices: dataset positions of the rows in this batch, as supplied by
                `Dataset.map(..., with_indices=True)`. Falls back to batch-local positions
                when omitted. Previously the `indices` list was never filled, so building
                the `indices` feature raised IndexError on the first batch.

        Returns:
            dict of per-example features: each tokenized-input field repeated once per answer
            choice, `labels` / `labels_attention_mask` (tokenized answer choices), `targets`
            (position of the gold answer among the choices) and `indices`.
        """
        bs = len(examples[column_names[0]])
        if indices is None:
            indices = list(range(bs))

        input_texts = []
        target_texts = []
        answer_choices_texts = []
        for i in range(bs):
            ex = {
                k: examples[k][i]
                for k in column_names
            }
            prompt, target = template.apply(ex)
            ex_answer_choices = template.get_answer_choices_list(ex)
            assert target in ex_answer_choices

            # augment with self examples
            picked_augs = random.choices(aug_data_points, k=concat_num)
            prefix = ""
            for picked_item in picked_augs:
                # NOTE(review): the predicted answer is appended directly after the demonstration's
                # input with no separator -- confirm this is the intended prompt format.
                prefix = prefix + picked_item['input'] + picked_item['answer_choices'][int(picked_item['prediction'])]
                prefix = prefix.rstrip('\n')
                prefix += '\n\n'
            prompt = prefix + prompt

            input_texts.append(prompt)
            target_texts.append(target)
            answer_choices_texts.append(ex_answer_choices)

        tokenized_inputs = tokenizer(
            input_texts,
            padding=padding,
            max_length=max_length,
            truncation=True,
            add_special_tokens=False,
        )
        # Each example's answer choices are tokenized together and padded to a common length.
        tokenized_targets = [
            tokenizer(
                ans_choi,
                padding=True,
                max_length=target_max_length,
                truncation=True,
            )
            for ans_choi in answer_choices_texts
        ]

        # Repeat every input field once per answer choice of the same example.
        features = {
            k: [
                [elem for _ in range(len(tokenized_targets[idx]["input_ids"]))]
                for idx, elem in enumerate(v)
            ]
            for k, v in tokenized_inputs.items()
        }

        features["labels"] = [
            tokenized_targets[idx]["input_ids"]
            for idx in range(bs)
        ]
        features["labels_attention_mask"] = [
            tokenized_targets[idx]["attention_mask"]
            for idx in range(bs)
        ]
        features["targets"] = [
            answer_choices_texts[idx].index(t)
            for idx, t in enumerate(target_texts)
        ]
        features['indices'] = list(indices)

        return features


    ### preprocess dataset ###
    with accelerator.main_process_first():
        print('preparing dataset ...')
        # `with_indices=True` passes each batch's dataset positions as the second argument.
        eval_dataset = raw_datasets.map(
            preprocess_function_step2,
            batched=True,
            with_indices=True,
            remove_columns=column_names,
        )

    # Log a few random samples from the eval set (at most 3, fewer if the set is smaller):
    for index in random.sample(range(len(eval_dataset)), min(3, len(eval_dataset))):
        logger.info(f"Sample {index} of the eval set: {eval_dataset[index]}.")

    # DataLoaders creation:
    # When inputs were already padded to max length, the default collator (which just converts
    # everything to tensors) is used. Otherwise `DataCollatorForMultipleChoice` is used; under fp16
    # it is asked to pad to a multiple of 8, which enables Tensor Cores on NVIDIA hardware with
    # compute capability >= 7.5 (Volta).
    data_collator = (
        default_data_collator
        if pad_to_max_length
        else DataCollatorForMultipleChoice(
            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
        )
    )

    eval_dataloader = DataLoader(
        eval_dataset,
        batch_size=per_device_eval_batch_size,
        collate_fn=data_collator,
    )

    # Without model parallelism, place the model on the device chosen by the `accelerator`.
    if not if_parallelize:
        model.to(accelerator.device)

    # Let the `accelerator` wrap the dataloader.
    eval_dataloader = accelerator.prepare(eval_dataloader)

    # Metrics
    metric = load_metric("accuracy")

    # Eval!
    total_batch_size = accelerator.num_processes * per_device_eval_batch_size

    for message in (
        "***** Running step two with augmentated samples *****",
        f"  Num examples = {len(eval_dataset)}",
        f"  Instantaneous batch size per device = {per_device_eval_batch_size}",
        f"  Total eval batch size (w. parallel, distributed) = {total_batch_size}",
    ):
        logger.info(message)
    # The progress bar is shown by the local main process only.
    progress_bar = tqdm(
        range(len(eval_dataloader)),
        disable=not accelerator.is_local_main_process,
    )

    model.eval()
    for batch in eval_dataloader:
        # Step 2 only needs the predictions; the confidences returned alongside are not used.
        with torch.no_grad():
            predictions, _ = model(batch)

        metric.add_batch(
            predictions=accelerator.gather(predictions),
            references=accelerator.gather(batch["targets"]),
        )
        progress_bar.update(1)

    eval_metric = metric.compute()
    # This is the step-two result; the message previously said "step one".
    accelerator.print(f"Result step two: {eval_metric}")

    # Summary of step 2 for this template.
    results = {
        "dataset_name": dataset_name,
        "dataset_config_name": dataset_config_name,
        "template_name": template_name,
        "evaluation": eval_metric,
        "retrieval_database": None
    }
    all_results_step2.append(results)


# Only the main process writes results: one JSON file per step, named after the dataset and
# config, with any '/' in the file name replaced by '_'.
if accelerator.is_main_process and output_dir is not None:
    for raw_name, step_results in (
        (f"results_step1__{dataset_name}__{dataset_config_name}.json", all_results_step1),
        (f"results_step2__{dataset_name}__{dataset_config_name}.json", all_results_step2),
    ):
        output_name = raw_name.replace('/','_')
        with open(os.path.join(output_dir, output_name), "w") as f:
            json.dump(step_results, f, indent=4)

evaluating tempalte question-context-meaning-with-label ...
preparing dataset ...


  0%|          | 0/1 [00:00<?, ?ba/s]

06/07/2022 17:06:07 - INFO - __main__ - Sample 379 of the training set: {'input_ids': [[3520, 8, 1448, 96, 27673, 121, 43, 8, 337, 2530, 16, 175, 192, 16513, 58, 2163, 6, 465, 58, 216, 1790, 46, 7544, 3211, 30, 82, 161, 5, 389, 3211, 13, 28582, 5], [3520, 8, 1448, 96, 27673, 121, 43, 8, 337, 2530, 16, 175, 192, 16513, 58, 2163, 6, 465, 58, 216, 1790, 46, 7544, 3211, 30, 82, 161, 5, 389, 3211, 13, 28582, 5]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[465, 1], [2163, 1]], 'labels_attention_mask': [[1, 1], [1, 1]], 'targets': 0, 'indices': 379}.
06/07/2022 17:06:07 - INFO - __main__ - Sample 485 of the training set: {'input_ids': [[3520, 8, 1448, 96, 35, 15299, 15, 121, 43, 8, 337, 2530, 16, 175, 192, 16513, 58, 2163, 6, 465, 58, 695, 15299, 15, 3, 9, 2068, 5, 695, 15299, 15, 3, 9, 4550, 5], [3520, 8

{'input': 'Does the word "class" have the same meaning in these two sentences? Yes, No?\nAn emerging professional class.\nApologizing for losing your temper, even though you were badly provoked, showed real class.', 'target': 'No', 'answer_choices': ['No', 'Yes']}


06/07/2022 17:06:08 - INFO - __main__ - ***** Running step one for mining confident examples *****
06/07/2022 17:06:08 - INFO - __main__ -   Num examples = 638
06/07/2022 17:06:08 - INFO - __main__ -   Instantaneous batch size per device = 8
06/07/2022 17:06:08 - INFO - __main__ -   Total eval batch size (w. parallel, distributed) = 8


  0%|          | 0/80 [00:00<?, ?it/s]

TypeError: div(): argument 'input' (position 1) must be Tensor, not torch.return_types.max