In [1]:
import copy
import json, csv
import logging
import os, time
import re
import uuid
import warnings
from collections import OrderedDict
from multiprocessing import Pool

import numpy as np
import torch
import torch.nn.functional as F
from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, CONFIG_NAME 
from pytorch_transformers import AdamW
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader, Sampler

from fp16 import FP16_Module, FP16_Optimizer
from metrics import compute_metrics

In [2]:
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
data_dir = "/root/LAMOL/lamol_data"
MODEL_BASE_DIR = "/data/model_runs"

# Run directory being evaluated. Runs tried previously (kept for reference):
#   "20210630T184801_mbs_SEQ_MAML"
#   "20210701T033914_mbs_SEQ"
#   "20210701T073948_msb_SEQ_MAML"
#   "20210702T034545_msb_SEQ"
#   "20210701T180911_sbm_SEQ_MAML"
#   "20210824T030642_mbs_SEQ_MAML"                      # Special of movie_steps101.model. Save special in between
#   "20210824T035319_mbs_SEQ_MAML_savenetpibetween"     # special of movie_steps101.model
#   "20210824T053734_mbs_SEQ_MAML_loadstatedict"        # special load state dict
#   "20210829T003703_mbs_SEQ_MAML_v2"                   # special load state dict
MODEL_DIR_NAME = "20210905T094223_sbm_SEQ_MAML"
MODEL_DIR = os.path.join(MODEL_BASE_DIR, MODEL_DIR_NAME)

# Task order used by the driver loop. Other orders that were tried:
#   ['movie', 'scifact', 'boolq']
#   ['movie', 'boolq', 'scifact']
tasks = ['scifact', 'boolq', 'movie']

# ---------------------------------------------------------------------------
# Devices
# ---------------------------------------------------------------------------
DEVICE = 'cuda:0'
n_gpus = 1
# NOTE(review): device_ids is not referenced by any other visible cell, and it
# names GPU 1 while DEVICE is cuda:0 -- confirm whether it is still needed.
device_ids = [1]

# ---------------------------------------------------------------------------
# Adaptation / batching
# ---------------------------------------------------------------------------
n_train_epochs = 1
num_updates = 5          # gradient steps taken on each support batch
train_batch_size = 3     # support (adaptation) batch size
test_batch_size = 4      # query batch size, also the decode batch size
gen_lm_sample_percentage = 0.05

# ---------------------------------------------------------------------------
# Sampling
# ---------------------------------------------------------------------------
temperature_qa = 1.0
top_k_qa = 20
top_p_qa = 0.

# Label value ignored by the loss and used to pad target sequences.
FILL_VAL = -1

# ---------------------------------------------------------------------------
# Logging (written next to the model checkpoints)
# ---------------------------------------------------------------------------
logging.basicConfig(filename=f'{MODEL_DIR}/test_run.log', level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
# Registry of supported backbones: name -> (model class, tokenizer class, config class).
MODEL_CLASSES = dict(
    gpt2=(GPT2LMHeadModel, GPT2Tokenizer, GPT2Config),
)

In [4]:
# Per-task data files and epoch count. Every task follows the same naming
# scheme inside `data_dir`: <task>_train.json, <task>_dev.json, <task>_test.json.
TASK_DICT = {
    task_name: {
        "train": os.path.join(data_dir, task_name + "_train.json"),
        "eval": os.path.join(data_dir, task_name + "_dev.json"),
        "test": os.path.join(data_dir, task_name + "_test.json"),
        "n_train_epochs": n_train_epochs,
    }
    for task_name in ("movie", "boolq", "scifact")
}


In [5]:
# Control tokens used when building examples. They are added to the GPT-2
# vocabulary below (the printed ids show which id each one resolves to).
special_tokens = {
    "ans_token": '__ans__',
    "pad_token": '__pad__',
    "unk_token": '__unk__',
    "eos_token": '<|endoftext|>',
}

model_class, tokenizer_class, config_class = MODEL_CLASSES['gpt2']

# Tokenizer: pretrained GPT-2 vocabulary extended with the control tokens.
tokenizer = tokenizer_class.from_pretrained('gpt2')
tokenizer.add_tokens(list(special_tokens.values()))
special_token_ids = {
    token_name: tokenizer.convert_tokens_to_ids(token_text)
    for token_name, token_text in special_tokens.items()
}

# Model config: vocabulary size follows the extended tokenizer; the maximum
# example length is the number of positions the model supports.
model_config = config_class.from_pretrained('gpt2')
model_config.vocab_size = len(tokenizer)
max_len = model_config.n_positions

# Per-token loss weights: everything weighs 1 except the answer separator,
# which is weighted 5. Used as the `weight` of the cross-entropy loss.
tokens_weight = torch.ones([model_config.vocab_size], dtype=torch.float)
tokens_weight = tokens_weight.to(DEVICE)
tokens_weight[special_token_ids["ans_token"]] = 5

# Upper-case aliases used by the rest of the notebook.
MODEL_CLASS, TOKENIZER, MODEL_CONFIG = model_class, tokenizer, model_config
SPECIAL_TOKENS, SPECIAL_TOKEN_IDS, TOKENS_WEIGHT = special_tokens, special_token_ids, tokens_weight

print(SPECIAL_TOKENS)
print(SPECIAL_TOKEN_IDS)

{'ans_token': '__ans__', 'pad_token': '__pad__', 'unk_token': '__unk__', 'eos_token': '<|endoftext|>'}
{'ans_token': 50257, 'pad_token': 50258, 'unk_token': 50259, 'eos_token': 50256}


In [6]:
class QADataset(Dataset):
    """Tokenized question-answering dataset read from SQuAD-style JSON files.

    Each stored item is the 8-tuple produced by ``parse_example``:

        (cq, len(cq), cqa, len(cqa), Y, gen_X, gen_Y, idx)

    where ``cq`` is context + question + answer separator, ``cqa`` additionally
    contains the answer, ``Y`` is the answer + eos left-padded with ``FILL_VAL``
    to the length of ``cqa``, and ``gen_X`` / ``gen_Y`` are the
    language-modelling input / target built with the task's ``gen_token``.

    Args:
        data_paths: one path or a list of paths to JSON files shaped like
            ``{"data": [{"paragraphs": [{"context": ..., "qas": [...]}]}]}``.
            Falsy entries are skipped.
        data_type: free-form tag stored on the instance (e.g. "train"/"test").
        gen_token: token id prepended to ``gen_X`` examples.
        extra_data: optional iterable of token-id sequences
            ``[gen_token, q..., ans_token, a..., eos]`` to be parsed and
            appended to the dataset. ``None`` (the default) means no extra data.
    """

    def __init__(self, data_paths, data_type, gen_token, extra_data=None):
        self.data_type = data_type
        self.gen_token = gen_token
        self.ans_token = SPECIAL_TOKEN_IDS["ans_token"]
        self.eos_token = SPECIAL_TOKEN_IDS["eos_token"]
        self.pad_token = SPECIAL_TOKEN_IDS["pad_token"]

        if not isinstance(data_paths, list):
            data_paths = [data_paths]

        # Flatten every file's articles into one list of paragraph dicts.
        data = []
        for data_path in data_paths:
            if not data_path:
                continue
            with open(data_path, "r") as f:
                raw_ds = json.load(f)
            for article in raw_ds["data"]:
                data.extend(article["paragraphs"])

        self.data = []
        self.max_a_len = 0
        if len(data) > 0:
            self.data_tokenization(data)

        # A None default avoids sharing one mutable list between all calls.
        if extra_data is None:
            extra_data = []
        if len(extra_data) > 0:
            parsed = [self.etl_single_extra_data(x) for x in extra_data]
            parsed = [x for x in parsed if x]
            if gen_lm_sample_percentage > 0. and len(parsed) == 0:
                logger.warning("No good extra data but sample percentage > 0!")
            self.data += parsed

    def etl_single_extra_data(self, data):
        """Parse one generated token-id sequence into a dataset example.

        ``data[0]`` is the gen token; the remainder is split at the answer
        separator id into question and answer (the eos id is stripped from the
        answer). The context is always empty for extra data.

        Returns:
            The example tuple, or ``None`` when the sequence does not contain
            exactly one answer separator, contains non-integer items, or is too
            long to fit the model.
        """
        gen_token = data[0]
        data = ' '.join([str(datum) for datum in data[1:]])
        try:
            # Unpacking raises ValueError unless there is exactly one separator.
            question, answer = re.split(str(SPECIAL_TOKEN_IDS["ans_token"]), data)
            context = []
            question = [int(q) for q in question.strip().split()]
            answer = [int(a) for a in re.sub(str(SPECIAL_TOKEN_IDS["eos_token"]), "", answer).strip().split()]
        except ValueError:
            return None
        uid = uuid.uuid1().hex
        return self.parse_example(gen_token, context, question, answer, uid)

    def concat_example(self, gen_token, c, sep_token, q, ans_token, a, eos_token):
        """Concatenate the pieces of one example, truncating only the context.

        All arguments are lists of token ids (possibly empty). The question,
        separator and answer are never truncated; the context ``c`` is cut so
        that the result fits in ``max_len`` tokens.

        Returns:
            The concatenated list, or ``None`` (after logging a warning) when
            the untruncatable part alone is too long.
        """
        example = sep_token + q + ans_token + a
        if len(example) + 1 > max_len:
            logger.warning('an example with len {} is too long!'.format(len(example) + 1))
            return
        example = gen_token + c[:max_len-len(example)-1] + example + eos_token
        return example

    def parse_example(self, gen_token, context, question, answer, idx):
        """Build the 8-tuple stored in the dataset for one QA pair.

        Returns:
            ``(cq, len(cq), cqa, len(cqa), Y, gen_X, gen_Y, idx)``, or ``None``
            when question + answer do not fit in ``max_len`` (previously this
            case crashed with ``TypeError`` on ``len(None)``).
        """
        cq_example = self.concat_example([], context, [], question, [self.ans_token], [], [])
        cqa_example = self.concat_example([], context, [], question, [self.ans_token], answer, [])
        if cq_example is None or cqa_example is None:
            # concat_example already logged why; let the caller drop it.
            return None
        Y_example = self.concat_example([], [], [], [], [], answer, [self.eos_token])
        # Left-pad the target so it is aligned with (and as long as) cqa.
        Y_example = [FILL_VAL] * (len(cqa_example) - len(Y_example)) + Y_example
        gen_X_example = self.concat_example([gen_token], context, [], question, [self.ans_token], answer, [])
        gen_Y_example = self.concat_example([], context, [], question, [self.ans_token], answer, [self.eos_token])
        return cq_example, len(cq_example), cqa_example, len(cqa_example), Y_example, gen_X_example, gen_Y_example, idx

    def parallel_tokenization(self, d):
        """Tokenize one paragraph dict and all of its questions.

        Multiple reference answers of one question are joined with the pad
        token. Questions whose example does not fit the model are skipped.

        Returns:
            ``(examples, max_a_len)``: the list of example tuples and the
            length of the longest answer among the kept examples.
        """
        examples = []
        # Tokenizer warnings (e.g. about long inputs) are silenced on purpose:
        # the context is truncated later in concat_example.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            context = TOKENIZER.encode(d["context"])
        max_a_len = 0
        for qa in d["qas"]:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                question = TOKENIZER.encode(qa["question"])

            raw_answers = qa["answers"]
            if len(raw_answers) == 0:
                assert qa["is_impossible"]
                # Use a fresh list so the caller's data is not mutated.
                raw_answers = [{"text": ""}]

            answer = []
            for i, raw_answer in enumerate(raw_answers):
                answer.extend(TOKENIZER.encode(raw_answer["text"]))
                if i != len(raw_answers) - 1:
                    answer.append(self.pad_token)

            example = self.parse_example(self.gen_token, context, question, answer, qa.get("id", 0))
            if example is None:
                continue
            max_a_len = max(max_a_len, len(answer))
            examples.append(example)
        return examples, max_a_len

    def data_tokenization(self, data):
        """Tokenize all paragraphs with a 4-process pool and store the results."""
        with Pool(4) as pool:
            data = pool.map(self.parallel_tokenization, data)
        for datum, max_a_len in data:
            self.data.extend(datum)
            self.max_a_len = max(self.max_a_len, max_a_len)

    def sort(self):
        """Sort examples in place by the length of ``cq``; returns ``self``."""
        self.data.sort(key=lambda x: len(x[0]))
        return self

    def sort_by_index(self):
        """Sort examples in place by their id (last tuple field); returns ``self``."""
        self.data.sort(key=lambda x: x[-1])
        return self

    def get_indices(self):
        """Return the id of every example, in current order."""
        return [d[-1] for d in self.data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

In [7]:
# In[9]:


class DynamicBatchSampler(Sampler):
    """Batch sampler that groups dataset indices under a per-device size budget.

    ``self.batch_size`` is indexed per device (``self.batch_size[cnt]``), so this
    sampler is only usable when ``train_batch_size`` / ``test_batch_size`` are
    sequences; ``create_dataloader`` only instantiates it in that case.

    NOTE(review): ``LEN_FACTOR`` is not defined in any visible cell -- confirm
    it is provided elsewhere before using this class.
    """

    def __init__(self, dataset, data_type, max_batch_size):
        """Store the dataset and pick the budget list matching ``data_type``.

        Args:
            dataset: indexable dataset; item field ``[2]`` (cqa) is measured.
            data_type: "train" selects ``train_batch_size``; anything else
                selects ``test_batch_size``.
            max_batch_size: hard cap on indices per batch (train only).
        """
        self.dataset = dataset
        self.data_type = data_type
        if data_type == "train":
            self.batch_size = train_batch_size
        else:
            self.batch_size = test_batch_size
        self.n_samples = len(dataset)
        self.max_batch_size = max_batch_size

    def __iter__(self):
        """Yield lists of dataset indices, one list per batch."""
        # Test data keeps dataset order; everything else is shuffled.
        if self.data_type == "test":
            indices = range(self.n_samples)
        else:
            indices = np.random.permutation(self.n_samples)
        # max_len: longest cqa in the current per-device group
        # cnt:     index of the device group being filled
        # st:      position (in `indices`) where the current group started
        max_len, cnt, st = 0, 0, 0
        batch = []
        for ed, idx in enumerate(indices):
            ln = len(self.dataset[idx][2])
            # Cost of the group if this example were added: longest length
            # raised to LEN_FACTOR, times the number of examples in the group.
            if max(max_len, ln)**LEN_FACTOR * (ed - st + 1) > self.batch_size[cnt]:
                # Budget exceeded: close this device's group and start the next.
                st = ed
                cnt += 1
                max_len = 0
                if cnt == n_gpus:
                    # Every device has a group: emit the batch and start over.
                    yield batch
                    cnt = 0
                    batch = []
            max_len = max(max_len, ln)
            batch.append(idx)
            # Hard cap on the number of examples per batch, training only.
            if len(batch) == self.max_batch_size and self.data_type == "train":
                yield batch
                cnt, max_len, st = 0, 0, ed
                batch = []
        # Emit whatever is left over.
        if len(batch) > 0:
            yield batch

    def __len__(self):
        # The number of batches depends on example lengths (and on the shuffle),
        # so it is not known without iterating.
        raise NotImplementedError


# In[10]:


def dynamic_collate_fn(data, batch_size):
    """Collate a batch into per-device padded tensors using size budgets.

    Walks ``data`` in order and closes a device group whenever adding the next
    example would exceed ``batch_size[cnt]`` (cost = longest cqa ** LEN_FACTOR
    * group size). Each closed group is padded and converted to tensors by
    ``local_collate``.

    Args:
        data: list of example tuples as stored by ``QADataset``.
        batch_size: sequence of budgets, indexed by device group.

    Returns:
        Seven lists ``(cqs, len_cqs, cqas, len_cqas, Ys, gen_Xs, gen_Ys)``, each
        holding one tensor per device group.

    NOTE(review): ``LEN_FACTOR`` is not defined in any visible cell -- confirm
    it is provided elsewhere.
    """

    def local_collate():
        # Reads `st`, `ed` and `cqa_max_len` from the enclosing scope at call
        # time: the group being collated is data[st:ed].
        null_counter = 0
        _cqs, _len_cqs, _cqas, _len_cqas, _Ys, _gen_Xs, _gen_Ys = [], [], [], [], [], [], []
        # NOTE(review): these maxima are taken before the null check below, so
        # a None in field 0 or 4 would raise here rather than be skipped.
        Y_max_len = max(len(data[j][4]) for j in range(st, ed))
        cq_max_len = max(len(data[j][0]) for j in range(st, ed))
        for j in range(st, ed):
            # Skip examples with a missing or empty field.
            if None in data[j] or [] in data[j]:
                null_counter+=1
                logger.warning('null example in collate_fn, count: {}'.format(null_counter))
                continue

            # cqa, gen_X and gen_Y are all padded by the same amount.
            pad_len = cqa_max_len - len(data[j][2])

            _cqs.append(pad_to_max_len(data[j][0], cq_max_len-len(data[j][0]), SPECIAL_TOKEN_IDS["pad_token"]))
            _len_cqs.append(data[j][1])
            _cqas.append(pad_to_max_len(data[j][2], pad_len, SPECIAL_TOKEN_IDS["pad_token"]))
            _len_cqas.append(data[j][3])
            _Ys.append(pad_to_max_len(data[j][4], Y_max_len - len(data[j][4]), FILL_VAL))
            _gen_Xs.append(pad_to_max_len(data[j][5], pad_len, SPECIAL_TOKEN_IDS["pad_token"]))
            _gen_Ys.append(pad_to_max_len(data[j][6], pad_len, FILL_VAL))

        cqs.append(torch.tensor(_cqs))
        len_cqs.append(torch.tensor(_len_cqs))
        cqas.append(torch.tensor(_cqas))
        len_cqas.append(torch.tensor(_len_cqas))
        Ys.append(torch.tensor(_Ys))
        gen_Xs.append(torch.tensor(_gen_Xs))
        gen_Ys.append(torch.tensor(_gen_Ys))

    cqs, len_cqs, cqas, len_cqas, Ys, gen_Xs, gen_Ys = [], [], [], [], [], [], []
    # cqa_max_len: longest cqa in the open group; cnt: device group index;
    # st: index in `data` where the open group starts.
    cqa_max_len, cnt, st = 0, 0, 0
    for ed, datum in enumerate(data):
        ln = len(datum[2]) # use cqas to calibrate
        if max(cqa_max_len, ln)**LEN_FACTOR * (ed - st + 1) > batch_size[cnt]:
            # Adding this example would exceed the budget: flush data[st:ed].
            local_collate()
            cnt += 1
            cqa_max_len = 0
            st = ed
        cqa_max_len = max(cqa_max_len, ln)
    ed += 1  # otherwise ed will be len(data)-1
    # Flush the final (still open) group.
    local_collate()

    return cqs, len_cqs, cqas, len_cqas, Ys, gen_Xs, gen_Ys


# In[11]:



def varlen_collate_fn(data):
    """Collate example tuples into padded tensors, split evenly across GPUs.

    Token sequences are right-padded to the longest one in the batch (inputs
    with the pad token id, targets with ``FILL_VAL``); the whole batch is then
    split into chunks of ``ceil(len(data) / n_gpus)`` rows.

    Returns:
        A 7-tuple of lists ``(cqs, len_cqs, cqas, len_cqas, Ys, gen_Xs, gen_Ys)``
        with one tensor per chunk in each list.
    """
    rows_per_device = (len(data) + n_gpus - 1) // n_gpus
    pad_id = SPECIAL_TOKEN_IDS["pad_token"]

    def padded_field(field, fill):
        # Variable-length field: pad, tensorize, split per device.
        rows = pad_all_to_max_len([example[field] for example in data], fill)
        return list(torch.tensor(rows).split(rows_per_device))

    def scalar_field(field):
        # One number per example: tensorize and split per device.
        values = [example[field] for example in data]
        return list(torch.tensor(values).split(rows_per_device))

    cqs = padded_field(0, pad_id)
    len_cqs = scalar_field(1)
    cqas = padded_field(2, pad_id)
    len_cqas = scalar_field(3)
    Ys = padded_field(4, FILL_VAL)
    gen_Xs = padded_field(5, pad_id)
    gen_Ys = padded_field(6, FILL_VAL)
    return cqs, len_cqs, cqas, len_cqas, Ys, gen_Xs, gen_Ys


# In[12]:


def pad_to_max_len(l, pad_len, val):
    """Return a new list: ``l`` followed by ``pad_len`` copies of ``val``."""
    filler = [val] * pad_len
    return l + filler


def pad_all_to_max_len(ls, val):
    """Right-pad every list in ``ls`` with ``val`` to the length of the longest one."""
    longest = max(map(len, ls))
    padded = []
    for seq in ls:
        padded.append(pad_to_max_len(seq, longest - len(seq), val))
    return padded


# In[13]:


def create_dataloader(dataset, data_type, max_batch_size=1000000000):
    """Build the DataLoader for ``dataset``.

    The batch size comes from ``train_batch_size`` when ``data_type`` is
    "train" and from ``test_batch_size`` otherwise. A list-valued batch size
    selects budget-based batching (``DynamicBatchSampler`` +
    ``dynamic_collate_fn``); a plain number selects fixed-size batches collated
    by ``varlen_collate_fn``. Data is never shuffled by the loader itself.

    Args:
        dataset: dataset to iterate.
        data_type: "train" or any other tag (treated as test).
        max_batch_size: cap forwarded to ``DynamicBatchSampler``.
    """
    batch_size = train_batch_size if data_type == "train" else test_batch_size

    if isinstance(batch_size, list):
        # Budgets are bound as a default argument so the collate function
        # keeps them after `batch_size` is overwritten below.
        def collate_fn(x, bs=batch_size):
            return dynamic_collate_fn(x, bs)

        batch_sampler = DynamicBatchSampler(dataset, data_type, max_batch_size)
        batch_size = 1
    else:
        def collate_fn(x):
            return varlen_collate_fn(x)

        batch_sampler = None

    return DataLoader(
        dataset,
        num_workers=4,
        collate_fn=collate_fn,
        shuffle=False,
        batch_size=batch_size,
        batch_sampler=batch_sampler,
    )

In [8]:
def remove_id(idx, need_process, all_pasts):
    """Mark example ``idx`` as finished.

    Drops ``idx`` from ``need_process`` (it must be present) and replaces its
    cached past in every layer of ``all_pasts`` with ``0``.
    """
    assert idx in need_process
    need_process.pop(idx)
    n_layers = MODEL_CONFIG.n_layer
    for layer_id in range(n_layers):
        layer_cache = all_pasts[layer_id]
        layer_cache[idx] = 0
        
        
def sample_sequence(model, need_process, qa_results, all_pasts, max_tot_lens):
    """Autoregressively extend every sequence listed in ``need_process``.

    Sequences are decoded one token at a time from their cached pasts, in
    batches of at most ``test_batch_size``. A sequence is finished (and removed
    from ``need_process``) when eos is sampled or when its length reaches
    ``max_tot_lens[id]`` or ``max_len``. Results are written in place.

    Args:
        model: callable as ``model(input_ids=..., past=...)`` returning
            ``(logits, pasts, ...)``.
        need_process: ordered mapping whose keys are the ids still decoding;
            emptied by this function.
        qa_results: per-id token tensors; sampled tokens are appended (the eos
            token itself is not appended).
        all_pasts: ``all_pasts[layer_id][id]`` cached past per layer and id;
            updated after each step and set to ``0`` once an id is finished.
        max_tot_lens: per-id cap on the total sequence length.
    """
    while len(need_process) > 0:                       # While there is still any need_process
        first_id = next(iter(need_process))            # The first one to process
        # Only sequences no longer than the first pending one are decoded in this pass.
        shortest_len = len(qa_results[first_id])       # The shortest length is the length of itself?
#         decode_batch_size = int(args.memory_sizes[0] * MEMORY_FACTOR[args.seq_train_type] // (shortest_len+1)**LEN_FACTOR)
        decode_batch_size = test_batch_size
        it = iter(need_process)                        # it is iterable of need_process
        stop = False
        # Ids finished during this pass; removed only after the pass because
        # `need_process` is being iterated.
        remove_ids = []
        while not stop:
            # Collect one decode batch: ids, their last token, and per-layer pasts.
            batch_ids, input_ids, past = [], [], [[] for _ in range(MODEL_CONFIG.n_layer)]
            while True:
                try:
                    cur_id = next(it)                   # let the current id be the next batch of need_process
                    if len(qa_results[cur_id]) > shortest_len:  # if the length is too long, just stop
                        stop = True
                        break
                    batch_ids.append(cur_id)            
                    # Only the last token is fed; earlier context comes from `past`.
                    input_ids.append(qa_results[cur_id][-1:])
                    for layer_id in range(MODEL_CONFIG.n_layer):
                        past[layer_id].append(all_pasts[layer_id][cur_id])
                    if len(input_ids) == decode_batch_size:
                        break
                except StopIteration:                    # if there is no more id in need_process, just stop
                    stop = True
                    break

            n_inputs = len(input_ids)
            if n_inputs == 0:
                break
            input_ids = torch.stack(input_ids)
            # Stack per-example pasts along dim 1 to form the batch dimension.
            # NOTE(review): stacking requires all pasts in the batch to have the
            # same shape -- presumably guaranteed by the length check above; confirm.
            for layer_id in range(MODEL_CONFIG.n_layer):
                past[layer_id] = torch.stack(past[layer_id], dim=1)
            all_outputs = model(input_ids=input_ids.cuda(), past=past)

            outputs = all_outputs[0]
            pasts = all_outputs[1]

            # Sample the next token from the logits at the last position.
            next_logits = outputs[..., -1, :] / temperature_qa
            next_tokens = logits_to_tokens(next_logits).cpu()

            for i, cur_id in enumerate(batch_ids):
                if next_tokens[i] == SPECIAL_TOKEN_IDS["eos_token"]:
                    # eos ends the sequence and is not stored.
                    remove_ids.append(cur_id)
                else:
                    qa_results[cur_id] = torch.cat((qa_results[cur_id], next_tokens[i]))
                    if len(qa_results[cur_id]) in [max_tot_lens[cur_id], max_len]:
                        # Length limit reached.
                        remove_ids.append(cur_id)
                    else:
                        # Still decoding: keep the updated past (stored in half precision).
                        for layer_id in range(MODEL_CONFIG.n_layer):
                            all_pasts[layer_id][cur_id] = pasts[layer_id][:, i].type(torch.half)
        for idx in remove_ids:
            remove_id(idx, need_process, all_pasts)

def get_gen_token(task):
    """Return the generation token for ``task``: the task name wrapped in double underscores."""
    return "".join(("__", task, "__"))

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

    Works on a single distribution ``(vocab,)`` or on a batch ``(..., vocab)``;
    filtering is applied independently along the last dimension. ``logits`` is
    modified IN PLACE and also returned.

    Args:
        logits: float tensor of logits; the last dimension is the vocabulary.
        top_k: if > 0, keep only the ``top_k`` highest-scoring tokens.
        top_p: if > 0.0, keep the smallest set of highest-scoring tokens whose
            cumulative probability reaches ``top_p`` (the token that crosses
            the threshold is kept). Applied after top-k.
            Nucleus filtering is described in Holtzman et al.
            (http://arxiv.org/abs/1904.09751)
        filter_value: value written over the removed logits.

    Returns:
        The same tensor object as ``logits``, with removed entries overwritten.

    Adapted from: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove every token scoring below the k-th best token of its row.
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value

    if top_p > 0.0:
        sorted_logits, _ = torch.sort(logits, dim=-1, descending=True)
        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        # A token at sorted position j is kept iff the mass BEFORE it is still
        # <= top_p. Cumulative mass is non-decreasing, so the kept tokens form
        # a prefix and this count is the sorted index of the last kept token.
        last_kept = (cumulative_probs[..., :-1] <= top_p).sum(dim=-1, keepdim=True)
        cutoff = sorted_logits.gather(-1, last_kept)
        logits[logits < cutoff] = filter_value
    return logits

def logits_to_tokens(next_logits):
    """Sample one token id per row from ``next_logits``.

    The logits are first restricted with ``top_k_top_p_filtering`` (using the
    notebook-level ``top_k_qa`` / ``top_p_qa`` settings), turned into
    probabilities with a softmax over the last dimension, and sampled with
    ``torch.multinomial``. The result has one sampled id per input row.
    """
    kept_logits = top_k_top_p_filtering(next_logits, top_k=top_k_qa, top_p=top_p_qa)
    sampling_probs = F.softmax(kept_logits, dim=-1)
    return torch.multinomial(sampling_probs, num_samples=1)

In [9]:
def get_test_score(task_eval,qa_results,score_dict):
    """Score ``qa_results`` for ``task_eval`` and store it in ``score_dict``.

    Optional metrics of ``compute_metrics`` are switched on by substring match
    on the task name. The score is written to ``score_dict[task_eval]``;
    nothing is returned.
    """
    metric_flags = {
        "bleu": 'iwslt.en.de' in task_eval or 'multinli.in.out' in task_eval,
        "dialogue": 'woz.en' in task_eval,
        "rouge": 'cnn_dailymail' in task_eval,
        "logical_form": 'wikisql' in task_eval,
        "corpus_f1": 'zre' in task_eval,
    }
    score_dict[task_eval] = compute_metrics(qa_results, **metric_flags)


In [10]:
def _adapt_on_support(base_model, support_x, support_y, loss_fct):
    """Return a deep copy of ``base_model`` fine-tuned on one support batch.

    The copy is trained for ``num_updates`` optimizer steps on
    ``(support_x, support_y)`` and returned in eval mode; ``base_model`` itself
    is left untouched.
    """
    model = copy.deepcopy(base_model)
    model.train()

    max_grad_norm = 1
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=6.25e-5, eps=1e-4)
    optimizer = FP16_Optimizer(optimizer, static_loss_scale=None, dynamic_loss_scale=True,
                               dynamic_loss_args={'scale_window': 100, 'min_scale': 1, 'delayed_shift': 2})

    for _ in range(num_updates):
        # The model returns a tuple; element 0 holds the logits.
        qa_logits = model(support_x)[0]
        qa_loss = loss_fct(qa_logits.transpose(1, 2), support_y)
        logger.info(f"[DEBUG] Adaptation loss: {qa_loss.item()}")

        # FP16_Optimizer replaces loss.backward() with its own backward call.
        optimizer.backward(qa_loss, update_master_grads=False)
        optimizer.update_master_grads()
        optimizer.clip_master_grads(max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()

    model.eval()
    return model


def test_one_to_one(task_load, task_eval, model, score_dict):
    """Evaluate the checkpoint of ``task_load`` on the test set of ``task_eval``.

    For every query (test) batch the base model is copied, adapted for a few
    steps on the next support batch drawn from the ``task_eval`` training set,
    and then used to sample an answer for each query example. Predictions and
    references are logged, scored with ``get_test_score`` and written to
    ``qa_<task_eval>.csv`` in the model directory.

    Args:
        task_load: task whose checkpoint / gen token is used.
        task_eval: task whose train (support) and test (query) data are used.
        model: wrapped model. The caller is expected to have stored the loaded
            checkpoint dict on ``model.state_dict`` and the output directory on
            ``model.model_dir`` (see the driver cell).
        score_dict: dict updated in place with ``score_dict[task_eval]``.

    Returns:
        ``(model, score_dict)`` where ``model`` is the adapted copy used for
        the last query batch (or the input model if the test set is empty).
    """
    tic_TASK = time.time()
    logger.info("start to test { task: %s (load) %s (eval)}" % (task_load, task_eval))
    print("start to test { task: %s (load) %s (eval)}" % (task_load, task_eval))

    # Support set = training split, query set = test split of the evaluated task.
    support_qadata = QADataset(TASK_DICT[task_eval]["train"], "train", SPECIAL_TOKEN_IDS[task_load])
    test_qadata = QADataset(TASK_DICT[task_eval]["test"], "test", SPECIAL_TOKEN_IDS[task_load]).sort()

    max_a_len = test_qadata.max_a_len
    n_examples = len(test_qadata)
    logger.info("len of test dataset: {}".format(n_examples))
    print("len of test dataset: {}".format(n_examples))

    support_dataloader = create_dataloader(support_qadata, "train")
    test_dataloader = create_dataloader(test_qadata, "test")
    iter_support_dataloader = iter(support_dataloader)

    # `model.state_dict` has been replaced by the loaded checkpoint dict in the
    # driver cell, so this restores the checkpoint weights into the base model.
    model.load_state_dict(model.state_dict)
    model_0 = model

    # Same loss for adaptation and for the diagnostic query loss.
    train_loss_fct = CrossEntropyLoss(ignore_index=FILL_VAL, weight=TOKENS_WEIGHT)

    need_process = OrderedDict()
    # Indexed by example counter `cnt`: token tensor while decoding, then
    # [predicted_text, reference_texts] once scored.
    qa_results = [0 for _ in range(n_examples)]
    # all_pasts[layer_id][cnt]: cached past of each example.
    all_pasts = [[0 for _ in range(n_examples)] for __ in range(MODEL_CONFIG.n_layer)]
    # max_tot_lens[cnt]: cap on total length (cq length + longest answer).
    max_tot_lens = [0 for _ in range(n_examples)]

    cnt = 0
    for query_x, query_x_len, query_x_cqa, _, query_y, _, _ in test_dataloader:
        # Next support batch; restart the support loader when it runs out.
        try:
            _, _, support_x, _, support_y, _, _ = next(iter_support_dataloader)
        except StopIteration:
            iter_support_dataloader = iter(support_dataloader)
            _, _, support_x, _, support_y, _, _ = next(iter_support_dataloader)

        n_inputs = sum(_cq.shape[0] for _cq in query_x)

        # The collate function returns one chunk per GPU; with a single GPU
        # only the first chunk exists.
        support_x = support_x[0].to(DEVICE)
        support_y = support_y[0].to(DEVICE)
        query_x = query_x[0].to(DEVICE)
        query_y = query_y[0]
        query_x_len = query_x_len[0]
        query_x_cqa = query_x_cqa[0].to(DEVICE)

        ### Adaptation phase: fresh copy of the base model, tuned on the support batch.
        model = _adapt_on_support(model_0, support_x, support_y, train_loss_fct)

        ### Query phase: inference only, so no autograd graph is recorded.
        with torch.no_grad():
            # Element 0 holds the logits, element 1 the per-layer pasts.
            all_outputs = model(query_x)
            outputs = all_outputs[0]
            pasts = all_outputs[1]
            # Logits at the last real (unpadded) position of each query.
            next_logits = outputs[range(n_inputs), query_x_len-1, :] / temperature_qa
            next_tokens = logits_to_tokens(next_logits).cpu()

            # Diagnostic only: teacher-forced loss on the full query example.
            qa_logits = model(query_x_cqa)[0]
            qa_loss = train_loss_fct(qa_logits.transpose(1,2), query_y.to(DEVICE))
        logger.info(f"[DEBUG] QUERY LOSS: {qa_loss.item()}")

        query_x_cpu = query_x.cpu()
        for batch_i in range(n_inputs):
            cq_len = int(query_x_len[batch_i])
            max_tot_lens[cnt] = max_a_len + test_qadata[cnt][1]
            # Start from the unpadded context + question.
            qa_results[cnt] = query_x_cpu[batch_i][:cq_len]

            if next_tokens[batch_i] != SPECIAL_TOKEN_IDS["eos_token"]:
                qa_results[cnt] = torch.cat((qa_results[cnt], next_tokens[batch_i]))
                # Queue for further decoding unless a length limit is already hit.
                if len(qa_results[cnt]) not in [max_tot_lens[cnt], max_len]:
                    need_process.update([[cnt, None]])
                    for layer_id in range(MODEL_CONFIG.n_layer):
                        all_pasts[layer_id][cnt] = pasts[layer_id][:, batch_i, ..., :cq_len, :].type(torch.half)

            # Finish decoding this example before it is scored below.
            with torch.no_grad():
                sample_sequence(model, need_process, qa_results, all_pasts, max_tot_lens)

            generated = qa_results[cnt].tolist()
            logger.info(f"[ERROR_ANALYSIS] {task_eval} {cnt}/{n_examples} Predicted Answer {TOKENIZER.decode(generated)}")
            logger.info(f"[ERROR_ANALYSIS] {task_eval} {cnt}/{n_examples} Predicted Tokens {generated[cq_len:]}")

            # Reference answer: drop padding labels and the trailing eos.
            Y = query_y[batch_i].tolist()
            Y = [tok for tok in Y if tok != FILL_VAL][:-1]
            logger.info(f"[ERROR_ANALYSIS] {task_eval} {cnt}/{n_examples} Actual Tokens {Y}")
            # Multiple reference answers are separated by the pad token id.
            Y = ' '.join([str(y) for y in Y]).split(str(SPECIAL_TOKEN_IDS["pad_token"]))
            Y = [TOKENIZER.decode(list(map(int, y.split()))) for y in Y]
            # Replace the token tensor with [predicted text, reference texts].
            qa_results[cnt] = [TOKENIZER.decode(generated[cq_len:]), Y]
            print(f"Predict vs Actual {cnt}/{n_examples}", qa_results[cnt])
            logger.info(f"[ERROR_ANALYSIS] {task_eval} {cnt}/{n_examples} Actual Answer {Y}")
            logger.info(f"[ERROR_ANALYSIS] {task_eval} {cnt}/{n_examples} Predict vs Actual {qa_results[cnt]}")

            cnt += 1

    toc_TASK = time.time() - tic_TASK
    logger.info(f'[TIME] TASK {(task_load, task_eval)} {toc_TASK}')

    get_test_score(task_eval, qa_results, score_dict)
    print(score_dict)

    model_dir = model_0.model_dir
    results_path = os.path.join(model_dir,f"qa_{task_eval}.csv")
    # newline='' lets the csv module control line endings itself.
    with open(results_path, "w", encoding="utf-8", newline='') as f:
        qa_writer = csv.writer(f,delimiter=',')
        qa_writer.writerow(["y","pred"])
        for pred, y in qa_results:
            qa_writer.writerow([y,pred])

    return model, score_dict

In [11]:
%%time

# Driver: for every trained task checkpoint, evaluate on every task and dump
# the resulting scores to metrics-<task>.json in the model directory.
for task in tasks:
    
    model_path = os.path.join(MODEL_DIR, f"{task}.model")
    config_path = os.path.join(MODEL_DIR,CONFIG_NAME)

    # Register this task's generation token. Tokens are added in `tasks`
    # order, so the vocabulary grows by one entry per loop iteration.
    gen_token = get_gen_token(task)
    TOKENIZER.add_tokens([gen_token])
    SPECIAL_TOKENS[task] = gen_token
    SPECIAL_TOKEN_IDS[task] = TOKENIZER.convert_tokens_to_ids(gen_token)
#     model_config = CONFIG_CLASS.from_json_file(config_path) # Already defined
    model = MODEL_CLASS(model_config).cuda()
    # Don't load state dict here, load for every adaptation phase!
    
#     print(model)
    print(model_path)
    
    # Keep the loss weights as long as the vocabulary: append a weight of 1
    # for the token added above. (`global` has no effect at cell level.)
    global TOKENS_WEIGHT
    if len(TOKENIZER) != TOKENS_WEIGHT.shape[0]:
        TOKENS_WEIGHT = torch.cat((TOKENS_WEIGHT, torch.ones([1]).cuda()))
    
    # Resize the embeddings to the grown vocabulary before wrapping for fp16.
    model.resize_token_embeddings(len(TOKENIZER))
    model = FP16_Module(model)
    
    
    model.model_dir = MODEL_DIR
    model.model_path = model_path
    # Try Loading the state dict like this!
    # NOTE: this replaces the `state_dict` method with the loaded checkpoint
    # dict; test_one_to_one relies on that when it calls
    # model.load_state_dict(model.state_dict).
    model.state_dict = torch.load(model.model_path, map_location='cuda:0')
    logger.info(f"task: {task}")
    score_dict = {k:None for k in tasks}
    
    # Evaluate this checkpoint on every task; scores accumulate in score_dict.
    for task_eval in tasks:
        test_one_to_one(task, task_eval, model, score_dict)
    logger.info("score: {}".format(score_dict))

    with open(os.path.join(MODEL_DIR, f"metrics-{task}.json"),"w") as f:
        json.dump(score_dict, f)

/data/model_runs/20210905T094223_sbm_SEQ_MAML/scifact.model
start to test { task: scifact (load) scifact (eval)}
len of test dataset: 188
Predict vs Actual 0/188 [' SUPPORTS', [' REFUTES']]
Predict vs Actual 1/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 2/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 3/188 [' CONCLUSIONS', [' SUPPORTS']]
Predict vs Actual 4/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 5/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 6/188 [' SUPPORTS', [' REFUTES']]
Predict vs Actual 7/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 8/188 [' SUPPORTS', [' REFUTES']]
Predict vs Actual 9/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 10/188 [' SUPPORTS', [' REFUTES']]
Predict vs Actual 11/188 [' SUPPORTS', [' REFUTES']]
Predict vs Actual 12/188 ['', [' SUPPORTS']]
Predict vs Actual 13/188 [' F', [' SUPPORTS']]
Predict vs Actual 14/188 ['\n __unk__ SUPP', [' SUPPORTS']]
Predict vs Actual 15/188 ['', [' REFUTES']]
Predict vs Actual 16/188 [' __

Predict vs Actual 156/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 157/188 [' REFUTES', [' REFUTES']]
Predict vs Actual 158/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 159/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 160/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 161/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 162/188 [' REFUTES', [' SUPPORTS']]
Predict vs Actual 163/188 [' REFUTES', [' SUPPORTS']]
Predict vs Actual 164/188 [' SUPPORTS', [' REFUTES']]
Predict vs Actual 165/188 [' REFUTES', [' SUPPORTS']]
Predict vs Actual 166/188 [' SUPPORTS', [' REFUTES']]
Predict vs Actual 167/188 [' SUPPORTS', [' REFUTES']]
Predict vs Actual 168/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 169/188 [' REFUTES', [' SUPPORTS']]
Predict vs Actual 170/188 [' SUPPORTS', [' REFUTES']]
Predict vs Actual 171/188 [' REFUTES', [' SUPPORTS']]
Predict vs Actual 172/188 [' SUPPORTS', [' REFUTES']]
Predict vs Actual 173/188 [' SUPPORTS', [' SUPPORTS']]
Predict vs Actual 174/

Predict vs Actual 136/2807 [' PER', [' True']]
Predict vs Actual 137/2807 [' RE', [' True']]
Predict vs Actual 138/2807 [' SUP', [' True']]
Predict vs Actual 139/2807 [' RE', [' True']]
Predict vs Actual 140/2807 [' __unk__', [' True']]
Predict vs Actual 141/2807 ['', [' True']]
Predict vs Actual 142/2807 [' RE', [' False']]
Predict vs Actual 143/2807 [' SUP', [' True']]
Predict vs Actual 144/2807 [' SUP', [' False']]
Predict vs Actual 145/2807 [' SUP', [' False']]
Predict vs Actual 146/2807 [' RE', [' False']]
Predict vs Actual 147/2807 [' SUP', [' True']]
Predict vs Actual 148/2807 [' SUP', [' False']]
Predict vs Actual 149/2807 [' RE', [' False']]
Predict vs Actual 150/2807 [' __pad__', [' True']]
Predict vs Actual 151/2807 [' SUP', [' False']]
Predict vs Actual 152/2807 [' __ans__', [' False']]
Predict vs Actual 153/2807 [' __unk__', [' False']]
Predict vs Actual 154/2807 [' SUP', [' False']]
Predict vs Actual 155/2807 [' SUP', [' True']]
Predict vs Actual 156/2807 [' __unk__', [' 

Predict vs Actual 308/2807 [' LEG', [' False']]
Predict vs Actual 309/2807 [' SUP', [' True']]
Predict vs Actual 310/2807 [' CON', [' True']]
Predict vs Actual 311/2807 [' S', [' True']]
Predict vs Actual 312/2807 ['', [' True']]
Predict vs Actual 313/2807 [' __unk__', [' True']]
Predict vs Actual 314/2807 [' __scifact__', [' True']]
Predict vs Actual 315/2807 [' __unk__', [' True']]
Predict vs Actual 316/2807 [' SUP', [' False']]
Predict vs Actual 317/2807 [' __unk__', [' False']]
Predict vs Actual 318/2807 [' __unk__', [' True']]
Predict vs Actual 319/2807 [' __unk__', [' False']]
Predict vs Actual 320/2807 [' SUP', [' False']]
Predict vs Actual 321/2807 [' __pad__', [' False']]
Predict vs Actual 322/2807 ['', [' True']]
Predict vs Actual 323/2807 [' __ans__', [' True']]
Predict vs Actual 324/2807 [' SUP', [' False']]
Predict vs Actual 325/2807 [' CONT', [' False']]
Predict vs Actual 326/2807 [' OFF', [' True']]
Predict vs Actual 327/2807 ['', [' False']]
Predict vs Actual 328/2807 [

Predict vs Actual 488/2807 [' __unk__', [' False']]
Predict vs Actual 489/2807 [' SUP', [' True']]
Predict vs Actual 490/2807 [' __ans__', [' False']]
Predict vs Actual 491/2807 ['', [' False']]
Predict vs Actual 492/2807 [' COMP', [' True']]
Predict vs Actual 493/2807 ['', [' True']]
Predict vs Actual 494/2807 [' __ans__', [' True']]
Predict vs Actual 495/2807 ['', [' True']]
Predict vs Actual 496/2807 [' IN', [' False']]
Predict vs Actual 497/2807 ['', [' True']]
Predict vs Actual 498/2807 ['', [' False']]
Predict vs Actual 499/2807 [' SUP', [' False']]
Predict vs Actual 500/2807 [' SUP', [' False']]
Predict vs Actual 501/2807 [' SUP', [' False']]
Predict vs Actual 502/2807 [' SUP', [' True']]
Predict vs Actual 503/2807 [' SUP', [' False']]
Predict vs Actual 504/2807 ['', [' True']]
Predict vs Actual 505/2807 [' SUP', [' True']]
Predict vs Actual 506/2807 [' F', [' False']]
Predict vs Actual 507/2807 [' __unk__', [' True']]
Predict vs Actual 508/2807 ['', [' False']]
Predict vs Actua

Predict vs Actual 668/2807 [' S', [' True']]
Predict vs Actual 669/2807 [' -', [' True']]
Predict vs Actual 670/2807 ['P', [' False']]
Predict vs Actual 671/2807 [' RE', [' False']]
Predict vs Actual 672/2807 [' SUP', [' False']]
Predict vs Actual 673/2807 ['', [' True']]
Predict vs Actual 674/2807 [' SUP', [' False']]
Predict vs Actual 675/2807 [' SUP', [' True']]
Predict vs Actual 676/2807 [' SUP', [' True']]
Predict vs Actual 677/2807 [' RE', [' True']]
Predict vs Actual 678/2807 [' RE', [' False']]
Predict vs Actual 679/2807 ['', [' True']]
Predict vs Actual 680/2807 [' SUP', [' True']]
Predict vs Actual 681/2807 [' __unk__', [' True']]
Predict vs Actual 682/2807 ['', [' False']]
Predict vs Actual 683/2807 ['.', [' True']]
Predict vs Actual 684/2807 ['-', [' False']]
Predict vs Actual 685/2807 ['', [' True']]
Predict vs Actual 686/2807 [' PART', [' True']]
Predict vs Actual 687/2807 [' SUP', [' True']]
Predict vs Actual 688/2807 [' __unk__', [' True']]
Predict vs Actual 689/2807 ['

Predict vs Actual 848/2807 [' SUP', [' False']]
Predict vs Actual 849/2807 ['.', [' True']]
Predict vs Actual 850/2807 ['', [' True']]
Predict vs Actual 851/2807 [' S', [' True']]
Predict vs Actual 852/2807 [' SUP', [' False']]
Predict vs Actual 853/2807 [' SUP', [' False']]
Predict vs Actual 854/2807 ['', [' False']]
Predict vs Actual 855/2807 [' SUP', [' True']]
Predict vs Actual 856/2807 [' OF', [' True']]
Predict vs Actual 857/2807 [' SUP', [' True']]
Predict vs Actual 858/2807 ['', [' False']]
Predict vs Actual 859/2807 [' SUP', [' True']]
Predict vs Actual 860/2807 [' SUP', [' False']]
Predict vs Actual 861/2807 [' SUP', [' False']]
Predict vs Actual 862/2807 ['\n', [' True']]
Predict vs Actual 863/2807 [' SUP', [' True']]
Predict vs Actual 864/2807 [' RE', [' False']]
Predict vs Actual 865/2807 [' SUP', [' False']]
Predict vs Actual 866/2807 [' SUP', [' False']]
Predict vs Actual 867/2807 [' SUP', [' False']]
Predict vs Actual 868/2807 [' SUP', [' True']]
Predict vs Actual 869/2

Predict vs Actual 1028/2807 [' SUP', [' True']]
Predict vs Actual 1029/2807 ['', [' False']]
Predict vs Actual 1030/2807 [' SUP', [' False']]
Predict vs Actual 1031/2807 [' SUP', [' True']]
Predict vs Actual 1032/2807 [' SUP', [' False']]
Predict vs Actual 1033/2807 [' SUP', [' True']]
Predict vs Actual 1034/2807 ['', [' False']]
Predict vs Actual 1035/2807 [' S', [' True']]
Predict vs Actual 1036/2807 ['', [' True']]
Predict vs Actual 1037/2807 [' __unk__', [' False']]
Predict vs Actual 1038/2807 [' SUP', [' True']]
Predict vs Actual 1039/2807 [' FUN', [' True']]
Predict vs Actual 1040/2807 [' SUP', [' True']]
Predict vs Actual 1041/2807 [' IND', [' True']]
Predict vs Actual 1042/2807 [' SUP', [' True']]
Predict vs Actual 1043/2807 ['', [' True']]
Predict vs Actual 1044/2807 ['', [' False']]
Predict vs Actual 1045/2807 [' SUP', [' False']]
Predict vs Actual 1046/2807 [' RE', [' True']]
Predict vs Actual 1047/2807 [' RE', [' True']]
Predict vs Actual 1048/2807 [' END', [' True']]
Predi

Predict vs Actual 1204/2807 [' SUP', [' False']]
Predict vs Actual 1205/2807 [' SUP', [' True']]
Predict vs Actual 1206/2807 [' SUP', [' True']]
Predict vs Actual 1207/2807 ['', [' True']]
Predict vs Actual 1208/2807 [' F', [' False']]
Predict vs Actual 1209/2807 [' INTER', [' False']]
Predict vs Actual 1210/2807 [' SUP', [' True']]
Predict vs Actual 1211/2807 [' SUP', [' True']]
Predict vs Actual 1212/2807 [' DES', [' True']]
Predict vs Actual 1213/2807 [' OF', [' True']]
Predict vs Actual 1214/2807 ['', [' False']]
Predict vs Actual 1215/2807 [' IND', [' False']]
Predict vs Actual 1216/2807 [' SUP', [' True']]
Predict vs Actual 1217/2807 ['', [' False']]
Predict vs Actual 1218/2807 ['', [' True']]
Predict vs Actual 1219/2807 [' TOP', [' False']]
Predict vs Actual 1220/2807 [' __unk__', [' False']]
Predict vs Actual 1221/2807 ['', [' True']]
Predict vs Actual 1222/2807 ['', [' True']]
Predict vs Actual 1223/2807 ['P', [' True']]
Predict vs Actual 1224/2807 [' SUP', [' True']]
Predict 

Predict vs Actual 1380/2807 [' RES', [' False']]
Predict vs Actual 1381/2807 ['\n', [' False']]
Predict vs Actual 1382/2807 [' SUP', [' True']]
Predict vs Actual 1383/2807 ['', [' True']]
Predict vs Actual 1384/2807 [' T', [' True']]
Predict vs Actual 1385/2807 [' SUP', [' True']]
Predict vs Actual 1386/2807 ['F', [' False']]
Predict vs Actual 1387/2807 [' IND', [' False']]
Predict vs Actual 1388/2807 [' F', [' False']]
Predict vs Actual 1389/2807 [' SUP', [' False']]
Predict vs Actual 1390/2807 [' SUP', [' True']]
Predict vs Actual 1391/2807 [' REL', [' True']]
Predict vs Actual 1392/2807 [' RE', [' False']]
Predict vs Actual 1393/2807 [' SUP', [' True']]
Predict vs Actual 1394/2807 [' SUP', [' False']]
Predict vs Actual 1395/2807 [' SH', [' True']]
Predict vs Actual 1396/2807 [' SUB', [' True']]
Predict vs Actual 1397/2807 ['', [' False']]
Predict vs Actual 1398/2807 [' __unk__', [' False']]
Predict vs Actual 1399/2807 [' SH', [' True']]
Predict vs Actual 1400/2807 [' (', [' False']]

Predict vs Actual 1556/2807 [' SUP', [' True']]
Predict vs Actual 1557/2807 [' SUP', [' False']]
Predict vs Actual 1558/2807 ['C', [' True']]
Predict vs Actual 1559/2807 ['', [' False']]
Predict vs Actual 1560/2807 [' RE', [' True']]
Predict vs Actual 1561/2807 [' SUP', [' True']]
Predict vs Actual 1562/2807 [' SUP', [' False']]
Predict vs Actual 1563/2807 [' SUP', [' True']]
Predict vs Actual 1564/2807 ['', [' True']]
Predict vs Actual 1565/2807 [' RE', [' True']]
Predict vs Actual 1566/2807 [' SUP', [' True']]
Predict vs Actual 1567/2807 ['I', [' False']]
Predict vs Actual 1568/2807 [' SUP', [' False']]
Predict vs Actual 1569/2807 [' SUP', [' True']]
Predict vs Actual 1570/2807 [' F', [' False']]
Predict vs Actual 1571/2807 ['', [' False']]
Predict vs Actual 1572/2807 [' P', [' True']]
Predict vs Actual 1573/2807 [' SUP', [' True']]
Predict vs Actual 1574/2807 [' SUP', [' False']]
Predict vs Actual 1575/2807 [' SUP', [' True']]
Predict vs Actual 1576/2807 [' SUP', [' False']]
Predict

Predict vs Actual 1732/2807 [' SUP', [' True']]
Predict vs Actual 1733/2807 [' SUP', [' False']]
Predict vs Actual 1734/2807 [' SUP', [' True']]
Predict vs Actual 1735/2807 ['', [' True']]
Predict vs Actual 1736/2807 [' F', [' True']]
Predict vs Actual 1737/2807 ['', [' True']]
Predict vs Actual 1738/2807 ['', [' True']]
Predict vs Actual 1739/2807 [' SUP', [' False']]
Predict vs Actual 1740/2807 [' __unk__', [' False']]
Predict vs Actual 1741/2807 [' SUP', [' True']]
Predict vs Actual 1742/2807 ['F', [' False']]
Predict vs Actual 1743/2807 ['', [' False']]
Predict vs Actual 1744/2807 [' SUP', [' True']]
Predict vs Actual 1745/2807 [' CONT', [' True']]
Predict vs Actual 1746/2807 [' SUP', [' False']]
Predict vs Actual 1747/2807 [' SUP', [' True']]
Predict vs Actual 1748/2807 [' SUP', [' False']]
Predict vs Actual 1749/2807 [' RE', [' True']]
Predict vs Actual 1750/2807 [' SUP', [' True']]
Predict vs Actual 1751/2807 [' THE', [' True']]
Predict vs Actual 1752/2807 [' SUP', [' False']]
P

Predict vs Actual 1908/2807 [' RE', [' True']]
Predict vs Actual 1909/2807 ['S', [' False']]
Predict vs Actual 1910/2807 [' SUP', [' True']]
Predict vs Actual 1911/2807 ['', [' True']]
Predict vs Actual 1912/2807 ['', [' True']]
Predict vs Actual 1913/2807 [' GUN', [' True']]
Predict vs Actual 1914/2807 [' P', [' False']]
Predict vs Actual 1915/2807 [' PR', [' True']]
Predict vs Actual 1916/2807 ['', [' True']]
Predict vs Actual 1917/2807 [' SUP', [' True']]
Predict vs Actual 1918/2807 [' SUP', [' True']]
Predict vs Actual 1919/2807 ['', [' True']]
Predict vs Actual 1920/2807 ['', [' True']]
Predict vs Actual 1921/2807 [' WIN', [' True']]
Predict vs Actual 1922/2807 [' ST', [' False']]
Predict vs Actual 1923/2807 [' SUP', [' False']]
Predict vs Actual 1924/2807 [' SUP', [' False']]
Predict vs Actual 1925/2807 ['SP', [' True']]
Predict vs Actual 1926/2807 [' SUP', [' False']]
Predict vs Actual 1927/2807 [' SUP', [' True']]
Predict vs Actual 1928/2807 ['', [' True']]
Predict vs Actual 19

Predict vs Actual 2084/2807 [' SUP', [' True']]
Predict vs Actual 2085/2807 [' THE', [' True']]
Predict vs Actual 2086/2807 [' SUP', [' True']]
Predict vs Actual 2087/2807 ['S', [' True']]
Predict vs Actual 2088/2807 [' SUP', [' True']]
Predict vs Actual 2089/2807 [' SUP', [' False']]
Predict vs Actual 2090/2807 [' SUP', [' True']]
Predict vs Actual 2091/2807 ['', [' True']]
Predict vs Actual 2092/2807 [' SUP', [' False']]
Predict vs Actual 2093/2807 ['', [' True']]
Predict vs Actual 2094/2807 [' SUP', [' True']]
Predict vs Actual 2095/2807 ['', [' True']]
Predict vs Actual 2096/2807 [' SUP', [' True']]
Predict vs Actual 2097/2807 [' __ans__', [' False']]
Predict vs Actual 2098/2807 ['', [' True']]
Predict vs Actual 2099/2807 ['', [' False']]
Predict vs Actual 2100/2807 [' SUP', [' True']]
Predict vs Actual 2101/2807 ['', [' True']]
Predict vs Actual 2102/2807 ['OF', [' False']]
Predict vs Actual 2103/2807 [' SUP', [' False']]
Predict vs Actual 2104/2807 [' SUP', [' True']]
Predict vs 

Predict vs Actual 2260/2807 [' DE', [' False']]
Predict vs Actual 2261/2807 ['', [' False']]
Predict vs Actual 2262/2807 [' SUP', [' False']]
Predict vs Actual 2263/2807 [' SUP', [' False']]
Predict vs Actual 2264/2807 ['', [' True']]
Predict vs Actual 2265/2807 [' WE', [' True']]
Predict vs Actual 2266/2807 [' RE', [' False']]
Predict vs Actual 2267/2807 [' D', [' True']]
Predict vs Actual 2268/2807 [' SUP', [' False']]
Predict vs Actual 2269/2807 [' RE', [' True']]
Predict vs Actual 2270/2807 ['F', [' False']]
Predict vs Actual 2271/2807 ['', [' True']]
Predict vs Actual 2272/2807 [' SUP', [' False']]
Predict vs Actual 2273/2807 [' SUP', [' True']]
Predict vs Actual 2274/2807 [' SUP', [' True']]
Predict vs Actual 2275/2807 [' H', [' False']]
Predict vs Actual 2276/2807 [' LIST', [' False']]
Predict vs Actual 2277/2807 [' SUP', [' True']]
Predict vs Actual 2278/2807 ['', [' True']]
Predict vs Actual 2279/2807 [' __scifact__', [' False']]
Predict vs Actual 2280/2807 [' SUP', [' False']

Predict vs Actual 2436/2807 [' __ans__', [' True']]
Predict vs Actual 2437/2807 [' OF', [' False']]
Predict vs Actual 2438/2807 [' RE', [' False']]
Predict vs Actual 2439/2807 ['', [' False']]
Predict vs Actual 2440/2807 [' RE', [' False']]
Predict vs Actual 2441/2807 [' RE', [' True']]
Predict vs Actual 2442/2807 ['', [' False']]
Predict vs Actual 2443/2807 [' SUP', [' True']]
Predict vs Actual 2444/2807 [' SUP', [' True']]
Predict vs Actual 2445/2807 [' SUP', [' True']]
Predict vs Actual 2446/2807 ['', [' True']]
Predict vs Actual 2447/2807 [' __ans__', [' True']]
Predict vs Actual 2448/2807 [' COMP', [' True']]
Predict vs Actual 2449/2807 [' SUP', [' True']]
Predict vs Actual 2450/2807 [' SUP', [' False']]
Predict vs Actual 2451/2807 [' YEAR', [' False']]
Predict vs Actual 2452/2807 [' SUP', [' False']]
Predict vs Actual 2453/2807 ['10', [' True']]
Predict vs Actual 2454/2807 [' SUP', [' True']]
Predict vs Actual 2455/2807 [' SUP', [' True']]
Predict vs Actual 2456/2807 [' RE', [' F

Predict vs Actual 2612/2807 [' of', [' True']]
Predict vs Actual 2613/2807 ['', [' True']]
Predict vs Actual 2614/2807 [' OF', [' True']]
Predict vs Actual 2615/2807 [').', [' True']]
Predict vs Actual 2616/2807 ['', [' False']]
Predict vs Actual 2617/2807 [' S', [' True']]
Predict vs Actual 2618/2807 ['S', [' False']]
Predict vs Actual 2619/2807 [' SUP', [' True']]
Predict vs Actual 2620/2807 [' SUP', [' False']]
Predict vs Actual 2621/2807 [' RE', [' True']]
Predict vs Actual 2622/2807 [' SUP', [' False']]
Predict vs Actual 2623/2807 [' RE', [' False']]
Predict vs Actual 2624/2807 [' SUP', [' True']]
Predict vs Actual 2625/2807 [' SUP', [' True']]
Predict vs Actual 2626/2807 [' SUP', [' False']]
Predict vs Actual 2627/2807 [' END', [' True']]
Predict vs Actual 2628/2807 ['', [' True']]
Predict vs Actual 2629/2807 [' SUP', [' True']]
Predict vs Actual 2630/2807 [' __unk__', [' True']]
Predict vs Actual 2631/2807 [' REC', [' True']]
Predict vs Actual 2632/2807 [' SUP', [' False']]
Pred

Predict vs Actual 2788/2807 ['', [' False']]
Predict vs Actual 2789/2807 ['', [' True']]
Predict vs Actual 2790/2807 ['', [' True']]
Predict vs Actual 2791/2807 [' SUP', [' True']]
Predict vs Actual 2792/2807 ['', [' True']]
Predict vs Actual 2793/2807 ['', [' True']]
Predict vs Actual 2794/2807 [' SUP', [' True']]
Predict vs Actual 2795/2807 ['.', [' False']]
Predict vs Actual 2796/2807 ['', [' True']]
Predict vs Actual 2797/2807 [' SUP', [' False']]
Predict vs Actual 2798/2807 [' SUP', [' False']]
Predict vs Actual 2799/2807 [' RE', [' False']]
Predict vs Actual 2800/2807 ['F', [' True']]
Predict vs Actual 2801/2807 [' IN', [' True']]
Predict vs Actual 2802/2807 [' SUP', [' True']]
Predict vs Actual 2803/2807 ['', [' True']]
Predict vs Actual 2804/2807 [' SUP', [' True']]
Predict vs Actual 2805/2807 [' SUP', [' True']]
Predict vs Actual 2806/2807 ['', [' True']]
{'scifact': OrderedDict([('em', 42.5531914893617), ('nf1', 44.326241134751776), ('nem', 43.61702127659575)]), 'boolq': Orde

Predict vs Actual 160/200 ['', [' NEG']]
Predict vs Actual 161/200 ['', [' NEG']]
Predict vs Actual 162/200 [' STORY', [' NEG']]
Predict vs Actual 163/200 ['', [' NEG']]
Predict vs Actual 164/200 ['', [' NEG']]
Predict vs Actual 165/200 ['', [' NEG']]
Predict vs Actual 166/200 [' SH', [' POS']]
Predict vs Actual 167/200 ['', [' POS']]
Predict vs Actual 168/200 ['', [' POS']]
Predict vs Actual 169/200 ['', [' POS']]
Predict vs Actual 170/200 [' RE', [' POS']]
Predict vs Actual 171/200 [' RE', [' POS']]
Predict vs Actual 172/200 ['', [' POS']]
Predict vs Actual 173/200 ['', [' POS']]
Predict vs Actual 174/200 ['', [' POS']]
Predict vs Actual 175/200 ['', [' POS']]
Predict vs Actual 176/200 [' FIL', [' POS']]
Predict vs Actual 177/200 ['', [' POS']]
Predict vs Actual 178/200 ['', [' POS']]
Predict vs Actual 179/200 ['', [' POS']]
Predict vs Actual 180/200 ['', [' POS']]
Predict vs Actual 181/200 [' RE', [' POS']]
Predict vs Actual 182/200 ['', [' POS']]
Predict vs Actual 183/200 [' RE', [

Predict vs Actual 124/188 [' False', [' SUPPORTS']]
Predict vs Actual 125/188 [' True', [' SUPPORTS']]
Predict vs Actual 126/188 [' True', [' SUPPORTS']]
Predict vs Actual 127/188 [' True', [' REFUTES']]
Predict vs Actual 128/188 [' True', [' SUPPORTS']]
Predict vs Actual 129/188 [' True', [' SUPPORTS']]
Predict vs Actual 130/188 [' True', [' REFUTES']]
Predict vs Actual 131/188 [' False', [' SUPPORTS']]
Predict vs Actual 132/188 [' False', [' REFUTES']]
Predict vs Actual 133/188 [' True', [' SUPPORTS']]
Predict vs Actual 134/188 [' True', [' SUPPORTS']]
Predict vs Actual 135/188 [' True', [' SUPPORTS']]
Predict vs Actual 136/188 [' True', [' SUPPORTS']]
Predict vs Actual 137/188 [' False', [' SUPPORTS']]
Predict vs Actual 138/188 [' True', [' SUPPORTS']]
Predict vs Actual 139/188 [' True', [' REFUTES']]
Predict vs Actual 140/188 [' True', [' REFUTES']]
Predict vs Actual 141/188 [' True', [' SUPPORTS']]
Predict vs Actual 142/188 [' True', [' SUPPORTS']]
Predict vs Actual 143/188 [' Tru

Predict vs Actual 104/2807 [' True', [' True']]
Predict vs Actual 105/2807 [' True', [' True']]
Predict vs Actual 106/2807 [' False', [' False']]
Predict vs Actual 107/2807 [' True', [' True']]
Predict vs Actual 108/2807 [' False', [' True']]
Predict vs Actual 109/2807 [' False', [' True']]
Predict vs Actual 110/2807 [' True', [' False']]
Predict vs Actual 111/2807 [' True', [' False']]
Predict vs Actual 112/2807 [' False', [' True']]
Predict vs Actual 113/2807 [' True', [' True']]
Predict vs Actual 114/2807 [' True', [' False']]
Predict vs Actual 115/2807 [' True', [' True']]
Predict vs Actual 116/2807 [' True', [' True']]
Predict vs Actual 117/2807 [' True', [' False']]
Predict vs Actual 118/2807 [' False', [' False']]
Predict vs Actual 119/2807 [' True', [' False']]
Predict vs Actual 120/2807 [' True', [' False']]
Predict vs Actual 121/2807 [' False', [' False']]
Predict vs Actual 122/2807 [' False', [' True']]
Predict vs Actual 123/2807 [' True', [' True']]
Predict vs Actual 124/28

Predict vs Actual 272/2807 [' False', [' False']]
Predict vs Actual 273/2807 [' True', [' True']]
Predict vs Actual 274/2807 [' True', [' False']]
Predict vs Actual 275/2807 [' True', [' True']]
Predict vs Actual 276/2807 [' True', [' False']]
Predict vs Actual 277/2807 [' True', [' True']]
Predict vs Actual 278/2807 [' True', [' False']]
Predict vs Actual 279/2807 [' False', [' False']]
Predict vs Actual 280/2807 [' False', [' False']]
Predict vs Actual 281/2807 [' True', [' True']]
Predict vs Actual 282/2807 [' True', [' False']]
Predict vs Actual 283/2807 [' False', [' True']]
Predict vs Actual 284/2807 [' False', [' False']]
Predict vs Actual 285/2807 [' True', [' False']]
Predict vs Actual 286/2807 [' True', [' True']]
Predict vs Actual 287/2807 [' True', [' True']]
Predict vs Actual 288/2807 [' True', [' True']]
Predict vs Actual 289/2807 [' True', [' True']]
Predict vs Actual 290/2807 [' False', [' True']]
Predict vs Actual 291/2807 [' True', [' False']]
Predict vs Actual 292/28

Predict vs Actual 444/2807 [' True', [' True']]
Predict vs Actual 445/2807 [' True', [' True']]
Predict vs Actual 446/2807 [' False', [' False']]
Predict vs Actual 447/2807 [' True', [' False']]
Predict vs Actual 448/2807 [' True', [' False']]
Predict vs Actual 449/2807 [' True', [' True']]
Predict vs Actual 450/2807 [' True', [' False']]
Predict vs Actual 451/2807 [' True', [' True']]
Predict vs Actual 452/2807 [' True', [' True']]
Predict vs Actual 453/2807 [' False', [' True']]
Predict vs Actual 454/2807 [' True', [' True']]
Predict vs Actual 455/2807 [' False', [' True']]
Predict vs Actual 456/2807 [' True', [' True']]
Predict vs Actual 457/2807 [' True', [' True']]
Predict vs Actual 458/2807 [' True', [' True']]
Predict vs Actual 459/2807 [' True', [' False']]
Predict vs Actual 460/2807 [' False', [' True']]
Predict vs Actual 461/2807 [' True', [' True']]
Predict vs Actual 462/2807 [' True', [' True']]
Predict vs Actual 463/2807 [' True', [' False']]
Predict vs Actual 464/2807 [' 

Predict vs Actual 616/2807 [' True', [' True']]
Predict vs Actual 617/2807 [' False', [' True']]
Predict vs Actual 618/2807 [' True', [' False']]
Predict vs Actual 619/2807 [' False', [' False']]
Predict vs Actual 620/2807 [' True', [' True']]
Predict vs Actual 621/2807 [' True', [' True']]
Predict vs Actual 622/2807 [' True', [' True']]
Predict vs Actual 623/2807 [' True', [' True']]
Predict vs Actual 624/2807 [' True', [' True']]
Predict vs Actual 625/2807 [' True', [' True']]
Predict vs Actual 626/2807 [' False', [' False']]
Predict vs Actual 627/2807 [' True', [' True']]
Predict vs Actual 628/2807 [' True', [' False']]
Predict vs Actual 629/2807 [' True', [' False']]
Predict vs Actual 630/2807 [' True', [' False']]
Predict vs Actual 631/2807 [' True', [' False']]
Predict vs Actual 632/2807 [' False', [' False']]
Predict vs Actual 633/2807 [' True', [' True']]
Predict vs Actual 634/2807 [' True', [' False']]
Predict vs Actual 635/2807 [' True', [' True']]
Predict vs Actual 636/2807 

Predict vs Actual 788/2807 [' True', [' False']]
Predict vs Actual 789/2807 [' False', [' True']]
Predict vs Actual 790/2807 [' False', [' True']]
Predict vs Actual 791/2807 [' True', [' True']]
Predict vs Actual 792/2807 [' True', [' True']]
Predict vs Actual 793/2807 [' False', [' True']]
Predict vs Actual 794/2807 [' True', [' True']]
Predict vs Actual 795/2807 [' False', [' True']]
Predict vs Actual 796/2807 [' True', [' True']]
Predict vs Actual 797/2807 [' True', [' False']]
Predict vs Actual 798/2807 [' True', [' False']]
Predict vs Actual 799/2807 [' False', [' False']]
Predict vs Actual 800/2807 [' True', [' False']]
Predict vs Actual 801/2807 [' True', [' True']]
Predict vs Actual 802/2807 [' True', [' True']]
Predict vs Actual 803/2807 [' True', [' False']]
Predict vs Actual 804/2807 [' True', [' False']]
Predict vs Actual 805/2807 [' False', [' False']]
Predict vs Actual 806/2807 [' False', [' True']]
Predict vs Actual 807/2807 [' True', [' True']]
Predict vs Actual 808/280

Predict vs Actual 960/2807 [' True', [' False']]
Predict vs Actual 961/2807 [' True', [' True']]
Predict vs Actual 962/2807 [' False', [' False']]
Predict vs Actual 963/2807 [' True', [' True']]
Predict vs Actual 964/2807 [' False', [' True']]
Predict vs Actual 965/2807 [' False', [' False']]
Predict vs Actual 966/2807 [' True', [' True']]
Predict vs Actual 967/2807 [' True', [' True']]
Predict vs Actual 968/2807 [' False', [' True']]
Predict vs Actual 969/2807 [' True', [' False']]
Predict vs Actual 970/2807 [' True', [' True']]
Predict vs Actual 971/2807 [' True', [' True']]
Predict vs Actual 972/2807 [' True', [' False']]
Predict vs Actual 973/2807 [' True', [' True']]
Predict vs Actual 974/2807 [' False', [' False']]
Predict vs Actual 975/2807 [' True', [' True']]
Predict vs Actual 976/2807 [' True', [' True']]
Predict vs Actual 977/2807 [' True', [' True']]
Predict vs Actual 978/2807 [' True', [' True']]
Predict vs Actual 979/2807 [' False', [' False']]
Predict vs Actual 980/2807 

Predict vs Actual 1128/2807 [' True', [' False']]
Predict vs Actual 1129/2807 [' False', [' False']]
Predict vs Actual 1130/2807 [' False', [' True']]
Predict vs Actual 1131/2807 [' True', [' True']]
Predict vs Actual 1132/2807 [' False', [' True']]
Predict vs Actual 1133/2807 [' True', [' False']]
Predict vs Actual 1134/2807 [' True', [' True']]
Predict vs Actual 1135/2807 [' False', [' True']]
Predict vs Actual 1136/2807 [' True', [' False']]
Predict vs Actual 1137/2807 [' True', [' True']]
Predict vs Actual 1138/2807 [' True', [' True']]
Predict vs Actual 1139/2807 [' False', [' True']]
Predict vs Actual 1140/2807 [' True', [' False']]
Predict vs Actual 1141/2807 [' False', [' False']]
Predict vs Actual 1142/2807 [' True', [' True']]
Predict vs Actual 1143/2807 [' True', [' True']]
Predict vs Actual 1144/2807 [' True', [' True']]
Predict vs Actual 1145/2807 [' True', [' True']]
Predict vs Actual 1146/2807 [' False', [' True']]
Predict vs Actual 1147/2807 [' False', [' False']]
Predi

Predict vs Actual 1296/2807 [' False', [' True']]
Predict vs Actual 1297/2807 [' False', [' True']]
Predict vs Actual 1298/2807 [' False', [' False']]
Predict vs Actual 1299/2807 [' True', [' False']]
Predict vs Actual 1300/2807 [' True', [' True']]
Predict vs Actual 1301/2807 [' False', [' False']]
Predict vs Actual 1302/2807 [' False', [' True']]
Predict vs Actual 1303/2807 [' True', [' False']]
Predict vs Actual 1304/2807 [' False', [' True']]
Predict vs Actual 1305/2807 [' True', [' True']]
Predict vs Actual 1306/2807 [' True', [' True']]
Predict vs Actual 1307/2807 [' True', [' False']]
Predict vs Actual 1308/2807 [' True', [' True']]
Predict vs Actual 1309/2807 [' True', [' True']]
Predict vs Actual 1310/2807 [' False', [' True']]
Predict vs Actual 1311/2807 [' True', [' True']]
Predict vs Actual 1312/2807 [' False', [' True']]
Predict vs Actual 1313/2807 [' True', [' True']]
Predict vs Actual 1314/2807 [' True', [' False']]
Predict vs Actual 1315/2807 [' False', [' False']]
Pred

Predict vs Actual 1464/2807 [' True', [' True']]
Predict vs Actual 1465/2807 [' True', [' False']]
Predict vs Actual 1466/2807 [' True', [' True']]
Predict vs Actual 1467/2807 [' True', [' False']]
Predict vs Actual 1468/2807 [' True', [' False']]
Predict vs Actual 1469/2807 [' True', [' True']]
Predict vs Actual 1470/2807 [' True', [' False']]
Predict vs Actual 1471/2807 [' True', [' True']]
Predict vs Actual 1472/2807 [' True', [' True']]
Predict vs Actual 1473/2807 [' False', [' True']]
Predict vs Actual 1474/2807 [' False', [' True']]
Predict vs Actual 1475/2807 [' False', [' True']]
Predict vs Actual 1476/2807 [' False', [' False']]
Predict vs Actual 1477/2807 [' True', [' True']]
Predict vs Actual 1478/2807 [' False', [' True']]
Predict vs Actual 1479/2807 [' True', [' True']]
Predict vs Actual 1480/2807 [' False', [' True']]
Predict vs Actual 1481/2807 [' True', [' True']]
Predict vs Actual 1482/2807 [' True', [' True']]
Predict vs Actual 1483/2807 [' False', [' True']]
Predict 

Predict vs Actual 1632/2807 [' False', [' True']]
Predict vs Actual 1633/2807 [' True', [' True']]
Predict vs Actual 1634/2807 [' True', [' True']]
Predict vs Actual 1635/2807 [' True', [' True']]
Predict vs Actual 1636/2807 [' False', [' True']]
Predict vs Actual 1637/2807 [' True', [' True']]
Predict vs Actual 1638/2807 [' True', [' True']]
Predict vs Actual 1639/2807 [' True', [' True']]
Predict vs Actual 1640/2807 [' True', [' True']]
Predict vs Actual 1641/2807 [' False', [' True']]
Predict vs Actual 1642/2807 [' True', [' False']]
Predict vs Actual 1643/2807 [' False', [' False']]
Predict vs Actual 1644/2807 [' True', [' True']]
Predict vs Actual 1645/2807 [' True', [' False']]
Predict vs Actual 1646/2807 [' True', [' False']]
Predict vs Actual 1647/2807 [' True', [' True']]
Predict vs Actual 1648/2807 [' True', [' True']]
Predict vs Actual 1649/2807 [' False', [' False']]
Predict vs Actual 1650/2807 [' True', [' True']]
Predict vs Actual 1651/2807 [' False', [' False']]
Predict 

Predict vs Actual 1800/2807 [' False', [' True']]
Predict vs Actual 1801/2807 [' True', [' True']]
Predict vs Actual 1802/2807 [' True', [' False']]
Predict vs Actual 1803/2807 [' True', [' True']]
Predict vs Actual 1804/2807 [' False', [' True']]
Predict vs Actual 1805/2807 [' True', [' True']]
Predict vs Actual 1806/2807 [' False', [' False']]
Predict vs Actual 1807/2807 [' True', [' True']]
Predict vs Actual 1808/2807 [' True', [' True']]
Predict vs Actual 1809/2807 [' False', [' True']]
Predict vs Actual 1810/2807 [' True', [' False']]
Predict vs Actual 1811/2807 [' True', [' False']]
Predict vs Actual 1812/2807 [' True', [' True']]
Predict vs Actual 1813/2807 [' False', [' True']]
Predict vs Actual 1814/2807 [' False', [' True']]
Predict vs Actual 1815/2807 [' True', [' False']]
Predict vs Actual 1816/2807 [' False', [' False']]
Predict vs Actual 1817/2807 [' True', [' False']]
Predict vs Actual 1818/2807 [' False', [' True']]
Predict vs Actual 1819/2807 [' True', [' True']]
Predi

Predict vs Actual 1968/2807 [' False', [' True']]
Predict vs Actual 1969/2807 [' True', [' True']]
Predict vs Actual 1970/2807 [' False', [' False']]
Predict vs Actual 1971/2807 [' True', [' True']]
Predict vs Actual 1972/2807 [' False', [' False']]
Predict vs Actual 1973/2807 [' True', [' True']]
Predict vs Actual 1974/2807 [' False', [' True']]
Predict vs Actual 1975/2807 [' True', [' True']]
Predict vs Actual 1976/2807 [' True', [' True']]
Predict vs Actual 1977/2807 [' True', [' True']]
Predict vs Actual 1978/2807 [' True', [' True']]
Predict vs Actual 1979/2807 [' True', [' True']]
Predict vs Actual 1980/2807 [' True', [' False']]
Predict vs Actual 1981/2807 [' False', [' True']]
Predict vs Actual 1982/2807 [' False', [' False']]
Predict vs Actual 1983/2807 [' True', [' False']]
Predict vs Actual 1984/2807 [' True', [' False']]
Predict vs Actual 1985/2807 [' True', [' True']]
Predict vs Actual 1986/2807 [' True', [' True']]
Predict vs Actual 1987/2807 [' False', [' False']]
Predic

Predict vs Actual 2136/2807 [' True', [' True']]
Predict vs Actual 2137/2807 [' True', [' True']]
Predict vs Actual 2138/2807 [' True', [' True']]
Predict vs Actual 2139/2807 [' False', [' True']]
Predict vs Actual 2140/2807 [' False', [' False']]
Predict vs Actual 2141/2807 [' True', [' True']]
Predict vs Actual 2142/2807 [' True', [' False']]
Predict vs Actual 2143/2807 [' False', [' False']]
Predict vs Actual 2144/2807 [' False', [' True']]
Predict vs Actual 2145/2807 [' True', [' False']]
Predict vs Actual 2146/2807 [' True', [' False']]
Predict vs Actual 2147/2807 [' True', [' True']]
Predict vs Actual 2148/2807 [' False', [' False']]
Predict vs Actual 2149/2807 [' True', [' True']]
Predict vs Actual 2150/2807 [' True', [' True']]
Predict vs Actual 2151/2807 [' True', [' False']]
Predict vs Actual 2152/2807 [' False', [' True']]
Predict vs Actual 2153/2807 [' True', [' False']]
Predict vs Actual 2154/2807 [' False', [' False']]
Predict vs Actual 2155/2807 [' True', [' False']]
Pre

Predict vs Actual 2304/2807 [' True', [' False']]
Predict vs Actual 2305/2807 [' True', [' False']]
Predict vs Actual 2306/2807 [' True', [' True']]
Predict vs Actual 2307/2807 [' False', [' False']]
Predict vs Actual 2308/2807 [' True', [' False']]
Predict vs Actual 2309/2807 [' False', [' True']]
Predict vs Actual 2310/2807 [' True', [' False']]
Predict vs Actual 2311/2807 [' True', [' True']]
Predict vs Actual 2312/2807 [' False', [' True']]
Predict vs Actual 2313/2807 [' False', [' False']]
Predict vs Actual 2314/2807 [' True', [' False']]
Predict vs Actual 2315/2807 [' False', [' True']]
Predict vs Actual 2316/2807 [' True', [' True']]
Predict vs Actual 2317/2807 [' False', [' False']]
Predict vs Actual 2318/2807 [' True', [' False']]
Predict vs Actual 2319/2807 [' True', [' False']]
Predict vs Actual 2320/2807 [' True', [' False']]
Predict vs Actual 2321/2807 [' True', [' False']]
Predict vs Actual 2322/2807 [' True', [' True']]
Predict vs Actual 2323/2807 [' True', [' True']]
Pr

Predict vs Actual 2472/2807 [' False', [' True']]
Predict vs Actual 2473/2807 [' True', [' True']]
Predict vs Actual 2474/2807 [' False', [' True']]
Predict vs Actual 2475/2807 [' True', [' True']]
Predict vs Actual 2476/2807 [' True', [' False']]
Predict vs Actual 2477/2807 [' False', [' True']]
Predict vs Actual 2478/2807 [' True', [' True']]
Predict vs Actual 2479/2807 [' True', [' True']]
Predict vs Actual 2480/2807 [' True', [' True']]
Predict vs Actual 2481/2807 [' True', [' False']]
Predict vs Actual 2482/2807 [' True', [' False']]
Predict vs Actual 2483/2807 [' True', [' True']]
Predict vs Actual 2484/2807 [' True', [' True']]
Predict vs Actual 2485/2807 [' True', [' True']]
Predict vs Actual 2486/2807 [' True', [' True']]
Predict vs Actual 2487/2807 [' False', [' False']]
Predict vs Actual 2488/2807 [' True', [' False']]
Predict vs Actual 2489/2807 [' True', [' True']]
Predict vs Actual 2490/2807 [' True', [' False']]
Predict vs Actual 2491/2807 [' True', [' False']]
Predict v

Predict vs Actual 2640/2807 [' True', [' True']]
Predict vs Actual 2641/2807 [' True', [' True']]
Predict vs Actual 2642/2807 [' False', [' False']]
Predict vs Actual 2643/2807 [' True', [' False']]
Predict vs Actual 2644/2807 [' True', [' True']]
Predict vs Actual 2645/2807 [' False', [' True']]
Predict vs Actual 2646/2807 [' True', [' True']]
Predict vs Actual 2647/2807 [' True', [' True']]
Predict vs Actual 2648/2807 [' True', [' False']]
Predict vs Actual 2649/2807 [' True', [' False']]
Predict vs Actual 2650/2807 [' True', [' False']]
Predict vs Actual 2651/2807 [' True', [' False']]
Predict vs Actual 2652/2807 [' True', [' False']]
Predict vs Actual 2653/2807 [' True', [' True']]
Predict vs Actual 2654/2807 [' False', [' False']]
Predict vs Actual 2655/2807 [' True', [' True']]
Predict vs Actual 2656/2807 [' False', [' True']]
Predict vs Actual 2657/2807 [' True', [' True']]
Predict vs Actual 2658/2807 [' True', [' True']]
Predict vs Actual 2659/2807 [' True', [' True']]
Predict 

{'scifact': OrderedDict([('em', 0.0), ('nf1', 0.0), ('nem', 0.0)]), 'boolq': OrderedDict([('em', 54.82721767011044), ('nf1', 54.82721767011044), ('nem', 54.82721767011044)]), 'movie': None}
start to test { task: boolq (load) movie (eval)}
len of test dataset: 200
Predict vs Actual 0/200 [' True', [' NEG']]
Predict vs Actual 1/200 [' True', [' POS']]
Predict vs Actual 2/200 [' True', [' NEG']]
Predict vs Actual 3/200 [' False', [' POS']]
Predict vs Actual 4/200 [' True', [' NEG']]
Predict vs Actual 5/200 [' True', [' POS']]
Predict vs Actual 6/200 [' False', [' NEG']]
Predict vs Actual 7/200 [' True', [' NEG']]
Predict vs Actual 8/200 [' False', [' NEG']]
Predict vs Actual 9/200 [' False', [' POS']]
Predict vs Actual 10/200 [' True', [' NEG']]
Predict vs Actual 11/200 [' True', [' NEG']]
Predict vs Actual 12/200 [' True', [' NEG']]
Predict vs Actual 13/200 [' True', [' NEG']]
Predict vs Actual 14/200 [' True', [' POS']]
Predict vs Actual 15/200 [' False', [' POS']]
Predict vs Actual 16/

Predict vs Actual 176/200 [' True', [' POS']]
Predict vs Actual 177/200 [' True', [' POS']]
Predict vs Actual 178/200 [' True', [' POS']]
Predict vs Actual 179/200 [' True', [' POS']]
Predict vs Actual 180/200 [' True', [' POS']]
Predict vs Actual 181/200 [' True', [' POS']]
Predict vs Actual 182/200 [' True', [' POS']]
Predict vs Actual 183/200 [' False', [' POS']]
Predict vs Actual 184/200 [' False', [' POS']]
Predict vs Actual 185/200 [' True', [' POS']]
Predict vs Actual 186/200 [' True', [' POS']]
Predict vs Actual 187/200 [' True', [' POS']]
Predict vs Actual 188/200 [' True', [' POS']]
Predict vs Actual 189/200 [' False', [' POS']]
Predict vs Actual 190/200 [' False', [' POS']]
Predict vs Actual 191/200 [' False', [' POS']]
Predict vs Actual 192/200 [' True', [' POS']]
Predict vs Actual 193/200 [' False', [' POS']]
Predict vs Actual 194/200 [' True', [' POS']]
Predict vs Actual 195/200 [' False', [' POS']]
Predict vs Actual 196/200 [' False', [' POS']]
Predict vs Actual 197/200 

Predict vs Actual 140/188 [' POS', [' REFUTES']]
Predict vs Actual 141/188 [' POS', [' SUPPORTS']]
Predict vs Actual 142/188 [' POS', [' SUPPORTS']]
Predict vs Actual 143/188 [' POS', [' SUPPORTS']]
Predict vs Actual 144/188 [' POS', [' REFUTES']]
Predict vs Actual 145/188 [' POS', [' REFUTES']]
Predict vs Actual 146/188 [' POS', [' SUPPORTS']]
Predict vs Actual 147/188 [' POS', [' SUPPORTS']]
Predict vs Actual 148/188 [' POS', [' SUPPORTS']]
Predict vs Actual 149/188 [' POS', [' SUPPORTS']]
Predict vs Actual 150/188 [' POS', [' REFUTES']]
Predict vs Actual 151/188 [' POS', [' REFUTES']]
Predict vs Actual 152/188 [' POS', [' SUPPORTS']]
Predict vs Actual 153/188 [' POS', [' SUPPORTS']]
Predict vs Actual 154/188 [' POS', [' REFUTES']]
Predict vs Actual 155/188 [' POS', [' SUPPORTS']]
Predict vs Actual 156/188 [' POS', [' SUPPORTS']]
Predict vs Actual 157/188 [' POS', [' REFUTES']]
Predict vs Actual 158/188 [' POS', [' SUPPORTS']]
Predict vs Actual 159/188 [' POS', [' SUPPORTS']]
Predict

Predict vs Actual 124/2807 [' POS', [' True']]
Predict vs Actual 125/2807 [' POS', [' True']]
Predict vs Actual 126/2807 [' POS', [' True']]
Predict vs Actual 127/2807 [' POS', [' False']]
Predict vs Actual 128/2807 [' POS', [' True']]
Predict vs Actual 129/2807 [' POS', [' False']]
Predict vs Actual 130/2807 [' POS', [' True']]
Predict vs Actual 131/2807 [' POS', [' True']]
Predict vs Actual 132/2807 [' POS', [' True']]
Predict vs Actual 133/2807 [' POS', [' True']]
Predict vs Actual 134/2807 [' POS', [' False']]
Predict vs Actual 135/2807 [' POS', [' False']]
Predict vs Actual 136/2807 [' POS', [' True']]
Predict vs Actual 137/2807 [' POS', [' True']]
Predict vs Actual 138/2807 [' POS', [' True']]
Predict vs Actual 139/2807 [' POS', [' True']]
Predict vs Actual 140/2807 [' POS', [' True']]
Predict vs Actual 141/2807 [' POS', [' True']]
Predict vs Actual 142/2807 [' POS', [' False']]
Predict vs Actual 143/2807 [' POS', [' True']]
Predict vs Actual 144/2807 [' POS', [' False']]
Predict

Predict vs Actual 300/2807 [' POS', [' False']]
Predict vs Actual 301/2807 [' POS', [' False']]
Predict vs Actual 302/2807 [' POS', [' False']]
Predict vs Actual 303/2807 [' POS', [' False']]
Predict vs Actual 304/2807 [' POS', [' False']]
Predict vs Actual 305/2807 [' POS', [' False']]
Predict vs Actual 306/2807 [' POS', [' True']]
Predict vs Actual 307/2807 [' POS', [' True']]
Predict vs Actual 308/2807 [' POS', [' False']]
Predict vs Actual 309/2807 [' POS', [' True']]
Predict vs Actual 310/2807 [' POS', [' True']]
Predict vs Actual 311/2807 [' POS', [' True']]
Predict vs Actual 312/2807 [' POS', [' True']]
Predict vs Actual 313/2807 [' POS', [' True']]
Predict vs Actual 314/2807 [' POS', [' True']]
Predict vs Actual 315/2807 [' POS', [' True']]
Predict vs Actual 316/2807 [' POS', [' False']]
Predict vs Actual 317/2807 [' POS', [' False']]
Predict vs Actual 318/2807 [' POS', [' True']]
Predict vs Actual 319/2807 [' POS', [' False']]
Predict vs Actual 320/2807 [' POS', [' False']]
Pr

Predict vs Actual 476/2807 [' POS', [' True']]
Predict vs Actual 477/2807 [' POS', [' True']]
Predict vs Actual 478/2807 [' POS', [' True']]
Predict vs Actual 479/2807 [' POS', [' True']]
Predict vs Actual 480/2807 [' POS', [' False']]
Predict vs Actual 481/2807 [' POS', [' True']]
Predict vs Actual 482/2807 [' POS', [' True']]
Predict vs Actual 483/2807 [' POS', [' True']]
Predict vs Actual 484/2807 [' POS', [' False']]
Predict vs Actual 485/2807 [' POS', [' False']]
Predict vs Actual 486/2807 [' POS', [' True']]
Predict vs Actual 487/2807 [' POS', [' True']]
Predict vs Actual 488/2807 [' POS', [' False']]
Predict vs Actual 489/2807 [' POS', [' True']]
Predict vs Actual 490/2807 [' POS', [' False']]
Predict vs Actual 491/2807 [' POS', [' False']]
Predict vs Actual 492/2807 [' POS', [' True']]
Predict vs Actual 493/2807 [' POS', [' True']]
Predict vs Actual 494/2807 [' POS', [' True']]
Predict vs Actual 495/2807 [' POS', [' True']]
Predict vs Actual 496/2807 [' POS', [' False']]
Predic

Predict vs Actual 652/2807 [' POS', [' True']]
Predict vs Actual 653/2807 [' POS', [' False']]
Predict vs Actual 654/2807 [' POS', [' True']]
Predict vs Actual 655/2807 [' POS', [' True']]
Predict vs Actual 656/2807 [' POS', [' True']]
Predict vs Actual 657/2807 [' POS', [' False']]
Predict vs Actual 658/2807 [' POS', [' True']]
Predict vs Actual 659/2807 [' POS', [' True']]
Predict vs Actual 660/2807 [' POS', [' False']]
Predict vs Actual 661/2807 [' POS', [' False']]
Predict vs Actual 662/2807 [' POS', [' True']]
Predict vs Actual 663/2807 [' POS', [' False']]
Predict vs Actual 664/2807 [' POS', [' False']]
Predict vs Actual 665/2807 [' POS', [' True']]
Predict vs Actual 666/2807 [' POS', [' True']]
Predict vs Actual 667/2807 [' POS', [' False']]
Predict vs Actual 668/2807 [' POS', [' True']]
Predict vs Actual 669/2807 [' POS', [' True']]
Predict vs Actual 670/2807 [' POS', [' False']]
Predict vs Actual 671/2807 [' POS', [' False']]
Predict vs Actual 672/2807 [' POS', [' False']]
Pre

Predict vs Actual 828/2807 [' POS', [' False']]
Predict vs Actual 829/2807 [' POS', [' True']]
Predict vs Actual 830/2807 [' POS', [' True']]
Predict vs Actual 831/2807 [' POS', [' True']]
Predict vs Actual 832/2807 [' POS', [' True']]
Predict vs Actual 833/2807 [' POS', [' True']]
Predict vs Actual 834/2807 [' POS', [' False']]
Predict vs Actual 835/2807 [' POS', [' True']]
Predict vs Actual 836/2807 [' POS', [' False']]
Predict vs Actual 837/2807 [' POS', [' False']]
Predict vs Actual 838/2807 [' POS', [' False']]
Predict vs Actual 839/2807 [' POS', [' False']]
Predict vs Actual 840/2807 [' POS', [' True']]
Predict vs Actual 841/2807 [' POS', [' True']]
Predict vs Actual 842/2807 [' POS', [' False']]
Predict vs Actual 843/2807 [' POS', [' True']]
Predict vs Actual 844/2807 [' POS', [' False']]
Predict vs Actual 845/2807 [' POS', [' False']]
Predict vs Actual 846/2807 [' POS', [' True']]
Predict vs Actual 847/2807 [' POS', [' True']]
Predict vs Actual 848/2807 [' POS', [' False']]
Pre

Predict vs Actual 1004/2807 [' POS', [' True']]
Predict vs Actual 1005/2807 [' POS', [' False']]
Predict vs Actual 1006/2807 [' POS', [' True']]
Predict vs Actual 1007/2807 [' POS', [' True']]
Predict vs Actual 1008/2807 [' POS', [' True']]
Predict vs Actual 1009/2807 [' POS', [' True']]
Predict vs Actual 1010/2807 [' POS', [' True']]
Predict vs Actual 1011/2807 [' POS', [' True']]
Predict vs Actual 1012/2807 [' POS', [' False']]
Predict vs Actual 1013/2807 [' POS', [' True']]
Predict vs Actual 1014/2807 [' POS', [' True']]
Predict vs Actual 1015/2807 [' POS', [' False']]
Predict vs Actual 1016/2807 [' POS', [' True']]
Predict vs Actual 1017/2807 [' POS', [' False']]
Predict vs Actual 1018/2807 [' POS', [' False']]
Predict vs Actual 1019/2807 [' POS', [' True']]
Predict vs Actual 1020/2807 [' POS', [' True']]
Predict vs Actual 1021/2807 [' POS', [' True']]
Predict vs Actual 1022/2807 [' POS', [' True']]
Predict vs Actual 1023/2807 [' POS', [' True']]
Predict vs Actual 1024/2807 [' POS'

Predict vs Actual 1176/2807 [' POS', [' False']]
Predict vs Actual 1177/2807 [' POS', [' True']]
Predict vs Actual 1178/2807 [' POS', [' True']]
Predict vs Actual 1179/2807 [' POS', [' True']]
Predict vs Actual 1180/2807 [' POS', [' True']]
Predict vs Actual 1181/2807 [' POS', [' True']]
Predict vs Actual 1182/2807 [' POS', [' True']]
Predict vs Actual 1183/2807 [' POS', [' True']]
Predict vs Actual 1184/2807 [' POS', [' True']]
Predict vs Actual 1185/2807 [' POS', [' False']]
Predict vs Actual 1186/2807 [' POS', [' True']]
Predict vs Actual 1187/2807 [' POS', [' False']]
Predict vs Actual 1188/2807 [' POS', [' False']]
Predict vs Actual 1189/2807 [' POS', [' True']]
Predict vs Actual 1190/2807 [' POS', [' False']]
Predict vs Actual 1191/2807 [' POS', [' True']]
Predict vs Actual 1192/2807 [' POS', [' True']]
Predict vs Actual 1193/2807 [' POS', [' True']]
Predict vs Actual 1194/2807 [' POS', [' False']]
Predict vs Actual 1195/2807 [' POS', [' True']]
Predict vs Actual 1196/2807 [' POS

Predict vs Actual 1348/2807 [' POS', [' True']]
Predict vs Actual 1349/2807 [' POS', [' False']]
Predict vs Actual 1350/2807 [' POS', [' True']]
Predict vs Actual 1351/2807 [' POS', [' False']]
Predict vs Actual 1352/2807 [' POS', [' False']]
Predict vs Actual 1353/2807 [' POS', [' True']]
Predict vs Actual 1354/2807 [' POS', [' False']]
Predict vs Actual 1355/2807 [' POS', [' False']]
Predict vs Actual 1356/2807 [' POS', [' False']]
Predict vs Actual 1357/2807 [' POS', [' False']]
Predict vs Actual 1358/2807 [' POS', [' False']]
Predict vs Actual 1359/2807 [' POS', [' False']]
Predict vs Actual 1360/2807 [' POS', [' False']]
Predict vs Actual 1361/2807 [' POS', [' True']]
Predict vs Actual 1362/2807 [' POS', [' True']]
Predict vs Actual 1363/2807 [' POS', [' False']]
Predict vs Actual 1364/2807 [' POS', [' True']]
Predict vs Actual 1365/2807 [' POS', [' False']]
Predict vs Actual 1366/2807 [' POS', [' True']]
Predict vs Actual 1367/2807 [' POS', [' True']]
Predict vs Actual 1368/2807 

Predict vs Actual 1520/2807 [' POS', [' True']]
Predict vs Actual 1521/2807 [' POS', [' True']]
Predict vs Actual 1522/2807 [' POS', [' True']]
Predict vs Actual 1523/2807 [' POS', [' True']]
Predict vs Actual 1524/2807 [' POS', [' False']]
Predict vs Actual 1525/2807 [' POS', [' False']]
Predict vs Actual 1526/2807 [' POS', [' False']]
Predict vs Actual 1527/2807 [' POS', [' True']]
Predict vs Actual 1528/2807 [' POS', [' False']]
Predict vs Actual 1529/2807 [' POS', [' True']]
Predict vs Actual 1530/2807 [' POS', [' True']]
Predict vs Actual 1531/2807 [' POS', [' True']]
Predict vs Actual 1532/2807 [' POS', [' True']]
Predict vs Actual 1533/2807 [' POS', [' True']]
Predict vs Actual 1534/2807 [' POS', [' False']]
Predict vs Actual 1535/2807 [' POS', [' True']]
Predict vs Actual 1536/2807 [' POS', [' False']]
Predict vs Actual 1537/2807 [' POS', [' False']]
Predict vs Actual 1538/2807 [' POS', [' False']]
Predict vs Actual 1539/2807 [' POS', [' True']]
Predict vs Actual 1540/2807 [' P

Predict vs Actual 1692/2807 [' POS', [' False']]
Predict vs Actual 1693/2807 [' POS', [' False']]
Predict vs Actual 1694/2807 [' POS', [' False']]
Predict vs Actual 1695/2807 [' POS', [' True']]
Predict vs Actual 1696/2807 [' POS', [' False']]
Predict vs Actual 1697/2807 [' POS', [' False']]
Predict vs Actual 1698/2807 [' POS', [' True']]
Predict vs Actual 1699/2807 [' POS', [' True']]
Predict vs Actual 1700/2807 [' POS', [' False']]
Predict vs Actual 1701/2807 [' POS', [' True']]
Predict vs Actual 1702/2807 [' POS', [' True']]
Predict vs Actual 1703/2807 [' POS', [' True']]
Predict vs Actual 1704/2807 [' POS', [' True']]
Predict vs Actual 1705/2807 [' POS', [' True']]
Predict vs Actual 1706/2807 [' POS', [' True']]
Predict vs Actual 1707/2807 [' POS', [' True']]
Predict vs Actual 1708/2807 [' POS', [' True']]
Predict vs Actual 1709/2807 [' POS', [' True']]
Predict vs Actual 1710/2807 [' POS', [' True']]
Predict vs Actual 1711/2807 [' POS', [' True']]
Predict vs Actual 1712/2807 [' POS

Predict vs Actual 1864/2807 [' POS', [' True']]
Predict vs Actual 1865/2807 [' POS', [' True']]
Predict vs Actual 1866/2807 [' POS', [' False']]
Predict vs Actual 1867/2807 [' POS', [' False']]
Predict vs Actual 1868/2807 [' POS', [' True']]
Predict vs Actual 1869/2807 [' POS', [' True']]
Predict vs Actual 1870/2807 [' POS', [' False']]
Predict vs Actual 1871/2807 [' POS', [' True']]
Predict vs Actual 1872/2807 [' POS', [' False']]
Predict vs Actual 1873/2807 [' POS', [' False']]
Predict vs Actual 1874/2807 [' POS', [' False']]
Predict vs Actual 1875/2807 [' POS', [' True']]
Predict vs Actual 1876/2807 [' POS', [' True']]
Predict vs Actual 1877/2807 [' POS', [' False']]
Predict vs Actual 1878/2807 [' POS', [' True']]
Predict vs Actual 1879/2807 [' POS', [' True']]
Predict vs Actual 1880/2807 [' POS', [' False']]
Predict vs Actual 1881/2807 [' POS', [' True']]
Predict vs Actual 1882/2807 [' POS', [' False']]
Predict vs Actual 1883/2807 [' POS', [' True']]
Predict vs Actual 1884/2807 [' 

Predict vs Actual 2036/2807 [' POS', [' True']]
Predict vs Actual 2037/2807 [' POS', [' False']]
Predict vs Actual 2038/2807 [' POS', [' False']]
Predict vs Actual 2039/2807 [' POS', [' True']]
Predict vs Actual 2040/2807 [' POS', [' False']]
Predict vs Actual 2041/2807 [' POS', [' True']]
Predict vs Actual 2042/2807 [' POS', [' False']]
Predict vs Actual 2043/2807 [' POS', [' False']]
Predict vs Actual 2044/2807 [' POS', [' False']]
Predict vs Actual 2045/2807 [' POS', [' True']]
Predict vs Actual 2046/2807 [' POS', [' True']]
Predict vs Actual 2047/2807 [' POS', [' False']]
Predict vs Actual 2048/2807 [' POS', [' True']]
Predict vs Actual 2049/2807 [' POS', [' True']]
Predict vs Actual 2050/2807 [' POS', [' False']]
Predict vs Actual 2051/2807 [' POS', [' True']]
Predict vs Actual 2052/2807 [' POS', [' False']]
Predict vs Actual 2053/2807 [' POS', [' True']]
Predict vs Actual 2054/2807 [' POS', [' False']]
Predict vs Actual 2055/2807 [' POS', [' True']]
Predict vs Actual 2056/2807 ['

Predict vs Actual 2208/2807 [' POS', [' True']]
Predict vs Actual 2209/2807 [' POS', [' True']]
Predict vs Actual 2210/2807 [' POS', [' True']]
Predict vs Actual 2211/2807 [' POS', [' False']]
Predict vs Actual 2212/2807 [' POS', [' True']]
Predict vs Actual 2213/2807 [' POS', [' False']]
Predict vs Actual 2214/2807 [' POS', [' False']]
Predict vs Actual 2215/2807 [' POS', [' True']]
Predict vs Actual 2216/2807 [' POS', [' False']]
Predict vs Actual 2217/2807 [' POS', [' False']]
Predict vs Actual 2218/2807 [' POS', [' False']]
Predict vs Actual 2219/2807 [' POS', [' False']]
Predict vs Actual 2220/2807 [' POS', [' True']]
Predict vs Actual 2221/2807 [' POS', [' False']]
Predict vs Actual 2222/2807 [' POS', [' False']]
Predict vs Actual 2223/2807 [' POS', [' False']]
Predict vs Actual 2224/2807 [' POS', [' False']]
Predict vs Actual 2225/2807 [' POS', [' True']]
Predict vs Actual 2226/2807 [' POS', [' False']]
Predict vs Actual 2227/2807 [' POS', [' True']]
Predict vs Actual 2228/2807 

Predict vs Actual 2380/2807 [' POS', [' True']]
Predict vs Actual 2381/2807 [' POS', [' False']]
Predict vs Actual 2382/2807 [' POS', [' True']]
Predict vs Actual 2383/2807 [' POS', [' True']]
Predict vs Actual 2384/2807 [' POS', [' True']]
Predict vs Actual 2385/2807 [' POS', [' False']]
Predict vs Actual 2386/2807 [' POS', [' False']]
Predict vs Actual 2387/2807 [' POS', [' False']]
Predict vs Actual 2388/2807 [' POS', [' True']]
Predict vs Actual 2389/2807 [' POS', [' False']]
Predict vs Actual 2390/2807 [' POS', [' True']]
Predict vs Actual 2391/2807 [' POS', [' False']]
Predict vs Actual 2392/2807 [' POS', [' False']]
Predict vs Actual 2393/2807 [' POS', [' True']]
Predict vs Actual 2394/2807 [' POS', [' True']]
Predict vs Actual 2395/2807 [' POS', [' True']]
Predict vs Actual 2396/2807 [' POS', [' True']]
Predict vs Actual 2397/2807 [' POS', [' True']]
Predict vs Actual 2398/2807 [' POS', [' False']]
Predict vs Actual 2399/2807 [' POS', [' False']]
Predict vs Actual 2400/2807 [' 

Predict vs Actual 2552/2807 [' POS', [' False']]
Predict vs Actual 2553/2807 [' POS', [' False']]
Predict vs Actual 2554/2807 [' POS', [' False']]
Predict vs Actual 2555/2807 [' POS', [' True']]
Predict vs Actual 2556/2807 [' POS', [' True']]
Predict vs Actual 2557/2807 [' POS', [' True']]
Predict vs Actual 2558/2807 [' POS', [' False']]
Predict vs Actual 2559/2807 [' POS', [' True']]
Predict vs Actual 2560/2807 [' POS', [' False']]
Predict vs Actual 2561/2807 [' POS', [' True']]
Predict vs Actual 2562/2807 [' POS', [' True']]
Predict vs Actual 2563/2807 [' POS', [' True']]
Predict vs Actual 2564/2807 [' POS', [' True']]
Predict vs Actual 2565/2807 [' POS', [' True']]
Predict vs Actual 2566/2807 [' POS', [' True']]
Predict vs Actual 2567/2807 [' POS', [' False']]
Predict vs Actual 2568/2807 [' POS', [' False']]
Predict vs Actual 2569/2807 [' POS', [' False']]
Predict vs Actual 2570/2807 [' POS', [' True']]
Predict vs Actual 2571/2807 [' POS', [' True']]
Predict vs Actual 2572/2807 [' P

Predict vs Actual 2724/2807 [' POS', [' False']]
Predict vs Actual 2725/2807 [' POS', [' False']]
Predict vs Actual 2726/2807 [' POS', [' True']]
Predict vs Actual 2727/2807 [' POS', [' False']]
Predict vs Actual 2728/2807 [' POS', [' True']]
Predict vs Actual 2729/2807 [' POS', [' False']]
Predict vs Actual 2730/2807 [' POS', [' True']]
Predict vs Actual 2731/2807 [' POS', [' True']]
Predict vs Actual 2732/2807 [' POS', [' True']]
Predict vs Actual 2733/2807 [' POS', [' True']]
Predict vs Actual 2734/2807 [' POS', [' False']]
Predict vs Actual 2735/2807 [' POS', [' True']]
Predict vs Actual 2736/2807 [' POS', [' False']]
Predict vs Actual 2737/2807 [' POS', [' False']]
Predict vs Actual 2738/2807 [' POS', [' False']]
Predict vs Actual 2739/2807 [' POS', [' True']]
Predict vs Actual 2740/2807 [' POS', [' True']]
Predict vs Actual 2741/2807 [' POS', [' True']]
Predict vs Actual 2742/2807 [' POS', [' True']]
Predict vs Actual 2743/2807 [' POS', [' True']]
Predict vs Actual 2744/2807 [' P

Predict vs Actual 92/200 [' POS', [' NEG']]
Predict vs Actual 93/200 [' POS', [' POS']]
Predict vs Actual 94/200 [' POS', [' NEG']]
Predict vs Actual 95/200 [' POS', [' NEG']]
Predict vs Actual 96/200 [' POS', [' NEG']]
Predict vs Actual 97/200 [' POS', [' POS']]
Predict vs Actual 98/200 [' POS', [' POS']]
Predict vs Actual 99/200 [' POS', [' POS']]
Predict vs Actual 100/200 [' POS', [' POS']]
Predict vs Actual 101/200 [' POS', [' POS']]
Predict vs Actual 102/200 [' POS', [' NEG']]
Predict vs Actual 103/200 [' POS', [' NEG']]
Predict vs Actual 104/200 [' POS', [' POS']]
Predict vs Actual 105/200 [' POS', [' NEG']]
Predict vs Actual 106/200 [' POS', [' POS']]
Predict vs Actual 107/200 [' POS', [' NEG']]
Predict vs Actual 108/200 [' POS', [' POS']]
Predict vs Actual 109/200 [' POS', [' NEG']]
Predict vs Actual 110/200 [' POS', [' NEG']]
Predict vs Actual 111/200 [' POS', [' NEG']]
Predict vs Actual 112/200 [' POS', [' POS']]
Predict vs Actual 113/200 [' POS', [' POS']]
Predict vs Actual 