In [1]:
import spacy
import json

import os
import openai

from tqdm import tqdm
import time

import numpy as np
np.random.seed(0)

openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the dev split; the context manager closes the file handle as soon as parsing is done.
with open("../data/individual_dev_end2end_final.json") as dev_file:
    dev = json.load(dev_file)

# The spaCy pipeline is loaded before the helper that uses it, so the cell reads top-down.
nlp = spacy.load("en_core_web_sm")


def get_lemma(x):
    """Return `x` with each spaCy token replaced by its lemma, joined by single spaces."""
    return " ".join([token.lemma_ for token in nlp(x)])


# Words that parse_question looks for in a question.
relation_trigger = ['before', 'after', 'during', 'while']

# Fixed question strings; later cells test `question in warmup_qs` to route an item.
warmup_qs = ['What will happen in the future?', 'What event has already finished?', 
             'What event has begun but has not finished?', 'What is happening now?',
             'What event has already happened?', 'What event has started?', 
            ]

def parse_question(q, event_lemmas):
    """
    Pull the event mentions, the second word and the temporal trigger out of a question.

    input:
        q: question string.
        event_lemmas: collection of lemmatized event strings to look for.
    output:
        q_events: lemmas of the question's spaCy tokens that occur in `event_lemmas`,
            in question order (repeats are kept).
        second_prefix: the second whitespace-separated word of `q`, or "" when the
            question has fewer than two words.
        base_temp_rel: the first whitespace-separated word of `q` that is listed in
            `relation_trigger`, or "" when there is none.
    """
    # acquire the events in the question stem
    q_events = [e for e in [token.lemma_ for token in nlp(q)] if e in event_lemmas]

    words = q.split()

    # Guarded: indexing words[1] directly raises IndexError on a one-word or empty question.
    second_prefix = words[1] if len(words) > 1 else ""

    # Matching is exact and case-sensitive on whitespace-split words,
    # so a word such as "before?" does not count as a trigger.
    base_temp_rel = next((t for t in words if t in relation_trigger), "")

    return q_events, second_prefix, base_temp_rel

# ICL: in-context learning with fixed exemplars

In [2]:
# Load the training split; the context manager closes the file handle after parsing.
with open("../data/train_end2end_final.json") as train_file:
    train = json.load(train_file)

In [22]:
# Keys of the two training items that are turned into in-context exemplars.
icl_keys = ['docid_AFP_ENG_19970418.0574_sentid_0_32VNZTT0A8TZERDTC9UMX5RMOW14RH_0',
            'docid_XIN_ENG_20061130.0405_sentid_0_31N2WW6R9SFHT5PGL0P96BLX90ZF33_8']

In [29]:
# Build one solved prompt per exemplar key: question, candidate list, passage, gold answer.
exm_prompts = []
for key in icl_keys:
    item = train[key]
    context = " ".join(item['context'])
    question = item['question']
    # Context tokens whose "types" flag is truthy form the candidate list shown in the prompt.
    all_events = [tok for tok, flag in zip(item['context'], item["answers"]["types"]) if flag]
    # Context tokens whose "labels" flag is truthy form the answer line.
    ground = [tok for tok, flag in zip(item['context'], item["answers"]["labels"]) if flag]
    prompt = (
        f"Q: {question}, select none or several from {{{', '.join(all_events)}}} \n"
        f"{context}\nA: {', '.join(ground)}"
    )
    exm_prompts.append(prompt)
print("\n\n".join(exm_prompts))


Q: What event has already finished?, select none or several from {appointed, are, winner} 
Pope John Paul II on Friday appointed two Chinese scientists -- one from Taiwan , the other from the mainland -- to the Pontifical Academy of Sciences . The two are Chin Ningyang , 74 , from Hefei in China , the 1957 Nobel Physics Prize winner and son of a mathematics professor at Beijing university .
A: appointed, winner

Q: What will happen after Mora is named Attorney General?, select none or several from {named, assist, start, named} 
Mexico 's president-elect Felipe Calderon , of the ruling National Action Party ( PAN ) , named on Thursday the last four cabinet ministers who will assist him in his six-year term to start on Dec. 1 . Calderon named Eduardo Medina Mora as Attorney General , Genaro Garcia Luna as Public Security Minister , Guillermo Galvan Galvan as Minister of National Defense and Mariano Francisco Sainez Mendoza as Navy Minister .
A: assist, start


In [168]:
# Inspect the first exemplar prompt as a raw string.
exm_prompts[0]

'Q: What event has already finished?, select none or several from {appointed, are, winner} \nPope John Paul II on Friday appointed two Chinese scientists -- one from Taiwan , the other from the mainland -- to the Pontifical Academy of Sciences . The two are Chin Ningyang , 74 , from Hefei in China , the 1957 Nobel Physics Prize winner and son of a mathematics professor at Beijing university .\nA: appointed, winner'

In [171]:
# Inspect the second exemplar prompt as a raw string.
exm_prompts[1]

"Q: What will happen after Mora is named Attorney General?, select none or several from {named, assist, start, named} \nMexico 's president-elect Felipe Calderon , of the ruling National Action Party ( PAN ) , named on Thursday the last four cabinet ministers who will assist him in his six-year term to start on Dec. 1 . Calderon named Eduardo Medina Mora as Attorney General , Genaro Garcia Luna as Public Security Minister , Guillermo Galvan Galvan as Minister of National Defense and Mariano Francisco Sainez Mendoza as Navy Minister .\nA: assist, start"

In [170]:
# One-shot pass over the whole dev set: only the first exemplar precedes each query.
icl_results_0 = {}
for key in tqdm(dev):
    item = dev[key]
    context = " ".join(item['context'])
    question = item['question']
    all_events = [c for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c for c, t in zip(item['context'], item["answers"]["labels"]) if t]
    prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA:"

    while True:
        try:
            icl_results_0[key] = openai.Completion.create(
                      model="text-davinci-003",
                      # exm_prompts[0] is already a single string. Calling "\n\n".join() on it
                      # iterates its characters and puts a blank line between each of them.
                      prompt=exm_prompts[0] + "\n\n" + prompt,
                      max_tokens=256,
                      temperature=0
            )["choices"][0]["text"].strip()
            break
        except Exception:
            # Wait and retry on API failures. `Exception` rather than a bare `except`
            # lets KeyboardInterrupt and SystemExit stop the loop.
            time.sleep(10)

    time.sleep(1.5)

  0%|▍                                                                                                                   | 5/1483 [00:11<57:51,  2.35s/it]


KeyboardInterrupt: 

# Test GPT-3 with the few-shot prompt on the dev set

In [43]:
# Draw a fixed 200-question sample of dev keys; the remaining keys form the second batch.
np.random.seed(0)
sampled_keys = np.random.choice(list(dev.keys()), 200, replace=False)
# Keep the remainder in dev's own order. A set difference iterates in an order that
# changes between kernel sessions, which makes interrupted runs hard to resume or compare.
_sampled_key_set = set(sampled_keys)
the_rest_keys = [k for k in dev if k not in _sampled_key_set]

In [44]:
# Few-shot pass over the sampled dev questions: every exemplar is prepended to each query.
icl_results = {}
for key in tqdm(sampled_keys):
    item = dev[key]
    context = " ".join(item['context'])
    question = item['question']
    all_events = [tok for tok, flag in zip(item['context'], item["answers"]["types"]) if flag]
    ground = [tok for tok, flag in zip(item['context'], item["answers"]["labels"]) if flag]
    prompt = f"Q: {question}, select none or several from {{{', '.join(all_events)}}} \n{context}\nA:"

    # Keep only the stripped text of the first returned choice.
    icl_results[key] = openai.Completion.create(
        model="text-davinci-003",
        prompt="\n\n".join(exm_prompts) + "\n\n" + prompt,
        max_tokens=256,
        temperature=0,
    )["choices"][0]["text"].strip()

    # Pause between requests.
    time.sleep(2)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [09:26<00:00,  2.83s/it]


In [55]:
# Few-shot pass over the remaining dev questions; keys already answered are skipped,
# so the cell can be re-run after an interruption.
for key in tqdm(the_rest_keys):
    if key in icl_results:
        continue
    item = dev[key]
    context = " ".join(item['context'])
    question = item['question']
    all_events = [tok for tok, flag in zip(item['context'], item["answers"]["types"]) if flag]
    ground = [tok for tok, flag in zip(item['context'], item["answers"]["labels"]) if flag]
    prompt = f"Q: {question}, select none or several from {{{', '.join(all_events)}}} \n{context}\nA:"

    # Keep only the stripped text of the first returned choice.
    icl_results[key] = openai.Completion.create(
        model="text-davinci-003",
        prompt="\n\n".join(exm_prompts) + "\n\n" + prompt,
        max_tokens=256,
        temperature=0,
    )["choices"][0]["text"].strip()

    # Pause between requests.
    time.sleep(2)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1283/1283 [26:50<00:00,  1.26s/it]


In [54]:
# Scratch cell: evaluates the literal 1 and assigns nothing.
1

1

In [46]:
from eval_func_gpt3 import evaluate

In [155]:
# Score the few-shot predictions collected in icl_results against the dev set.
evaluate(icl_results, dev)

Total 1483 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4224
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.4562
Eval on the current eval positive class Macro F1 (Agg) is: 0.3760
Eval on the current eval exact match (Agg) ratio is: 0.0526
Eval on the current eval exact match ratio (Relaxed) is: 0.0762
Eval on 485 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0021
Eval on the current eval clustered EM (Relaxed) is: 0.0021
Eval on the current eval clusrered F1 (max>=0.8) is: 0.0165


In [158]:
# Score the few-shot predictions separately on each subset named in existing_results.
# NOTE: existing_results is assigned in a cell further down this notebook, so that
# cell has to run before this one.
for cls in existing_results:
    print(cls)
    evaluate(
        {key: icl_results[key] for key in existing_results[cls]},
        {key: dev[key] for key in existing_results[cls]},
    )
    print("\n\n")

erp
Total 88 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.5674
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.6340
Eval on the current eval positive class Macro F1 (Agg) is: 0.5426
Eval on the current eval exact match (Agg) ratio is: 0.0455
Eval on the current eval exact match ratio (Relaxed) is: 0.0795
Eval on 64 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0312
Eval on the current eval clustered EM (Relaxed) is: 0.0625
Eval on the current eval clusrered F1 (max>=0.8) is: 0.3125



erp_warm
Total 66 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.5240
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.5858
Eval on the current eval positive class Macro F1 (Agg) is: 0.5199
Eval on the current eval exact match (Agg) ratio is: 0.0758
Eval on the current eval exact match ratio (Relaxed) is: 0.1515
Eval on 65 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0769
Eval on the current eval 

# Zero-shot: the same query prompt without exemplars

In [49]:
# Zero-shot pass over the sampled dev questions: the query prompt is sent on its own.
zs_results = {}
for key in tqdm(sampled_keys):
    item = dev[key]
    context = " ".join(item['context'])
    question = item['question']
    all_events = [tok for tok, flag in zip(item['context'], item["answers"]["types"]) if flag]
    ground = [tok for tok, flag in zip(item['context'], item["answers"]["labels"]) if flag]
    prompt = f"Q: {question}, select none or several from {{{', '.join(all_events)}}} \n{context}\nA:"

    # Keep only the stripped text of the first returned choice.
    zs_results[key] = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        max_tokens=256,
        temperature=0,
    )["choices"][0]["text"].strip()

    # Pause between requests.
    time.sleep(2)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [09:34<00:00,  2.87s/it]


In [61]:
# Zero-shot pass over the remaining dev questions.
for key in tqdm(the_rest_keys):
    # Resume support, matching the few-shot loop over the_rest_keys: keys answered
    # before an interruption are not sent to the API a second time.
    if key in zs_results:
        continue
    item = dev[key]
    context = " ".join(item['context'])
    question = item['question']
    all_events = [c for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c for c, t in zip(item['context'], item["answers"]["labels"]) if t]
    prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA:"

    # Keep only the stripped text of the first returned choice.
    zs_results[key] = openai.Completion.create(
              model="text-davinci-003",
              prompt=prompt,
              max_tokens=256,
              temperature=0
    )["choices"][0]["text"].strip()

    # Pause between requests.
    time.sleep(2)

  3%|███▎                                                                                                             | 37/1283 [01:50<1:02:04,  2.99s/it]


KeyboardInterrupt: 

In [50]:
# Score the zero-shot predictions against the sampled dev questions only.
evaluate(zs_results, {k: dev[k] for k in sampled_keys})

200it [00:00, 27368.14it/s]

Total 200 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4078
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.4359
Eval on the current eval positive class Macro F1 (Agg) is: 0.3459
Eval on the current eval exact match (Agg) ratio is: 0.0450
Eval on the current eval exact match ratio (Relaxed) is: 0.0850
Eval on 161 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0373
Eval on the current eval clustered EM (Relaxed) is: 0.0621
Eval on the current eval clusrered F1 (max>=0.8) is: 0.1615





In [74]:
# Peek at one tokenized context. `key` is not set in this cell; it is whatever value
# the most recently executed loop left behind.
np.array(dev[key]['context'])

array(['Nonetheless', ',', 'concern', 'about', 'the', 'chip', 'may',
       'have', 'been', 'responsible', 'for', 'a', 'decline', 'of', '87.5',
       'cents', 'in', 'Intel', "'s", 'stock', 'to', '$', '32', 'a',
       'share', 'yesterday', 'in', 'over-the-counter', 'trading', ',',
       'on', 'volume', 'of', '3,609,800', 'shares', ',', 'and', 'partly',
       'responsible', 'for', 'a', 'drop', 'in', 'Compaq', "'s", 'stock',
       'in', 'New', 'York', 'Stock', 'Exchange', 'composite', 'trading',
       'on', 'Wednesday', '.', 'Yesterday', ',', 'Compaq', 'plunged',
       'further', ',', 'closing', 'at', '$', '100', 'a', 'share', ',',
       'off', '$', '8.625', 'a', 'share', ',', 'on', 'volume', 'of',
       '2,633,700', 'shares', '.'], dtype='<U16')

In [82]:
import pandas as pd

def convert_to_text(context, labels):
    """Join the entries of `context` whose entry in `labels` converts to True, separated by ", "."""
    tokens = pd.Series(context)
    keep_mask = pd.Series(labels, dtype=bool)
    selected = tokens[keep_mask]
    return ", ".join(selected)

# Previous results: re-score predictions saved by earlier runs

In [62]:
# Load saved predictions; the context manager closes the file handle after parsing.
with open("gpt3_pred_dev.json") as pred_file:
    zs_pred_prev = json.load(pred_file)

In [85]:
# Turn each saved label vector into answer text, then score it against the dev set.
evaluate(
    {key: convert_to_text(dev[key]['context'], val) for key, val in zs_pred_prev.items()},
    dev,
)

1483it [00:00, 37519.85it/s]


Total 1483 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.3987
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.4548
Eval on the current eval positive class Macro F1 (Agg) is: 0.3696
Eval on the current eval exact match (Agg) ratio is: 0.0553
Eval on the current eval exact match ratio (Relaxed) is: 0.0836
Eval on 485 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0021
Eval on the current eval clustered EM (Relaxed) is: 0.0082
Eval on the current eval clusrered F1 (max>=0.8) is: 0.0165


In [86]:
# Load saved predictions; the context manager closes the file handle after parsing.
with open("./gpt3_fewshot_pred_dev.json") as pred_file:
    icl_pred_prev = json.load(pred_file)

In [160]:
# Load saved predictions; the context manager closes the file handle after parsing.
with open("./gpt3_oneshot_pred_dev.json") as pred_file:
    oneshot_icl_pred_prev = json.load(pred_file)

In [87]:
# Turn each saved label vector into answer text, then score it against the dev set.
evaluate(
    {key: convert_to_text(dev[key]['context'], val) for key, val in icl_pred_prev.items()},
    dev,
)

1483it [00:00, 64296.97it/s]


Total 1483 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4180
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.4604
Eval on the current eval positive class Macro F1 (Agg) is: 0.3824
Eval on the current eval exact match (Agg) ratio is: 0.0432
Eval on the current eval exact match ratio (Relaxed) is: 0.0593
Eval on 485 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0041
Eval on the current eval clustered EM (Relaxed) is: 0.0041
Eval on the current eval clusrered F1 (max>=0.8) is: 0.0144


In [89]:
# Subset name -> JSON file, all under ../data/dataset_bias_new/.
_biased_dev_files = {
    "erp": "individual_dev_erp_bias.json",
    "erp_warm": "individual_dev_warmup_answer_bias.json",
    "narrative": "individual_dev_narrative_bias.json",
    "tense": "individual_dev_tense_relation_bias.json",
    "tense_warm": "individual_dev_warmup_tense_bias.json",
    "dependency": "individual_dev_dependency_bias.json",
}

# Each file is read inside a context manager so no handle stays open. The entries
# keep the order of the mapping above.
biased_dev = {}
for _subset_name, _file_name in _biased_dev_files.items():
    with open("../data/dataset_bias_new/" + _file_name) as _subset_file:
        biased_dev[_subset_name] = json.load(_subset_file)

In [91]:
# Collect every question key that appears in any of the subsets, printing each subset name.
all_existing_keys = set()
for c, dev_subset in biased_dev.items():
    print(c)
    all_existing_keys.update(dev_subset.keys())

erp
erp_warm
narrative
tense
tense_warm
dependency


In [93]:
# Dev keys that belong to none of the subsets gathered in all_existing_keys.
other_keys = set(dev.keys()).difference(all_existing_keys)

In [162]:
# Turn each saved label vector into answer text, then score it against the dev set.
evaluate(
    {key: convert_to_text(dev[key]['context'], val) for key, val in oneshot_icl_pred_prev.items()},
    dev,
)

Total 1483 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4053
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.4430
Eval on the current eval positive class Macro F1 (Agg) is: 0.3642
Eval on the current eval exact match (Agg) ratio is: 0.0290
Eval on the current eval exact match ratio (Relaxed) is: 0.0472
Eval on 485 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0000
Eval on the current eval clustered EM (Relaxed) is: 0.0000
Eval on the current eval clusrered F1 (max>=0.8) is: 0.0062


In [164]:
# Score the saved one-shot predictions separately on each subset named in existing_results.
for cls in existing_results:
    print(cls)
    evaluate(
        {
            key: convert_to_text(dev[key]['context'], oneshot_icl_pred_prev[key])
            for key in existing_results[cls]
        },
        {key: dev[key] for key in existing_results[cls]},
    )

    print("\n\n")

erp
Total 88 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.5709
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.6117
Eval on the current eval positive class Macro F1 (Agg) is: 0.5452
Eval on the current eval exact match (Agg) ratio is: 0.0455
Eval on the current eval exact match ratio (Relaxed) is: 0.0795
Eval on 64 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0469
Eval on the current eval clustered EM (Relaxed) is: 0.0469
Eval on the current eval clusrered F1 (max>=0.8) is: 0.2031



erp_warm
Total 66 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.5089
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.5983
Eval on the current eval positive class Macro F1 (Agg) is: 0.5078
Eval on the current eval exact match (Agg) ratio is: 0.0606
Eval on the current eval exact match ratio (Relaxed) is: 0.1212
Eval on 65 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0615
Eval on the current eval 

In [105]:
# Counterfactual exemplars for questions that are NOT warm-up questions.
# For each such question a short story is generated around the candidate events the
# few-shot model did not pick, and that story becomes a solved exemplar prompt.
np.random.seed(0)
generated_examplars_cf = {}
generated_examplars_answers_cf = {}
examplar_prompts_cf = {}

for key in tqdm(other_keys):
    item = dev[key]
    question = item['question']

    # Warm-up questions are skipped here, before any spaCy lemmatisation is spent on them.
    if question in warmup_qs:
        continue

    context = " ".join(item['context'])
    all_events = [c.lower() for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c.lower() for c, t in zip(item['context'], item["answers"]["labels"]) if t]

    event_lemmas = [get_lemma(e) for e in all_events]

    question_events, _, rel = parse_question(question, event_lemmas)
    gpt3_answers = [w.lower().strip().replace(".", "") for w in icl_results[key].split(",")]
    # Candidates absent from the model's answer, de-duplicated and kept in context order.
    # A set difference holds the same elements, but its iteration order changes between
    # kernel sessions and so would change the generated prompt.
    selected_ans = [e for e in dict.fromkeys(all_events) if e not in gpt3_answers]

    if len(question_events) == 0:
        prompt = f"Write a story where {question}, {', '.join(selected_ans)} within 100 words"
    else:
        prompt = f"Write a story where '{', '.join(selected_ans)}' happened {rel} '{question_events[0]}' within 100 words:"

    generated_examplars_answers_cf[key] = selected_ans

    generated_examplars_cf[key] = openai.Completion.create(
                                          model="text-davinci-003",
                                          prompt=prompt,
                                          max_tokens=100,
                                          temperature=0
                                )["choices"][0]["text"].strip()
    time.sleep(1.5)

    # The generated story replaces the original passage in the exemplar prompt.
    context = generated_examplars_cf[key]
    gen_answers = generated_examplars_answers_cf[key]
    ex_prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA: " + ', '.join(gen_answers)

    examplar_prompts_cf[key] = ex_prompt

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1040/1040 [1:44:03<00:00,  6.00s/it]


In [109]:
# Phrase inserted into the story prompt for each warm-up question.
question_generator = {
    'What will happen in the future?': 'will happen in the future', 
    'What event has already finished?': 'have happened', 
    'What event has begun but has not finished?': 'have begun but have not finished', 
    'What is happening now?': 'is happening now',
    'What event has already happened?': 'have happened', 
    'What event has started?':'have started happening', 
                     }

# Counterfactual exemplars for the warm-up questions. Results are added to the
# generated_examplars_cf / generated_examplars_answers_cf / examplar_prompts_cf
# dictionaries, which this cell does not create.
for key in tqdm(other_keys):
    item = dev[key]
    question = item['question']

    # Only warm-up questions are handled in this cell.
    if question not in warmup_qs:
        continue

    context = " ".join(item['context'])
    all_events = [c.lower() for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c.lower() for c, t in zip(item['context'], item["answers"]["labels"]) if t]

    # No lemmatisation here: this branch never calls parse_question, so the spaCy pass
    # over every candidate event was unused work.
    gpt3_answers = [w.lower().strip().replace(".", "") for w in icl_results[key].split(",")]
    # Candidates absent from the model's answer, de-duplicated and kept in context order.
    # A set difference holds the same elements, but its iteration order changes between
    # kernel sessions and so would change the generated prompt.
    selected_ans = [e for e in dict.fromkeys(all_events) if e not in gpt3_answers]
    prompt = f"Write a story where '{', '.join(selected_ans)}' {question_generator[question]} within 100 words:"

    generated_examplars_answers_cf[key] = selected_ans

    generated_examplars_cf[key] = openai.Completion.create(
                                          model="text-davinci-003",
                                          prompt=prompt,
                                          max_tokens=100,
                                          temperature=0
                                )["choices"][0]["text"].strip()
    time.sleep(2)

    # The generated story replaces the original passage in the exemplar prompt.
    context = generated_examplars_cf[key]
    gen_answers = generated_examplars_answers_cf[key]
    ex_prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA: " + ', '.join(gen_answers)

    examplar_prompts_cf[key] = ex_prompt

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1040/1040 [10:38<00:00,  1.63it/s]


In [100]:
# Number of zero-shot predictions collected so far.
len(zs_results)

238

In [102]:
# Inspect selected_ans as left by the last loop iteration that assigned it.
selected_ans

['killed']

True

In [111]:
# One-shot pass over other_keys in which each question gets its own generated exemplar.
# Raises KeyError for any key that has no entry in examplar_prompts_cf.
cf_gen_cf_results = {}
for key in tqdm(other_keys):
    item = dev[key]
    context = " ".join(item['context'])
    question = item['question']
    all_events = [tok for tok, flag in zip(item['context'], item["answers"]["types"]) if flag]
    ground = [tok for tok, flag in zip(item['context'], item["answers"]["labels"]) if flag]
    prompt = f"Q: {question}, select none or several from {{{', '.join(all_events)}}} \n{context}\nA:"

    examplar = examplar_prompts_cf[key]

    # Keep only the stripped text of the first returned choice.
    cf_gen_cf_results[key] = openai.Completion.create(
        model="text-davinci-003",
        prompt=examplar + "\n\n" + prompt,
        max_tokens=256,
        temperature=0,
    )["choices"][0]["text"].strip()

    # Pause between requests.
    time.sleep(2)
    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1040/1040 [57:45<00:00,  3.33s/it]


In [113]:
# Subset name -> saved one-shot prediction file, all under data/.
_existing_result_files = {
    "erp": "erp_examplar_cf_gpt3_oneshot_pred.json",
    "erp_warm": "warmup_ans_examplar_cf_gpt3_oneshot_pred.json",
    "narrative": "narrative_examplar_cf_gpt3_oneshot_pred.json",
    "tense": "tense_examplar_cf_gpt3_oneshot_pred.json",
    "tense_warm": "warmup_tense_examplar_gpt3_oneshot_pred.json",
    "dependency": "dep_examplar_gpt3_oneshot_pred.json",
}

# Each file is read inside a context manager so no handle stays open. The entries
# keep the order of the mapping above.
existing_results = {}
for _subset_name, _file_name in _existing_result_files.items():
    with open("data/" + _file_name) as _result_file:
        existing_results[_subset_name] = json.load(_result_file)

In [115]:
existing_results['erp']

# Merge every subset's predictions, then the generated-exemplar predictions.
# Later sources overwrite earlier ones on shared keys.
all_pred = {}
for key, val in existing_results.items():
    all_pred.update(val)
all_pred.update(cf_gen_cf_results)

In [117]:
# Score the merged predictions against the whole dev set.
# NOTE(review): the score recorded under this cell differs from the later In [153] / In [154]
# runs, which merge the same two sources. Presumably this cell ran with an earlier `evaluate`
# that paired predictions with questions by position rather than by key. Confirm before
# quoting this number.
evaluate(all_pred, dev)

1483it [00:00, 42532.12it/s]


Total 1483 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.0731
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.2834
Eval on the current eval positive class Macro F1 (Agg) is: 0.1980
Eval on the current eval exact match (Agg) ratio is: 0.1693
Eval on the current eval exact match ratio (Relaxed) is: 0.2441
Eval on 485 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0082
Eval on the current eval clustered EM (Relaxed) is: 0.0206
Eval on the current eval clusrered F1 (max>=0.8) is: 0.0206


In [134]:
# Score the generated-exemplar predictions on just the questions they cover.
evaluate(cf_gen_cf_results, {key: dev[key] for key in cf_gen_cf_results})

1040it [00:00, 36584.47it/s]

Total 1040 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.3633
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.4032
Eval on the current eval positive class Macro F1 (Agg) is: 0.3231
Eval on the current eval exact match (Agg) ratio is: 0.0365
Eval on the current eval exact match ratio (Relaxed) is: 0.0606
Eval on 450 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0000
Eval on the current eval clustered EM (Relaxed) is: 0.0044
Eval on the current eval clusrered F1 (max>=0.8) is: 0.0244





In [135]:
# Merge the per-subset predictions into one dict; later subsets overwrite shared keys.
bias_pred = {}
for key, val in existing_results.items():
    bias_pred.update(val)

In [136]:
# Score the subset predictions on just the questions they cover.
evaluate(bias_pred, {key: dev[key] for key in bias_pred})

443it [00:00, 44903.86it/s]

Total 443 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4796
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.5477
Eval on the current eval positive class Macro F1 (Agg) is: 0.4706
Eval on the current eval exact match (Agg) ratio is: 0.0181
Eval on the current eval exact match ratio (Relaxed) is: 0.0316
Eval on 291 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0241
Eval on the current eval clustered EM (Relaxed) is: 0.0412
Eval on the current eval clusrered F1 (max>=0.8) is: 0.2131





In [144]:
# Dev keys that have no prediction in either source; an empty set means full coverage.
set(dev.keys()).difference(bias_pred, cf_gen_cf_results)

set()

In [149]:
# Rebuild the dev dict in the key order of the merged predictions.
tmp_dev = {key: dev[key] for key in {**bias_pred, **cf_gen_cf_results}}

In [151]:
# Sanity check: every dev item equals the item stored under the same key in tmp_dev.
all([dev[key] == tmp_dev[key] for key in dev])

True

In [153]:
# Score the merged predictions against dev items arranged in the predictions' key order.
evaluate(
    {**bias_pred, **cf_gen_cf_results},
    {key: dev[key] for key in {**bias_pred, **cf_gen_cf_results}},
)

Total 1483 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4032
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.4463
Eval on the current eval positive class Macro F1 (Agg) is: 0.3671
Eval on the current eval exact match (Agg) ratio is: 0.0310
Eval on the current eval exact match ratio (Relaxed) is: 0.0519
Eval on 485 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0000
Eval on the current eval clustered EM (Relaxed) is: 0.0021
Eval on the current eval clusrered F1 (max>=0.8) is: 0.0186


In [154]:
# Score the same merged predictions against dev in its own key order.
evaluate({**bias_pred, **cf_gen_cf_results}, dev)

Total 1483 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4032
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.4463
Eval on the current eval positive class Macro F1 (Agg) is: 0.3671
Eval on the current eval exact match (Agg) ratio is: 0.0310
Eval on the current eval exact match ratio (Relaxed) is: 0.0519
Eval on 485 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0000
Eval on the current eval clustered EM (Relaxed) is: 0.0021
Eval on the current eval clusrered F1 (max>=0.8) is: 0.0186


# Evaluation: definition of `evaluate` used by the cells above

In [152]:
def evaluate(pred_dict, data_dict):
    """
    Score comma-separated answer text against token-level gold labels and print the metrics.

    Each prediction is split into lower-cased words. Every context token whose
    lower-cased form is one of those words (and is not the sentinel "none") is
    predicted positive; all other tokens are predicted negative.

    Args:
        pred_dict: question key -> answer text. Looked up by key for every key of
            `data_dict`; a missing key raises KeyError.
        data_dict: question key -> item. The fields read are item['context'] (tokens),
            item['answers']['labels'] (one 0/1 label per token),
            item['individual_answers'] (a list of dicts, each with 'labels'),
            item['cluster_size'] and item['question_cluster'].

    Returns:
        None. The metrics are printed: micro and macro F1 against the aggregated labels,
        the best F1 and exact match against any individual answer ("Relaxed"), and
        per-cluster consistency figures over the clusters whose size is above 1.
        A cluster figure prints as nan when there is no such cluster.

    Raises:
        ValueError: if `data_dict` is empty.

    `cal_f1` and `exact_match` are not defined here and must be available in the
    enclosing module.
    """
    import numpy as np

    if not data_dict:
        # Previously this case failed later with a ZeroDivisionError from an unrelated line.
        raise ValueError("data_dict is empty; nothing to evaluate")

    def parse_results(text):
        """Split on commas; lower-case, strip and drop periods; return the distinct words."""
        return {w.lower().strip().replace(".", "") for w in text.split(",")}

    def cluster_ratio(flags):
        """Share of clusters still flagged 1; nan when no cluster was recorded."""
        return sum(flags.values()) / len(flags) if flags else float("nan")

    label_map = {0: 'Negative', 1: 'Positive'}

    # convert the parsed tokens into one-hot predictions
    preds = []
    labels = []
    eval_idv_answers = []
    question_cluster_size = []
    question_cluster = []
    question_ids = []

    for key in data_dict:
        item, ans = data_dict[key], parse_results(pred_dict[key])
        # The sentinel check is done on the lower-cased token. Comparing the raw token
        # let a context token "None" be marked positive when the model answered "None".
        preds.append([1 if t.lower() in ans and t.lower() != "none" else 0 for t in item['context']])
        labels.append(item["answers"]["labels"])
        eval_idv_answers.append([a['labels'] for a in item['individual_answers']])
        question_cluster_size.append(item['cluster_size'])
        question_cluster.append(item["question_cluster"])
        # One id per gold label, so the ids line up with the flattened label list.
        question_ids.extend([key] * len(item["answers"]["labels"]))

    all_preds, all_golds, max_f1s, macro_f1s = [], [], [], []
    em_counter = 0
    em_cluster_agg, em_cluster_relaxed, f1_cluster_80 = {}, {}, {}

    for idx in range(len(data_dict)):

        pred = preds[idx]
        all_preds.extend(pred)
        label = labels[idx]
        all_golds.extend(label)
        pred_names = [label_map[p] for p in pred]
        gold_names = [label_map[l] for l in label]
        is_em = (pred_names == gold_names)

        # Predicting nothing when nothing is gold counts as a perfect score.
        if sum(label) == 0 and sum(pred) == 0:
            macro_f1s.append(1.0)
        else:
            macro_f1s.append(cal_f1(pred_names, gold_names, {v:k for k,v in label_map.items()}))

        # Relaxed scoring: the best F1 over the individual answers, and whether any
        # individual answer is matched exactly.
        max_f1, instance_matched = 0, 0
        for gold in eval_idv_answers[idx]:
            label_names = [label_map[l] for l in gold]
            if pred_names == label_names:
                instance_matched = 1
            if sum(gold) == 0 and sum(pred) == 0:
                f1 = 1.0
            else:
                f1 = cal_f1(pred_names, label_names, {v:k for k,v in label_map.items()})
            if f1 >= max_f1:
                max_f1 = f1

        # A cluster keeps its flag at 1 only if every one of its questions passes.
        if question_cluster_size[idx] > 1:
            cluster = question_cluster[idx]

            em_cluster_agg.setdefault(cluster, 1)
            if not is_em:
                em_cluster_agg[cluster] = 0

            em_cluster_relaxed.setdefault(cluster, 1)
            if instance_matched == 0:
                em_cluster_relaxed[cluster] = 0

            f1_cluster_80.setdefault(cluster, 1)
            if max_f1 < 0.8:
                f1_cluster_80[cluster] = 0

        max_f1s.append(max_f1)
        em_counter += instance_matched

    assert len(all_preds) == len(question_ids)
    assert len(f1_cluster_80) == len(em_cluster_agg) 

    # Guarded ratios: with no cluster of size above 1 the plain division raised ZeroDivisionError.
    em_cluster_relaxed_res = cluster_ratio(em_cluster_relaxed)
    em_cluster_agg_res = cluster_ratio(em_cluster_agg)
    f1_cluster_80_res = cluster_ratio(f1_cluster_80)

    label_names = [label_map[l] for l in all_golds]
    pred_names = [label_map[p] for p in all_preds]

    em = exact_match(question_ids, label_names, pred_names)
    eval_pos_f1 = cal_f1(pred_names, label_names, {v:k for k,v in label_map.items()})


    print("Eval on the current eval positive class Micro F1 (Agg) is: %.4f" % eval_pos_f1)
    print("Eval on the current eval positive class Macro F1 (Relaxed) is: %.4f" % np.mean(max_f1s)) # output F1
    print("Eval on the current eval positive class Macro F1 (Agg) is: %.4f" % np.mean(macro_f1s))

    print("Eval on the current eval exact match (Agg) ratio is: %.4f" % em)
    print("Eval on the current eval exact match ratio (Relaxed) is: %.4f" % (em_counter / len(data_dict))) # output EM

    print("Eval on %d Clusters" % len(em_cluster_relaxed))
    print("Eval on the current eval clustered EM (Agg) is: %.4f" % (em_cluster_agg_res))
    print("Eval on the current eval clustered EM (Relaxed) is: %.4f" % (em_cluster_relaxed_res))
    print("Eval on the current eval clusrered F1 (max>=0.8) is: %.4f" % (f1_cluster_80_res)) # consistency
