In [2]:
import spacy
import json

import os
import openai

from tqdm import tqdm
import time

import numpy as np
np.random.seed(0)

openai.api_key = os.getenv("OPENAI_API_KEY")

dev = json.load(open("../data/individual_dev_end2end_final.json"))

get_lemma = lambda x:" ".join([token.lemma_ for token in nlp(x)])


nlp = spacy.load("en_core_web_sm")

relation_trigger = ['before', 'after', 'during', 'while']

warmup_qs = ['What will happen in the future?', 'What event has already finished?', 
             'What event has begun but has not finished?', 'What is happening now?',
             'What event has already happened?', 'What event has started?', 
            ]


def parse_question(q, event_lemmas):
    """
        input: q: question, events: the set of lemmatized events.
        output: 
            q_events: events in the question
            modality: whether there's "might/will/can/may/..."
            base_temp_rel: basic temporal relations, ["before", "after", "during", "while"]
    """
    # acquire the events in the question stem
    q_events = [e for e in [token.lemma_ for token in nlp(q)] if e in event_lemmas]
    
    second_prefix = q.split()[1]

    rel_trigger = [t for t in q.split() if t in relation_trigger]

    if len(rel_trigger) > 0:
        base_temp_rel = rel_trigger[0]
    else:
        base_temp_rel = ""
        
    return q_events, second_prefix, base_temp_rel

In [6]:
prompt_warmup = {
    
    'What will happen in the future?':"will happen in the future",
    'What event has already finished?':"has already finished", 
    'What event has begun but has not finished?':"has begun but has not finished", 
    'What is happening now?':"is happening now",
    'What event has already happened?':"has already happened", 
    'What event has started?':"has started", 
}

In [None]:
emp_prompt = f"Write a story where '{', '.join(exmp_ans)}' {prompt_warmup[exmp_question]} within 100 words:"

In [4]:
exmp_context = "Pope John Paul II on Friday appointed two Chinese scientists -- one from Taiwan , the other from the mainland -- to the Pontifical Academy of Sciences . The two are Chin Ningyang , 74 , from Hefei in China , the 1957 Nobel Physics Prize winner and son of a mathematics professor at Beijing university ."
exmp_all_events = ["appointed", "are", "winner"]
exmp_ans = ["appointed", "winner"]
exmp_question = "What event has already finished?"

In [11]:
emp_prompt = f"Write a story where '{', '.join(exmp_ans)}' {prompt_warmup[exmp_question]} within 100 words:\n{exmp_context}"

In [12]:
emp_prompt

"Write a story where 'appointed, winner' has already finished within 100 words:\nPope John Paul II on Friday appointed two Chinese scientists -- one from Taiwan , the other from the mainland -- to the Pontifical Academy of Sciences . The two are Chin Ningyang , 74 , from Hefei in China , the 1957 Nobel Physics Prize winner and son of a mathematics professor at Beijing university ."

In [14]:
dev_warmup_tense = json.load(open("../data/dataset_bias_new/individual_dev_warmup_tense_bias.json"))

# with open("data/warmup_tense_examplar_gpt3_oneshot_pred.json", 'w') as outfile:
#     json.dump(oneshot_gen_results, outfile)
    
oneshot_gen_results = json.load(open("data/warmup_tense_examplar_gpt3_oneshot_pred.json"))

In [22]:
np.random.seed(0)
generated_examplars_cf = {}
generated_examplars_answers_cf = {}
examplar_prompts_cf = {}

for key, item in tqdm(dev_warmup_tense.items()):
    context = " ".join(item['context'])
    question = item['question']
    all_events = [c.lower() for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c.lower() for c, t in zip(item['context'], item["answers"]["labels"]) if t]

    event_lemmas = [get_lemma(e) for e in all_events]
    
#     if not question in warmup_qs:
#     question_events, _, rel = parse_question(question, event_lemmas)
    gpt3_answers = [w.lower().strip().replace(".", "") for w in oneshot_gen_results[key].split(",")]
    selected_ans = list(set(all_events) - set(gpt3_answers))
#         print(len(selected_ans), len(gpt3_answers), len(all_events))
#         print(selected_ans, gpt3_answers, all_events)
    prompt = f"Write a story where '{', '.join(selected_ans)}' {prompt_warmup[question]} within 100 words:"
#     else:
#         assert False
    
    generated_examplars_answers_cf[key] = selected_ans

    generated_examplars_cf[key] = openai.Completion.create(
                                          model="text-davinci-003",
                                          prompt=prompt,
                                          max_tokens=100,
                                          temperature=0
                                )["choices"][0]["text"].strip()
    time.sleep(2)
    # 
#     question = f"What happened {rel} {question_events[0]}?"
    context = generated_examplars_cf[key]
    gen_answers = generated_examplars_answers_cf[key]
    ex_prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA: " + ', '.join(gen_answers)

    examplar_prompts_cf[key] = ex_prompt


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [07:59<00:00,  6.76s/it]


In [17]:
examplar_prompts_cf

{'docid_AFP_ENG_20061201.0595_sentid_2_6': 'Q: What will happen in the future?, select none or several from {meet, requires, delivered, said, designed, compete, proved} \nThe future requires a new kind of delivery system. One that can meet the needs of a rapidly changing world. After years of research and development, the new system is finally ready. It is capable of delivering anything, anywhere, anytime. It is faster, more efficient, and more reliable than anything that has come before. The world is ready to meet the new system and see what it can do. With its help, the future looks brighter than ever.\nA: requires, delivered, meet'}

In [23]:
oneshot_gen_cf_results = {}
for key, item in tqdm(dev_warmup_tense.items()):
    context = " ".join(item['context'])
    question = item['question']
    all_events = [c for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c for c, t in zip(item['context'], item["answers"]["labels"]) if t]
    prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA:"
    
    examplar = examplar_prompts_cf[key]
    
    oneshot_gen_cf_results[key] = openai.Completion.create(
              model="text-davinci-003",
              prompt=examplar + "\n\n" + prompt,
              max_tokens=256,
              temperature=0
    )["choices"][0]["text"].strip()
    
    time.sleep(2)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [03:43<00:00,  3.15s/it]


In [19]:
oneshot_gen_cf_results

{'docid_AFP_ENG_20061201.0595_sentid_2_6': 'meet, requires, delivered, said, designed, compete, proved'}

In [24]:
from eval_func_gpt3 import *

In [25]:
evaluate(oneshot_gen_cf_results, dev_warmup_tense)

71it [00:00, 36343.13it/s]

Total 71 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4232
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.4754
Eval on the current eval positive class Macro F1 (Agg) is: 0.4010
Eval on the current eval exact match (Agg) ratio is: 0.0141
Eval on the current eval exact match ratio (Relaxed) is: 0.0282
Eval on 70 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0143
Eval on the current eval clustered EM (Relaxed) is: 0.0286
Eval on the current eval clusrered F1 (max>=0.8) is: 0.1571





# random sample examplar

In [26]:
train = json.load(open("../data/train_end2end_final.json"))

In [33]:
np.random.choice(list(train.keys()), 1)

array(['docid_AFP_ENG_20061220.0595_sentid_4_3VP0C6EFSHLMWY4FQPRK84HQEUIM6U_0'],
      dtype='<U76')

In [37]:
item = train['docid_AFP_ENG_20061220.0595_sentid_4_3VP0C6EFSHLMWY4FQPRK84HQEUIM6U_0']
context = " ".join(item['context'])
question = item['question']
all_events = [c for c, t in zip(item['context'], item["answers"]["types"]) if t]
ground = [c for c, t in zip(item['context'], item["answers"]["labels"]) if t]
examplar_prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + f"\nA: {', '.join(ground)}"

In [39]:
icl_cf_results = {}
for key, item in tqdm(dev_warmup_tense.items()):
    context = " ".join(item['context'])
    question = item['question']
    all_events = [c for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c for c, t in zip(item['context'], item["answers"]["labels"]) if t]
    prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA:"
    
    examplar = examplar_prompts_cf[key]
    
    icl_cf_results[key] = openai.Completion.create(
              model="text-davinci-003",
              prompt=examplar_prompt + "\n\n" + prompt,
              max_tokens=256,
              temperature=0
    )["choices"][0]["text"].strip()
    
    time.sleep(2)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [03:38<00:00,  3.07s/it]


In [40]:
evaluate(icl_cf_results, dev_warmup_tense)

71it [00:00, 37210.49it/s]

Total 71 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4654
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.5260
Eval on the current eval positive class Macro F1 (Agg) is: 0.4515
Eval on the current eval exact match (Agg) ratio is: 0.0563
Eval on the current eval exact match ratio (Relaxed) is: 0.0845
Eval on 70 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0571
Eval on the current eval clustered EM (Relaxed) is: 0.0857
Eval on the current eval clusrered F1 (max>=0.8) is: 0.2571





In [41]:
dev_warmup_ans = json.load(open("../data/dataset_bias_new/individual_dev_warmup_answer_bias.json"))

In [42]:
icl_cf_results_ans = {}
for key, item in tqdm(dev_warmup_ans.items()):
    context = " ".join(item['context'])
    question = item['question']
    all_events = [c for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c for c, t in zip(item['context'], item["answers"]["labels"]) if t]
    prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA:"
    
    examplar = examplar_prompts_cf[key]
    
    icl_cf_results_ans[key] = openai.Completion.create(
              model="text-davinci-003",
              prompt=examplar_prompt + "\n\n" + prompt,
              max_tokens=256,
              temperature=0
    )["choices"][0]["text"].strip()
    
    time.sleep(2)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 66/66 [03:29<00:00,  3.18s/it]


In [43]:
evaluate(icl_cf_results_ans, dev_warmup_ans)

66it [00:00, 61122.56it/s]

Total 66 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4849
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.5559
Eval on the current eval positive class Macro F1 (Agg) is: 0.4736
Eval on the current eval exact match (Agg) ratio is: 0.0606
Eval on the current eval exact match ratio (Relaxed) is: 0.0909
Eval on 65 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0615
Eval on the current eval clustered EM (Relaxed) is: 0.0923
Eval on the current eval clusrered F1 (max>=0.8) is: 0.2769





In [44]:
dev_erp = json.load(open("../data/dataset_bias_new/individual_dev_erp_bias.json"))

In [46]:
icl_cf_results_erp = {}
for key, item in tqdm(dev_erp.items()):
    context = " ".join(item['context'])
    question = item['question']
    all_events = [c for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c for c, t in zip(item['context'], item["answers"]["labels"]) if t]
    prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA:"
    
    icl_cf_results_erp[key] = openai.Completion.create(
              model="text-davinci-003",
              prompt=examplar_prompt + "\n\n" + prompt,
              max_tokens=256,
              temperature=0
    )["choices"][0]["text"].strip()
    
    time.sleep(2)
evaluate(icl_cf_results_erp, dev_erp)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [04:52<00:00,  3.33s/it]
88it [00:00, 60045.35it/s]

Total 88 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.5288
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.5802
Eval on the current eval positive class Macro F1 (Agg) is: 0.5054
Eval on the current eval exact match (Agg) ratio is: 0.0227
Eval on the current eval exact match ratio (Relaxed) is: 0.0341
Eval on 64 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0156
Eval on the current eval clustered EM (Relaxed) is: 0.0312
Eval on the current eval clusrered F1 (max>=0.8) is: 0.2031



