In [1]:
import spacy
import json

import os
import openai

from tqdm import tqdm
import time

import numpy as np
# Seed NumPy's global RNG; np.random.choice is used later to sample events.
np.random.seed(0)

# The API key is read from the environment so it never appears in the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the dev split inside a context manager so the file handle is closed
# as soon as parsing finishes.
with open("../data/individual_dev_end2end_final.json") as dev_file:
    dev = json.load(dev_file)

# spaCy pipeline used for tokenisation and lemmatisation.
nlp = spacy.load("en_core_web_sm")


def get_lemma(x):
    """Return `x` with each spaCy token replaced by its lemma, joined by single spaces."""
    return " ".join([token.lemma_ for token in nlp(x)])


# Words recognised as temporal-relation triggers inside a question.
relation_trigger = ['before', 'after', 'during', 'while']

# Generic questions that mention no specific event; the exemplar-generation
# loops refuse to handle them.
warmup_qs = ['What will happen in the future?', 'What event has already finished?', 
             'What event has begun but has not finished?', 'What is happening now?',
             'What event has already happened?', 'What event has started?', 
            ]

def parse_question(q, event_lemmas):
    """
        Extract the pieces of a question that the exemplar prompts are built from.

        input:
            q: the question string.
            event_lemmas: collection of lemmatized event strings to look for.
        output:
            q_events: lemmas of the question's tokens that occur in
                event_lemmas, in question order.
            second_prefix: the second whitespace-separated word of q, or ""
                when q has fewer than two words (presumably the tense/modality
                word, e.g. "will" in "What will happen ..." -- confirm).
            base_temp_rel: the first word of q that is one of relation_trigger
                (["before", "after", "during", "while"]), or "" if none occurs.
    """
    # acquire the events in the question stem
    q_events = [lemma for lemma in (token.lemma_ for token in nlp(q)) if lemma in event_lemmas]

    words = q.split()
    # A one-word or empty question has no second word; return "" rather than
    # raising IndexError.
    second_prefix = words[1] if len(words) > 1 else ""

    # Compare case-insensitively and without surrounding punctuation so that
    # "Before ..." or "... after?" are still recognised as triggers.
    punctuation = "?.,!;:\"'()[]"
    normalised_words = (w.strip(punctuation).lower() for w in words)
    base_temp_rel = next((w for w in normalised_words if w in relation_trigger), "")

    return q_events, second_prefix, base_temp_rel

In [16]:
# Zero-shot baseline: every dev question is sent to the model without an exemplar.
zero_shot_results = {}
for key, item in tqdm(dev_tense.items()):
    context = " ".join(item['context'])
    question = item['question']
    # Keep the context entries whose answers["types"] / answers["labels"] flag is truthy.
    all_events = [tok for tok, keep in zip(item['context'], item["answers"]["types"]) if keep]
    ground = [tok for tok, keep in zip(item['context'], item["answers"]["labels"]) if keep]
    options = ', '.join(all_events)
    prompt = f"Q: {question}, select none or several from {{{options}}} \n{context}\nA:"

    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        max_tokens=256,
        temperature=0,
    )
    zero_shot_results[key] = response["choices"][0]["text"].strip()

    # Pause between requests.
    time.sleep(2)
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 243/243 [11:07<00:00,  2.75s/it]


In [17]:
# Report metrics for the zero-shot predictions on dev_tense; `evaluate` is
# imported from eval_func_gpt3 in another cell of this notebook.
evaluate(zero_shot_results, dev_tense)

243it [00:00, 67106.65it/s]

Total 243 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4699
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.5346
Eval on the current eval positive class Macro F1 (Agg) is: 0.4544
Eval on the current eval exact match (Agg) ratio is: 0.0206
Eval on the current eval exact match ratio (Relaxed) is: 0.0412
Eval on 157 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0255
Eval on the current eval clustered EM (Relaxed) is: 0.0446
Eval on the current eval clusrered F1 (max>=0.8) is: 0.1656





# Get exemplars

In [2]:
# Load the tense/relation-bias dev subset that the prompting loops iterate over.
# The context manager closes the file handle once the JSON has been parsed.
with open("../data/dataset_bias_new/individual_dev_tense_relation_bias.json") as dev_tense_file:
    dev_tense = json.load(dev_tense_file)

In [3]:
# Build one generated exemplar per dev item: sample up to three events from
# the passage, ask the model for a short story in which they stand in the
# question's temporal relation to the question's event, then wrap the story as
# a solved Q/A prompt.
np.random.seed(0)
generated_examplars = {}
generated_examplars_answers = {}
examplar_prompts = {}

for key, item in tqdm(dev_tense.items()):
    context = " ".join(item['context'])
    question = item['question']
    all_events = [c for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c for c, t in zip(item['context'], item["answers"]["labels"]) if t]

    event_lemmas = [get_lemma(e) for e in all_events]

    # Warm-up questions mention no specific event, so no story prompt can be
    # built for them. Raise explicitly: an `assert` is stripped under `-O`.
    if question in warmup_qs:
        raise ValueError(f"warm-up question is not supported here (key={key!r}): {question!r}")

    question_events, _, rel = parse_question(question, event_lemmas)
    if not question_events:
        # Fail with the offending item instead of a bare IndexError below.
        raise ValueError(f"no known event found in question (key={key!r}): {question!r}")

    # .tolist() turns the sampled ndarray into plain Python strings so the
    # stored answers are JSON-serialisable and match the counterfactual cell.
    selected_ans = np.random.choice(all_events, min(3, len(all_events)), replace=False).tolist()
    prompt = f"Write a story where '{', '.join(selected_ans)}' happened {rel} '{question_events[0]}' within 100 words:"

    generated_examplars_answers[key] = selected_ans

    generated_examplars[key] = openai.Completion.create(
                                          model="text-davinci-003",
                                          prompt=prompt,
                                          max_tokens=100,
                                          temperature=0
                                )["choices"][0]["text"].strip()
    time.sleep(2)

    # Re-use the loop names for the exemplar itself: a simplified question,
    # the generated story as context, and the sampled events as the answer.
    question = f"What happened {rel} {question_events[0]}?"
    context = generated_examplars[key]
    gen_answers = generated_examplars_answers[key]
    ex_prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA: " + ', '.join(gen_answers)

    examplar_prompts[key] = ex_prompt

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 243/243 [21:41<00:00,  5.36s/it]


In [4]:
# One-shot run: each question is preceded by the exemplar generated for the same key.
oneshot_gen_results = {}
for key, item in tqdm(dev_tense.items()):
    context = " ".join(item['context'])
    question = item['question']
    # Context entries selected by the truthy flags in answers["types"] / answers["labels"].
    all_events = [tok for tok, keep in zip(item['context'], item["answers"]["types"]) if keep]
    ground = [tok for tok, keep in zip(item['context'], item["answers"]["labels"]) if keep]
    options = ', '.join(all_events)
    prompt = f"Q: {question}, select none or several from {{{options}}} \n{context}\nA:"

    examplar = examplar_prompts[key]

    # The exemplar and the real question are separated by a blank line.
    completion = openai.Completion.create(
        model="text-davinci-003",
        prompt="\n\n".join((examplar, prompt)),
        max_tokens=256,
        temperature=0,
    )
    oneshot_gen_results[key] = completion["choices"][0]["text"].strip()

    time.sleep(2)
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 243/243 [11:16<00:00,  2.78s/it]


In [5]:
from eval_func_gpt3 import evaluate

In [13]:
# Report metrics for the one-shot (generated-exemplar) predictions on dev_tense.
evaluate(oneshot_gen_results, dev_tense)

243it [00:00, 28831.32it/s]

Total 243 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4844
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.5606
Eval on the current eval positive class Macro F1 (Agg) is: 0.4807
Eval on the current eval exact match (Agg) ratio is: 0.0165
Eval on the current eval exact match ratio (Relaxed) is: 0.0288
Eval on 157 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0191
Eval on the current eval clustered EM (Relaxed) is: 0.0318
Eval on the current eval clusrered F1 (max>=0.8) is: 0.2420





In [14]:
# Persist the exemplar prompts (key -> prompt string) as JSON.
# NOTE(review): this path is relative to "data/", whereas the inputs are read
# from "../data/" -- confirm the intended output directory.
examplar_path = "data/tense_examplar_gpt3.json"
with open(examplar_path, mode='w') as outfile:
    json.dump(examplar_prompts, fp=outfile)

In [15]:
# Persist the raw one-shot predictions (key -> stripped completion text) as JSON.
oneshot_pred_path = "data/tense_examplar_gpt3_oneshot_pred.json"
with open(oneshot_pred_path, mode='w') as outfile:
    json.dump(oneshot_gen_results, fp=outfile)

# Get counterfactual exemplars with GPT-3

Counterfactual exemplars

Inputs for each item: passage, question, all_events.

First, use GPT-3 to predict the temporal relations.

E.g., for the question "What happened after e1?",
GPT-3 predicts: e21, e22, ..., e2n

Then ask GPT-3 to generate a story about the events that are not among e21, e22, ..., e2n.



In [19]:
# Build one counterfactual exemplar per dev item: take the events that the
# one-shot model did NOT select, ask the model for a short story in which
# those events stand in the question's temporal relation to the question's
# event, then wrap the story as a solved Q/A prompt.
np.random.seed(0)
generated_examplars_cf = {}
generated_examplars_answers_cf = {}
examplar_prompts_cf = {}

for key, item in tqdm(dev_tense.items()):
    context = " ".join(item['context'])
    question = item['question']
    # Events are lower-cased so they can be compared with the lower-cased model output.
    all_events = [c.lower() for c, t in zip(item['context'], item["answers"]["types"]) if t]
    ground = [c.lower() for c, t in zip(item['context'], item["answers"]["labels"]) if t]

    event_lemmas = [get_lemma(e) for e in all_events]

    # Warm-up questions mention no specific event, so no story prompt can be
    # built for them. Raise explicitly: an `assert` is stripped under `-O`.
    if question in warmup_qs:
        raise ValueError(f"warm-up question is not supported here (key={key!r}): {question!r}")

    question_events, _, rel = parse_question(question, event_lemmas)
    if not question_events:
        # Fail with the offending item instead of a bare IndexError below.
        raise ValueError(f"no known event found in question (key={key!r}): {question!r}")

    # Normalise the comma-separated prediction. Periods are removed BEFORE
    # trimming so that an answer such as "died ." does not keep a trailing space.
    gpt3_answers = [w.replace(".", "").strip().lower() for w in oneshot_gen_results[key].split(",")]
    predicted = set(gpt3_answers)
    # Keep passage order and drop duplicates. A plain set difference would
    # order the events by string hash, which changes between interpreter runs
    # and would make the prompt (and so the completion) non-reproducible.
    # selected_ans may be empty when the model already selected every event.
    selected_ans = [e for e in dict.fromkeys(all_events) if e not in predicted]
    prompt = f"Write a story where '{', '.join(selected_ans)}' happened {rel} '{question_events[0]}' within 100 words:"

    generated_examplars_answers_cf[key] = selected_ans

    generated_examplars_cf[key] = openai.Completion.create(
                                          model="text-davinci-003",
                                          prompt=prompt,
                                          max_tokens=100,
                                          temperature=0
                                )["choices"][0]["text"].strip()
    time.sleep(2)

    # Re-use the loop names for the exemplar itself: a simplified question,
    # the generated story as context, and the unselected events as the answer.
    question = f"What happened {rel} {question_events[0]}?"
    context = generated_examplars_cf[key]
    gen_answers = generated_examplars_answers_cf[key]
    ex_prompt = f"Q: {question}" + ", select none or several from {" + ', '.join(all_events) + "} \n" + context + "\nA: " + ', '.join(gen_answers)

    examplar_prompts_cf[key] = ex_prompt

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 243/243 [21:14<00:00,  5.24s/it]


In [21]:
# One-shot run using the counterfactual exemplar generated for the same key.
oneshot_gen_cf_results = {}
for key, item in tqdm(dev_tense.items()):
    context = " ".join(item['context'])
    question = item['question']
    # Context entries selected by the truthy flags in answers["types"] / answers["labels"].
    all_events = [entry for entry, flag in zip(item['context'], item["answers"]["types"]) if flag]
    ground = [entry for entry, flag in zip(item['context'], item["answers"]["labels"]) if flag]
    choices_text = ', '.join(all_events)
    prompt = "Q: " + question + ", select none or several from {" + choices_text + "} \n" + context + "\nA:"

    examplar = examplar_prompts_cf[key]
    full_prompt = examplar + "\n\n" + prompt

    reply = openai.Completion.create(
        model="text-davinci-003",
        prompt=full_prompt,
        max_tokens=256,
        temperature=0,
    )
    first_choice = reply["choices"][0]
    oneshot_gen_cf_results[key] = first_choice["text"].strip()

    time.sleep(2)
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 243/243 [11:22<00:00,  2.81s/it]


In [22]:
# Report metrics for the one-shot predictions made with counterfactual exemplars.
evaluate(oneshot_gen_cf_results, dev_tense)

243it [00:00, 56723.95it/s]

Total 243 questions
Eval on the current eval positive class Micro F1 (Agg) is: 0.4957
Eval on the current eval positive class Macro F1 (Relaxed) is: 0.5686
Eval on the current eval positive class Macro F1 (Agg) is: 0.4980
Eval on the current eval exact match (Agg) ratio is: 0.0370
Eval on the current eval exact match ratio (Relaxed) is: 0.0453
Eval on 157 Clusters
Eval on the current eval clustered EM (Agg) is: 0.0382
Eval on the current eval clustered EM (Relaxed) is: 0.0573
Eval on the current eval clusrered F1 (max>=0.8) is: 0.2420





In [23]:
# Save the counterfactual exemplar prompts and the one-shot predictions made with them.
for cf_path, cf_payload in (
    ("data/tense_examplar_cf_gpt3.json", examplar_prompts_cf),
    ("data/tense_examplar_cf_gpt3_oneshot_pred.json", oneshot_gen_cf_results),
):
    with open(cf_path, 'w') as outfile:
        json.dump(cf_payload, outfile)