In [1]:
import os
import openai
import time
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Take the API key from the environment instead of hard-coding it in the notebook;
# os.getenv returns None when OPENAI_API_KEY is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

import json
import numpy as np

# Seed NumPy's global RNG before anything in this notebook draws from it.
np.random.seed(0)


def _load_json(path):
    """Parse one JSON file and close the handle as soon as it has been read."""
    with open(path) as reader:
        return json.load(reader)


# Validation subset, plus two variants of it stored under ../dataset_bias.
features_valid = _load_json("../data/valid_subset_text.json")
features_valid_erp = _load_json("../dataset_bias/valid_subset_text_erp.json")
features_valid_tense_all = _load_json("../dataset_bias/valid_subset_text_tense_bias_vague.json")

# features_valid_tense_vague = [item for item in features_valid_tense_all if item['labels'][0] == 3]
# features_valid_dep = json.load(open("../dataset_bias/valid_text_features_matres_dep_bias.json"))

# ICL, m way, k-shot

# Bare string expression: it is not assigned to any name and none of the visible
# cells reference it. The prompts actually sent are the f-strings built further down.
"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer:"

In [2]:
# Training examples; the few-shot exemplars are sampled from these.
# A context manager closes the file once it has been parsed.
with open("../data/train_text_features_matres.json") as reader:
    train = json.load(reader)

In [4]:
# Bucket the training examples by their first label id (0..3);
# train_by_labels[k] holds every example whose label is k, in dataset order.
train_by_labels = []
for label_id in range(4):
    train_by_labels.append([example for example in train if example['labels'][0] == label_id])

In [5]:
# 1-shot, seed 0: draw one training example for each of the four labels.
# Each entry is a length-1 array, so the example itself is entry[0].
np.random.seed(0)
examplars_1_shot_0 = []
for label_id in range(4):
    examplars_1_shot_0.append(np.random.choice(train_by_labels[label_id], 1))

In [6]:
# Display the sampled exemplars (one length-1 array per label) for inspection.
examplars_1_shot_0

[array([{'text': "[CLS] Jim Unruh, Unisys's president, said he is approaching next year with caution.[SEP] He said the strength of the world-wide economy is suspect, and doesn't see much revenue growth in the cards.[SEP] He also said that the price wars flaring up in parts of the computer industry will continue through next year.[SEP] He said the move toward standard operating systems means customers aren't locked into buying from their traditional computer supplier and can force prices down.[SEP]", 'e1': 'suspect', 'e2': 'flaring', 'labels': [0]}],
       dtype=object),
 array([{'text': '[CLS] The latest results include some unusual write-downs, which had an after-tax impact of $4.9 million.[SEP] Those included costs associated with the potential Valley Federal Savings and Loan Association acquisition, which was terminated on Sept. 27, 1989.[SEP] In addition, operating results were hit by an increase in loan and real estate loss reserves.[SEP]', 'e1': 'included', 'e2': 'terminated', '

In [30]:
# Render the seed-0 exemplars (one per label) into a single few-shot prefix.
#
# The label-id -> answer-word mapping is needed here. A later cell assigns the
# same dict, but this cell comes first in file order, so it is defined up front
# to avoid a NameError on Restart & Run All.
convert_dict_rev = {0:'before', 1:'after', 2:'equal', 3:'vague'}

# Keep the per-exemplar strings in their own list so `examplar_0` is only ever a str.
examplar_0_parts = []
for sampled in examplars_1_shot_0:
    item = sampled[0]
    # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    label = item['labels'][0]
    e1 = item['e1']
    e2 = item['e2']
    # Same wording as the query prompt, followed by the gold answer word.
    prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise. {convert_dict_rev[label]}"
    examplar_0_parts.append(prompt)

# Exemplars are separated by a blank line.
examplar_0 = "\n\n".join(examplar_0_parts)
    

In [33]:
# Display the rendered few-shot prefix to check its wording and separators.
examplar_0

'Given the document Jim Unruh, Unisys\'s president, said he is approaching next year with caution. He said the strength of the world-wide economy is suspect, and doesn\'t see much revenue growth in the cards. He also said that the price wars flaring up in parts of the computer industry will continue through next year. He said the move toward standard operating systems means customers aren\'t locked into buying from their traditional computer supplier and can force prices down. and a list of temporal relations [before, after, vague, equal] and event triggers suspect and flaring. what is the temporal relation between suspect and flaring? Answer vague if unsure. Keep the answer short and concise. before\n\nGiven the document The latest results include some unusual write-downs, which had an after-tax impact of $4.9 million. Those included costs associated with the potential Valley Federal Savings and Loan Association acquisition, which was terminated on Sept. 27, 1989. In addition, operati

In [32]:
# Save the seed-0 prompt prefix next to the other prompt files.
# examplar_0 is one string, so write() is the right call; writelines() would
# iterate it character by character (same bytes, but misleading and slower).
with open("prompts/prompt_3_icl_examplar_1_shot_0.txt", "w") as writer:
    writer.write(examplar_0)

In [34]:
# Query text-davinci-003 once per validation example, with the seed-0 exemplars
# prepended. Raw API responses are collected in dataset order.
oneshot_results_0 = []
for item in tqdm(features_valid):
    # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise"

    # Retry until the request succeeds. Catching Exception (not a bare except)
    # keeps Ctrl-C / kernel interrupt working, and each failure is reported so a
    # permanent error (bad key, invalid request) is visible instead of a silent hang.
    while True:
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_0 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0,
            )
        except Exception as err:
            tqdm.write(f"Request failed ({type(err).__name__}: {err}); retrying in 10s")
            time.sleep(10)
        else:
            oneshot_results_0.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [50:44<00:00,  3.04s/it]


In [29]:
# Answer word <-> integer label id.
convert_dict = {'BEFORE':0, 'AFTER':1, 'EQUAL':2, 'VAGUE':3}
convert_dict_rev = {0:'before', 1:'after', 2:'equal', 3:'vague'}


def parse_result(ans):
    """Map a model answer such as ``" Before."`` to its integer label id.

    Leading/trailing whitespace, quotes and sentence punctuation are ignored and
    the comparison is case-insensitive, so callers do not have to pre-clean the
    completion text.

    Args:
        ans: the answer text returned by the model.

    Returns:
        The integer id stored in ``convert_dict`` for the answer word.

    Raises:
        KeyError: if the cleaned answer is not a key of ``convert_dict``; the
            message contains the original text to make the failure diagnosable.
    """
    cleaned = ans.strip(" \t\r\n.,;:!?\"'").upper()
    if cleaned not in convert_dict:
        raise KeyError(f"Unrecognized temporal relation answer: {ans!r}")
    return convert_dict[cleaned]


In [38]:
# Pull the completion text out of every response, drop periods and surrounding
# whitespace, then convert each answer word to its label id.
oneshot_answers_0 = [resp['choices'][0]['text'].replace('.', '').strip() for resp in oneshot_results_0]
oneshot_preds_0 = list(map(parse_result, oneshot_answers_0))

In [39]:
# Gold label ids for the validation subset, in dataset order.
labels = [example['labels'][0] for example in features_valid]

# Macro- and micro-averaged F1 of the seed-0 one-shot predictions.
macro_f1 = f1_score(labels, oneshot_preds_0, average='macro')
micro_f1 = f1_score(labels, oneshot_preds_0, average="micro")
print(macro_f1, micro_f1)

0.18626754503181908 0.327


In [40]:
# Persist the seed-0 predictions (a JSON list of label ids).
with open("results/template_3_oneshot_pred_0.json", "w") as out_file:
    out_file.write(json.dumps(oneshot_preds_0))


## run-1

In [51]:
# 1-shot, seed 1: draw one training example per label and render the prompt prefix.
np.random.seed(1)
examplars_1_shot_1 = [np.random.choice(train_by_labels[i], 1) for i in range(4)]

# Keep the per-exemplar strings in their own list so `examplar_1` is only ever a str.
examplar_1_parts = []
for sampled in examplars_1_shot_1:
    item = sampled[0]
    # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    label = item['labels'][0]
    e1 = item['e1']
    e2 = item['e2']
    # Same wording as the query prompt, followed by the gold answer word.
    prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise. {convert_dict_rev[label]}"
    examplar_1_parts.append(prompt)
examplar_1 = "\n\n".join(examplar_1_parts)

# examplar_1 is one string, so write() is the right call; writelines() would
# iterate it character by character.
with open("prompts/prompt_3_icl_examplar_1_shot_1.txt", "w") as writer:
    writer.write(examplar_1)
    

In [52]:
# Query text-davinci-003 once per validation example, with the seed-1 exemplars
# prepended. Raw API responses are collected in dataset order.
oneshot_results_1 = []
for item in tqdm(features_valid):
    # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise"

    # Retry until the request succeeds. Catching Exception (not a bare except)
    # keeps Ctrl-C / kernel interrupt working, and each failure is reported so a
    # permanent error (bad key, invalid request) is visible instead of a silent hang.
    while True:
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_1 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0,
            )
        except Exception as err:
            tqdm.write(f"Request failed ({type(err).__name__}: {err}); retrying in 10s")
            time.sleep(10)
        else:
            oneshot_results_1.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [51:52<00:00,  3.11s/it]


In [53]:
# Clean each completion (drop periods, trim whitespace) and map it to a label id.
oneshot_answers_1 = [resp['choices'][0]['text'].replace('.', '').strip() for resp in oneshot_results_1]
oneshot_preds_1 = list(map(parse_result, oneshot_answers_1))

# Macro- and micro-averaged F1 of the seed-1 one-shot predictions.
macro_f1 = f1_score(labels, oneshot_preds_1, average='macro')
micro_f1 = f1_score(labels, oneshot_preds_1, average="micro")
print(macro_f1, micro_f1)

0.20682353870120082 0.344


In [54]:
# Persist the seed-1 predictions (a JSON list of label ids).
with open("results/template_3_oneshot_pred_1.json", "w") as out_file:
    out_file.write(json.dumps(oneshot_preds_1))


## run-2

In [47]:
# 1-shot, seed 2: draw one training example per label and render the prompt prefix.
np.random.seed(2)
examplars_1_shot_2 = [np.random.choice(train_by_labels[i], 1) for i in range(4)]

# Keep the per-exemplar strings in their own list so `examplar_2` is only ever a str.
examplar_2_parts = []
for sampled in examplars_1_shot_2:
    item = sampled[0]
    # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    label = item['labels'][0]
    e1 = item['e1']
    e2 = item['e2']
    # Same wording as the query prompt, followed by the gold answer word.
    prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise. {convert_dict_rev[label]}"
    examplar_2_parts.append(prompt)
examplar_2 = "\n\n".join(examplar_2_parts)

# examplar_2 is one string, so write() is the right call; writelines() would
# iterate it character by character.
with open("prompts/prompt_3_icl_examplar_1_shot_2.txt", "w") as writer:
    writer.write(examplar_2)
    

In [48]:
# Query text-davinci-003 once per validation example, with the seed-2 exemplars
# prepended. Raw API responses are collected in dataset order.
oneshot_results_2 = []
for item in tqdm(features_valid):
    # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise"

    # Retry until the request succeeds. Catching Exception (not a bare except)
    # keeps Ctrl-C / kernel interrupt working, and each failure is reported so a
    # permanent error (bad key, invalid request) is visible instead of a silent hang.
    while True:
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_2 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0,
            )
        except Exception as err:
            tqdm.write(f"Request failed ({type(err).__name__}: {err}); retrying in 10s")
            time.sleep(10)
        else:
            oneshot_results_2.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [52:27<00:00,  3.15s/it]


In [49]:
# Clean each completion (drop periods, trim whitespace) and map it to a label id.
oneshot_answers_2 = [resp['choices'][0]['text'].replace('.', '').strip() for resp in oneshot_results_2]
oneshot_preds_2 = list(map(parse_result, oneshot_answers_2))

# Macro- and micro-averaged F1 of the seed-2 one-shot predictions.
macro_f1 = f1_score(labels, oneshot_preds_2, average='macro')
micro_f1 = f1_score(labels, oneshot_preds_2, average="micro")
print(macro_f1, micro_f1)

0.1780953431113662 0.288


In [50]:
# Persist the seed-2 predictions (a JSON list of label ids).
with open("results/template_3_oneshot_pred_2.json", "w") as out_file:
    out_file.write(json.dumps(oneshot_preds_2))


# 3-shot

run 0

In [None]:
# 3-shot, seed 1: draw three distinct training examples per label.
# replace=False prevents the same example from appearing twice among a label's
# exemplars (np.random.choice samples with replacement by default).
# NOTE(review): this changes which examples a given seed selects compared with
# the previous with-replacement draw.
np.random.seed(1)
examplars_3_shot_0 = [np.random.choice(train_by_labels[i], 3, replace=False) for i in range(4)]

# Keep the per-exemplar strings in their own list so `examplar_3_0` is only ever a str.
examplar_3_0_parts = []
for per_label in examplars_3_shot_0:
    for item in per_label:
        # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
        context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
        label = item['labels'][0]
        e1 = item['e1']
        e2 = item['e2']
        # Same wording as the query prompt, followed by the gold answer word.
        prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise. {convert_dict_rev[label]}"
        examplar_3_0_parts.append(prompt)

# Exemplars are grouped by label (three per label), separated by blank lines.
examplar_3_0 = "\n\n".join(examplar_3_0_parts)
    

In [None]:
# Save the 3-shot run-0 prompt prefix.
# examplar_3_0 is one string, so write() is the right call; writelines() would
# iterate it character by character.
with open("prompts/chan_prompt_icl_examplar_3_shot_0.txt", "w") as writer:
    writer.write(examplar_3_0)

In [None]:
# Query text-davinci-003 on a saved subset of the validation data, with the
# 3-shot run-0 exemplars prepended.
threeshot_results_0 = []

# The subset indices were drawn once and saved, so each 3-shot run loads the same file:
# selected_subsubset = np.random.permutation(len(features_valid))[:200]
# np.save("subsubset_idx_200", selected_subsubset)
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only
# load an index file you created yourself.
selected_subsubset = np.load("subsubset_idx_200.npy", allow_pickle=True)

for idx in tqdm(selected_subsubset):
    item = features_valid[idx]
    # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise"

    # Retry until the request succeeds. Catching Exception (not a bare except)
    # keeps Ctrl-C / kernel interrupt working, and each failure is reported so a
    # permanent error (bad key, invalid request) is visible instead of a silent hang.
    while True:
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_3_0 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0,
            )
        except Exception as err:
            tqdm.write(f"Request failed ({type(err).__name__}: {err}); retrying in 10s")
            time.sleep(10)
        else:
            threeshot_results_0.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)


In [None]:

# Gold labels for the selected subset, in the order the requests were issued.
labels_selected = [features_valid[i]['labels'][0] for i in selected_subsubset]
# Drop periods before parsing, as the other prediction cells do; without this an
# answer such as "before." is not a key of convert_dict and parse_result fails.
threeshot_preds = [parse_result(item['choices'][0]['text'].replace('.', '').strip()) for item in threeshot_results_0]

# Macro- and micro-averaged F1 of the 3-shot run-0 predictions.
print(f1_score(labels_selected, threeshot_preds, average='macro'), f1_score(labels_selected, threeshot_preds, average="micro"))



run 1

In [35]:
# 3-shot, seed 2: draw three distinct training examples per label.
# replace=False prevents the same example from appearing twice among a label's
# exemplars (np.random.choice samples with replacement by default).
# NOTE(review): this changes which examples a given seed selects compared with
# the previous with-replacement draw, so a previously saved prompt file may differ.
np.random.seed(2)
examplars_3_shot_1 = [np.random.choice(train_by_labels[i], 3, replace=False) for i in range(4)]

# Keep the per-exemplar strings in their own list so `examplar_3_1` is only ever a str.
examplar_3_1_parts = []
for per_label in examplars_3_shot_1:
    for item in per_label:
        # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
        context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
        label = item['labels'][0]
        e1 = item['e1']
        e2 = item['e2']
        # Same wording as the query prompt, followed by the gold answer word.
        prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise. {convert_dict_rev[label]}"
        examplar_3_1_parts.append(prompt)

# Exemplars are grouped by label (three per label), separated by blank lines.
examplar_3_1 = "\n\n".join(examplar_3_1_parts)

# examplar_3_1 is one string, so write() is the right call; writelines() would
# iterate it character by character.
with open("prompts/prompt_3_icl_examplar_3_shot_1.txt_saved", "w") as writer:
    writer.write(examplar_3_1)
    

In [None]:
# Query text-davinci-003 on a saved subset of the validation data, with the
# 3-shot run-1 exemplars prepended.
threeshot_results_1 = []

# The subset indices were drawn once and saved, so each 3-shot run loads the same file:
# selected_subsubset = np.random.permutation(len(features_valid))[:200]
# np.save("subsubset_idx_200", selected_subsubset)
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only
# load an index file you created yourself.
selected_subsubset = np.load("subsubset_idx_200.npy", allow_pickle=True)

for idx in tqdm(selected_subsubset):
    item = features_valid[idx]
    # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise"

    # Retry until the request succeeds. Catching Exception (not a bare except)
    # keeps Ctrl-C / kernel interrupt working, and each failure is reported so a
    # permanent error (bad key, invalid request) is visible instead of a silent hang.
    while True:
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_3_1 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0,
            )
        except Exception as err:
            tqdm.write(f"Request failed ({type(err).__name__}: {err}); retrying in 10s")
            time.sleep(10)
        else:
            threeshot_results_1.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)


In [None]:

# labels_selected is not rebuilt here; it is the list computed in the 3-shot
# run-0 scoring cell:
# labels_selected = [features_valid[i]['labels'][0] for i in selected_subsubset]

# Clean each completion (drop periods, trim whitespace) and map it to a label id.
threeshot_answers_1 = [resp['choices'][0]['text'].replace('.', '').strip() for resp in threeshot_results_1]
threeshot_pred_1 = list(map(parse_result, threeshot_answers_1))

# Macro- and micro-averaged F1 of the 3-shot run-1 predictions.
macro_f1 = f1_score(labels_selected, threeshot_pred_1, average='macro')
micro_f1 = f1_score(labels_selected, threeshot_pred_1, average="micro")
print(macro_f1, micro_f1)



run 2

In [None]:
# 3-shot, seed 3: draw three distinct training examples per label.
# replace=False prevents the same example from appearing twice among a label's
# exemplars (np.random.choice samples with replacement by default).
# NOTE(review): this changes which examples a given seed selects compared with
# the previous with-replacement draw.
np.random.seed(3)
examplars_3_shot_2 = [np.random.choice(train_by_labels[i], 3, replace=False) for i in range(4)]

# Keep the per-exemplar strings in their own list so `examplar_3_2` is only ever a str.
examplar_3_2_parts = []
for per_label in examplars_3_shot_2:
    for item in per_label:
        # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
        context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
        label = item['labels'][0]
        e1 = item['e1']
        e2 = item['e2']
        # Same wording as the query prompt, followed by the gold answer word.
        prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise. {convert_dict_rev[label]}"
        examplar_3_2_parts.append(prompt)

# Exemplars are grouped by label (three per label), separated by blank lines.
examplar_3_2 = "\n\n".join(examplar_3_2_parts)

# examplar_3_2 is one string, so write() is the right call; writelines() would
# iterate it character by character.
with open("prompts/prompt_3_icl_examplar_3_shot_2.txt_saved", "w") as writer:
    writer.write(examplar_3_2)
    

In [None]:
# Query text-davinci-003 on a saved subset of the validation data, with the
# 3-shot run-2 exemplars prepended.
threeshot_results_2 = []

# The subset indices were drawn once and saved, so each 3-shot run loads the same file:
# selected_subsubset = np.random.permutation(len(features_valid))[:200]
# np.save("subsubset_idx_200", selected_subsubset)
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only
# load an index file you created yourself.
selected_subsubset = np.load("subsubset_idx_200.npy", allow_pickle=True)

for idx in tqdm(selected_subsubset):
    item = features_valid[idx]
    # Drop the [CLS]/[SEP] marker tokens so the prompt contains plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Given the document {context} and a list of temporal relations [before, after, vague, equal] and event triggers {e1} and {e2}. what is the temporal relation between {e1} and {e2}? Answer vague if unsure. Keep the answer short and concise"

    # Retry until the request succeeds. Catching Exception (not a bare except)
    # keeps Ctrl-C / kernel interrupt working, and each failure is reported so a
    # permanent error (bad key, invalid request) is visible instead of a silent hang.
    while True:
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_3_2 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0,
            )
        except Exception as err:
            tqdm.write(f"Request failed ({type(err).__name__}: {err}); retrying in 10s")
            time.sleep(10)
        else:
            threeshot_results_2.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)


In [None]:

# labels_selected is not rebuilt here; it is the list computed in the 3-shot
# run-0 scoring cell:
# labels_selected = [features_valid[i]['labels'][0] for i in selected_subsubset]

# Clean each completion (drop periods, trim whitespace) and map it to a label id.
threeshot_answers_2 = [resp['choices'][0]['text'].replace('.', '').strip() for resp in threeshot_results_2]
threeshot_pred_2 = list(map(parse_result, threeshot_answers_2))

# Macro- and micro-averaged F1 of the 3-shot run-2 predictions.
macro_f1 = f1_score(labels_selected, threeshot_pred_2, average='macro')
micro_f1 = f1_score(labels_selected, threeshot_pred_2, average="micro")
print(macro_f1, micro_f1)

