In [1]:
import os
import openai
import time
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Read the OpenAI key from the environment instead of hard-coding it;
# os.getenv returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

import json
import numpy as np

np.random.seed(0)

# Load the validation subset and its two bias variants. Each file is opened in a
# `with` block so the handle is closed as soon as the JSON has been parsed.
with open("../data/valid_subset_text.json") as valid_file:
    features_valid = json.load(valid_file)
with open("../dataset_bias/valid_subset_text_erp.json") as erp_file:
    features_valid_erp = json.load(erp_file)
with open("../dataset_bias/valid_subset_text_tense_bias_vague.json") as tense_file:
    features_valid_tense_all = json.load(tense_file)

# features_valid_tense_vague = [item for item in features_valid_tense_all if item['labels'][0] == 3]
# features_valid_dep = json.load(open("../dataset_bias/valid_text_features_matres_dep_bias.json"))

# In-context learning (ICL) setup: m-way, k-shot.

# Query template used by the prompts built below (placeholders: e1, e2, context).
"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer:"

In [23]:
# Label vocabulary: answer word -> class id, and class id -> answer word as
# written in the few-shot exemplars.
convert_dict = {'BEFORE':0, 'AFTER':1, 'EQUAL':2, 'VAGUE':3}
convert_dict_rev = {0:'before', 1:'after', 2:'equal', 3:'vague'}


def parse_result(ans):
    """Map a raw model answer to its class id.

    Matching is case-insensitive and ignores surrounding whitespace and
    punctuation, so a completion such as " Before." still resolves to BEFORE
    instead of silently falling back to VAGUE.

    Args:
        ans: The answer text returned by the model.

    Returns:
        The class id from ``convert_dict``; 3 (VAGUE) when ``ans`` is not a
        string or is not one of the four answer words.
    """
    if not isinstance(ans, str):
        return convert_dict['VAGUE']
    # Drop whitespace and common punctuation/quotes wrapped around the word.
    cleaned = ans.strip(" \t\r\n.,;:!?\"'")
    return convert_dict.get(cleaned.upper(), convert_dict['VAGUE'])


In [2]:
# Load the training features (handle closed by the `with` block), then bucket
# them by the first entry of `labels` for label ids 0-3.
with open("../data/train_text_features_matres.json") as train_file:
    train = json.load(train_file)
train_by_labels = [[item for item in train if item['labels'][0] == i] for i in range(4)]

In [4]:
# 1-shot, run 0: draw one training exemplar per label class (seed 0)
np.random.seed(0)
examplars_1_shot_0 = []
for label_id in range(4):
    examplars_1_shot_0.append(np.random.choice(train_by_labels[label_id], 1))

In [5]:
# Inspect the sampled 1-shot exemplars (one array per label class).
examplars_1_shot_0

[array([{'text': "[CLS] Jim Unruh, Unisys's president, said he is approaching next year with caution.[SEP] He said the strength of the world-wide economy is suspect, and doesn't see much revenue growth in the cards.[SEP] He also said that the price wars flaring up in parts of the computer industry will continue through next year.[SEP] He said the move toward standard operating systems means customers aren't locked into buying from their traditional computer supplier and can force prices down.[SEP]", 'e1': 'suspect', 'e2': 'flaring', 'labels': [0]}],
       dtype=object),
 array([{'text': '[CLS] The latest results include some unusual write-downs, which had an after-tax impact of $4.9 million.[SEP] Those included costs associated with the potential Valley Federal Savings and Loan Association acquisition, which was terminated on Sept. 27, 1989.[SEP] In addition, operating results were hit by an increase in loan and real estate loss reserves.[SEP]', 'e1': 'included', 'e2': 'terminated', '

In [8]:
# Render each sampled exemplar as a solved prompt (question followed by its gold
# answer word), then join the four prompts with blank lines into one prefix.
solved_prompts = []
for class_id in range(4):
    sample = examplars_1_shot_0[class_id][0]
    # Remove the tokenizer markers so the model sees plain text.
    context = sample['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    label = sample['labels'][0]
    e1, e2 = sample['e1'], sample['e2']
    solved_prompts.append(
        f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer: {convert_dict_rev[label]}"
    )
examplar_0 = "\n\n".join(solved_prompts)
    

In [18]:
# Inspect the assembled few-shot prefix.
examplar_0

'Determine the temporal order from "suspect" to "flaring" in the following sentence: "Jim Unruh, Unisys\'s president, said he is approaching next year with caution. He said the strength of the world-wide economy is suspect, and doesn\'t see much revenue growth in the cards. He also said that the price wars flaring up in parts of the computer industry will continue through next year. He said the move toward standard operating systems means customers aren\'t locked into buying from their traditional computer supplier and can force prices down.". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer: before\n\nDetermine the temporal order from "included" to "terminated" in the following sentence: "The latest results include some unusual write-downs, which had an after-tax impact of $4.9 million. Those included costs associated with the potential Valley Federal Savings and Loan Association acquisition, which was terminated on Sept. 27, 1989. In addition, operating results were hit 

In [10]:
# Save the few-shot prefix to disk.
os.makedirs("prompts", exist_ok=True)  # open(..., "w") does not create missing directories
with open("prompts/prompt_2_icl_examplar_1_shot_0.txt", "w") as writer:
    # examplar_0 is one string: write() emits it in a single call, whereas
    # writelines() would iterate over it character by character.
    writer.write(examplar_0)

In [11]:
# Query the model once per validation item, prefixing the run-0 1-shot exemplars.
oneshot_results_0 = []
MAX_ATTEMPTS = 30  # failures tolerated for a single request before giving up

for item in tqdm(features_valid):
    # Remove the tokenizer markers so the model sees plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer:"
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_0 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0
            )
        except Exception as err:
            # `Exception` (not a bare `except:`) keeps KeyboardInterrupt working,
            # and the attempt cap stops a permanent failure (e.g. a missing API
            # key) from retrying forever without any visible error.
            if attempt == MAX_ATTEMPTS:
                raise
            tqdm.write(f"Request failed ({err!r}); retry {attempt}/{MAX_ATTEMPTS} in 10s")
            time.sleep(10)
        else:
            oneshot_results_0.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [47:45<00:00,  2.87s/it]


In [22]:
from collections import Counter

# Tally the distinct raw answers returned by the model in run 0.
Counter(response['choices'][0]['text'].strip() for response in oneshot_results_0)

Counter({'before': 857,
         'after': 128,
         'equal': 7,
         'seeking': 1,
         'keeping': 1,
         'representing': 1,
         'been': 2,
         'followed': 2,
         'are': 1})

In [24]:
# Convert every raw completion of run 0 into a label id via parse_result.
oneshot_preds_0 = []
for response in oneshot_results_0:
    answer_text = response['choices'][0]['text'].strip()
    oneshot_preds_0.append(parse_result(answer_text))

In [25]:
# Gold label id of every validation item, in dataset order.
labels = [feature['labels'][0] for feature in features_valid]

# Report macro-F1 followed by micro-F1 for run 0.
macro_f1 = f1_score(labels, oneshot_preds_0, average='macro')
micro_f1 = f1_score(labels, oneshot_preds_0, average="micro")
print(macro_f1, micro_f1)

0.21993212254632177 0.499


In [34]:
# Save the run-0 predictions (a list of label ids) as JSON.
serialized_preds = json.dumps(oneshot_preds_0)
with open("results/template_2_oneshot_pred_0.json", "w") as writer:
    writer.write(serialized_preds)


### run-1

In [26]:
# 1-shot, run 1: draw one training exemplar per label class (seed 1)
np.random.seed(1)
examplars_1_shot_1 = [np.random.choice(train_by_labels[i], 1) for i in range(4)]

# Render each exemplar as a solved prompt (question followed by its gold answer).
solved_prompts = []
for i in range(4):
    item = examplars_1_shot_1[i][0]
    # Remove the tokenizer markers so the model sees plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    label = item['labels'][0]
    e1 = item['e1']
    e2 = item['e2']
    prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer: {convert_dict_rev[label]}"
    solved_prompts.append(prompt)
# Blank-line separated few-shot prefix.
examplar_1 = "\n\n".join(solved_prompts)

os.makedirs("prompts", exist_ok=True)  # open(..., "w") does not create missing directories
with open("prompts/prompt_2_icl_examplar_1_shot_1.txt", "w") as writer:
    # examplar_1 is one string: write() emits it in a single call, whereas
    # writelines() would iterate over it character by character.
    writer.write(examplar_1)

In [27]:
# Query the model once per validation item, prefixing the run-1 1-shot exemplars.
oneshot_results_1 = []
MAX_ATTEMPTS = 30  # failures tolerated for a single request before giving up

for item in tqdm(features_valid):
    # Remove the tokenizer markers so the model sees plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer:"
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_1 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0
            )
        except Exception as err:
            # `Exception` (not a bare `except:`) keeps KeyboardInterrupt working,
            # and the attempt cap stops a permanent failure (e.g. a missing API
            # key) from retrying forever without any visible error.
            if attempt == MAX_ATTEMPTS:
                raise
            tqdm.write(f"Request failed ({err!r}); retry {attempt}/{MAX_ATTEMPTS} in 10s")
            time.sleep(10)
        else:
            oneshot_results_1.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [48:08<00:00,  2.89s/it]


In [30]:
# Gold label ids and run-1 predictions, both in dataset order.
labels = [feature['labels'][0] for feature in features_valid]
oneshot_preds_1 = []
for response in oneshot_results_1:
    oneshot_preds_1.append(parse_result(response['choices'][0]['text'].strip()))

# Report macro-F1 followed by micro-F1 for run 1.
macro_f1 = f1_score(labels, oneshot_preds_1, average='macro')
micro_f1 = f1_score(labels, oneshot_preds_1, average="micro")
print(macro_f1, micro_f1)

0.22143296136408955 0.493


In [35]:
# Save the run-1 predictions (a list of label ids) as JSON.
serialized_preds = json.dumps(oneshot_preds_1)
with open("results/template_2_oneshot_pred_1.json", "w") as writer:
    writer.write(serialized_preds)


### run-2

In [31]:
# 1-shot, run 2: draw one training exemplar per label class (seed 2)
np.random.seed(2)
examplars_1_shot_2 = [np.random.choice(train_by_labels[i], 1) for i in range(4)]

# Render each exemplar as a solved prompt (question followed by its gold answer).
solved_prompts = []
for i in range(4):
    item = examplars_1_shot_2[i][0]
    # Remove the tokenizer markers so the model sees plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    label = item['labels'][0]
    e1 = item['e1']
    e2 = item['e2']
    prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer: {convert_dict_rev[label]}"
    solved_prompts.append(prompt)
# Blank-line separated few-shot prefix.
examplar_2 = "\n\n".join(solved_prompts)

os.makedirs("prompts", exist_ok=True)  # open(..., "w") does not create missing directories
with open("prompts/prompt_2_icl_examplar_1_shot_2.txt", "w") as writer:
    # examplar_2 is one string: write() emits it in a single call, whereas
    # writelines() would iterate over it character by character.
    writer.write(examplar_2)

In [32]:
# Query the model once per validation item, prefixing the run-2 1-shot exemplars.
oneshot_results_2 = []
MAX_ATTEMPTS = 30  # failures tolerated for a single request before giving up

for item in tqdm(features_valid):
    # Remove the tokenizer markers so the model sees plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer:"
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_2 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0
            )
        except Exception as err:
            # `Exception` (not a bare `except:`) keeps KeyboardInterrupt working,
            # and the attempt cap stops a permanent failure (e.g. a missing API
            # key) from retrying forever without any visible error.
            if attempt == MAX_ATTEMPTS:
                raise
            tqdm.write(f"Request failed ({err!r}); retry {attempt}/{MAX_ATTEMPTS} in 10s")
            time.sleep(10)
        else:
            oneshot_results_2.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [49:58<00:00,  3.00s/it]


In [33]:
# Gold label ids and run-2 predictions, both in dataset order.
labels = [feature['labels'][0] for feature in features_valid]
oneshot_preds_2 = []
for response in oneshot_results_2:
    oneshot_preds_2.append(parse_result(response['choices'][0]['text'].strip()))

# Report macro-F1 followed by micro-F1 for run 2.
macro_f1 = f1_score(labels, oneshot_preds_2, average='macro')
micro_f1 = f1_score(labels, oneshot_preds_2, average="micro")
print(macro_f1, micro_f1)

0.19800593031875463 0.501


In [None]:
# Save the run-2 predictions (a list of label ids) as JSON.
serialized_preds = json.dumps(oneshot_preds_2)
with open("results/template_2_oneshot_pred_2.json", "w") as writer:
    writer.write(serialized_preds)


# 3-shot

### run-0

In [28]:
# 3-shot, run 0: draw three training exemplars per label class (seed 0)
np.random.seed(0)
# NOTE(review): np.random.choice samples with replacement by default, so a class
# may contribute the same exemplar more than once. Kept as-is so the seeded draw
# is unchanged.
examplars_3_shot_0 = [np.random.choice(train_by_labels[i], 3) for i in range(4)]

# Render each exemplar as a solved prompt, grouped by label class.
solved_prompts = []
for i in range(4):
    for j in range(3):
        item = examplars_3_shot_0[i][j]
        # Remove the tokenizer markers so the model sees plain text.
        context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
        label = item['labels'][0]
        e1 = item['e1']
        e2 = item['e2']
        prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer: {convert_dict_rev[label]}"
        solved_prompts.append(prompt)

# Blank-line separated few-shot prefix.
examplar_3_0 = "\n\n".join(solved_prompts)

os.makedirs("prompts", exist_ok=True)  # open(..., "w") does not create missing directories
with open("prompts/prompt_2_icl_examplar_3_shot_0.txt", "w") as writer:
    # examplar_3_0 is one string: write() emits it in a single call, whereas
    # writelines() would iterate over it character by character.
    writer.write(examplar_3_0)
    

In [None]:
# Query the model on the fixed evaluation subset, prefixing the run-0 3-shot exemplars.
threeshot_results_0 = []
MAX_ATTEMPTS = 30  # failures tolerated for a single request before giving up

# The index file was presumably created once with
#   np.save("subsubset_idx_200", np.random.permutation(len(features_valid))[:200])
# and is reloaded here so every run scores the same items.
# NOTE(review): allow_pickle=True lets np.load execute pickled code from the file;
# a plain integer index array should not need it - confirm and drop.
selected_subsubset = np.load("subsubset_idx_200.npy", allow_pickle=True)

for i in tqdm(selected_subsubset):

    item = features_valid[i]
    # Remove the tokenizer markers so the model sees plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer:"
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_3_0 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0
            )
        except Exception as err:
            # `Exception` (not a bare `except:`) keeps KeyboardInterrupt working,
            # and the attempt cap stops a permanent failure (e.g. a missing API
            # key) from retrying forever without any visible error.
            if attempt == MAX_ATTEMPTS:
                raise
            tqdm.write(f"Request failed ({err!r}); retry {attempt}/{MAX_ATTEMPTS} in 10s")
            time.sleep(10)
        else:
            threeshot_results_0.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)

In [None]:

# Gold label ids of the evaluated subset and the run-0 3-shot predictions.
labels_selected = [features_valid[idx]['labels'][0] for idx in selected_subsubset]
threeshot_preds = []
for response in threeshot_results_0:
    threeshot_preds.append(parse_result(response['choices'][0]['text'].strip()))

# Report macro-F1 followed by micro-F1.
macro_f1 = f1_score(labels_selected, threeshot_preds, average='macro')
micro_f1 = f1_score(labels_selected, threeshot_preds, average="micro")
print(macro_f1, micro_f1)

# Save the predictions (a list of label ids) as JSON.
serialized_preds = json.dumps(threeshot_preds)
with open("results/template_2_threeshot_subsets_pred_0.json", "w") as writer:
    writer.write(serialized_preds)

In [None]:
def majority_vote(l):
    """Return the most frequent element of ``l``.

    When several elements share the highest count, the one that appears first
    in ``l`` is returned. An empty ``l`` raises ``IndexError``.
    """
    (winner, _count), = Counter(l).most_common(1)[:1] or [[][0:0]][0:0] or [()][0:0] or Counter(l).most_common(1)[0:0] or [Counter(l).most_common(1)[0]]
    return winner

In [17]:
from collections import Counter


1

### run-1

In [29]:
# 3-shot, run 1: draw three training exemplars per label class (seed 1)
np.random.seed(1)
# NOTE(review): np.random.choice samples with replacement by default, so a class
# may contribute the same exemplar more than once. Kept as-is so the seeded draw
# is unchanged.
examplars_3_shot_1 = [np.random.choice(train_by_labels[i], 3) for i in range(4)]

# Render each exemplar as a solved prompt, grouped by label class.
solved_prompts = []
for i in range(4):
    for j in range(3):
        item = examplars_3_shot_1[i][j]
        # Remove the tokenizer markers so the model sees plain text.
        context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
        label = item['labels'][0]
        e1 = item['e1']
        e2 = item['e2']
        prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer: {convert_dict_rev[label]}"
        solved_prompts.append(prompt)

# Blank-line separated few-shot prefix.
examplar_3_1 = "\n\n".join(solved_prompts)

os.makedirs("prompts", exist_ok=True)  # open(..., "w") does not create missing directories
with open("prompts/prompt_2_icl_examplar_3_shot_1.txt", "w") as writer:
    # examplar_3_1 is one string: write() emits it in a single call, whereas
    # writelines() would iterate over it character by character.
    writer.write(examplar_3_1)
    

In [None]:
# Query the model on the fixed evaluation subset, prefixing the run-1 3-shot exemplars.
threeshot_results_1 = []
MAX_ATTEMPTS = 30  # failures tolerated for a single request before giving up

# The index file was presumably created once with
#   np.save("subsubset_idx_200", np.random.permutation(len(features_valid))[:200])
# and is reloaded here so every run scores the same items.
# NOTE(review): allow_pickle=True lets np.load execute pickled code from the file;
# a plain integer index array should not need it - confirm and drop.
selected_subsubset = np.load("subsubset_idx_200.npy", allow_pickle=True)

for i in tqdm(selected_subsubset):

    item = features_valid[i]
    # Remove the tokenizer markers so the model sees plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer:"
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_3_1 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0
            )
        except Exception as err:
            # `Exception` (not a bare `except:`) keeps KeyboardInterrupt working,
            # and the attempt cap stops a permanent failure (e.g. a missing API
            # key) from retrying forever without any visible error.
            if attempt == MAX_ATTEMPTS:
                raise
            tqdm.write(f"Request failed ({err!r}); retry {attempt}/{MAX_ATTEMPTS} in 10s")
            time.sleep(10)
        else:
            threeshot_results_1.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)

In [None]:

# Gold label ids of the evaluated subset and the run-1 3-shot predictions.
labels_selected = [features_valid[idx]['labels'][0] for idx in selected_subsubset]
threeshot_preds_1 = []
for response in threeshot_results_1:
    threeshot_preds_1.append(parse_result(response['choices'][0]['text'].strip()))

# Report macro-F1 followed by micro-F1.
macro_f1 = f1_score(labels_selected, threeshot_preds_1, average='macro')
micro_f1 = f1_score(labels_selected, threeshot_preds_1, average="micro")
print(macro_f1, micro_f1)

# Save the predictions (a list of label ids) as JSON.
serialized_preds = json.dumps(threeshot_preds_1)
with open("results/template_2_threeshot_subsets_pred_1.json", "w") as writer:
    writer.write(serialized_preds)

### run-2

In [None]:
# 3-shot, run 2: draw three training exemplars per label class (seed 2)
np.random.seed(2)
# NOTE(review): np.random.choice samples with replacement by default, so a class
# may contribute the same exemplar more than once. Kept as-is so the seeded draw
# is unchanged.
examplars_3_shot_2 = [np.random.choice(train_by_labels[i], 3) for i in range(4)]

# Render each exemplar as a solved prompt, grouped by label class.
solved_prompts = []
for i in range(4):
    for j in range(3):
        item = examplars_3_shot_2[i][j]
        # Remove the tokenizer markers so the model sees plain text.
        context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
        label = item['labels'][0]
        e1 = item['e1']
        e2 = item['e2']
        prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer: {convert_dict_rev[label]}"
        solved_prompts.append(prompt)

# Blank-line separated few-shot prefix.
examplar_3_2 = "\n\n".join(solved_prompts)

os.makedirs("prompts", exist_ok=True)  # open(..., "w") does not create missing directories
with open("prompts/prompt_2_icl_examplar_3_shot_2.txt", "w") as writer:
    # examplar_3_2 is one string: write() emits it in a single call, whereas
    # writelines() would iterate over it character by character.
    writer.write(examplar_3_2)
    

In [None]:
# Query the model on the fixed evaluation subset, prefixing the run-2 3-shot exemplars.
threeshot_results_2 = []
MAX_ATTEMPTS = 30  # failures tolerated for a single request before giving up

# The index file was presumably created once with
#   np.save("subsubset_idx_200", np.random.permutation(len(features_valid))[:200])
# and is reloaded here so every run scores the same items.
# NOTE(review): allow_pickle=True lets np.load execute pickled code from the file;
# a plain integer index array should not need it - confirm and drop.
selected_subsubset = np.load("subsubset_idx_200.npy", allow_pickle=True)

for i in tqdm(selected_subsubset):

    item = features_valid[i]
    # Remove the tokenizer markers so the model sees plain text.
    context = item['text'].replace("[CLS]", "").replace("[SEP]", "").strip()
    e1 = item['e1']
    e2 = item['e2']

    prompt = f"Determine the temporal order from \"{e1}\" to \"{e2}\" in the following sentence: \"{context}\". Only answer one word from AFTER, BEFORE, EQUAL, VAGUE. Answer:"
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=examplar_3_2 + "\n\n" + prompt,
                max_tokens=20,
                temperature=0
            )
        except Exception as err:
            # `Exception` (not a bare `except:`) keeps KeyboardInterrupt working,
            # and the attempt cap stops a permanent failure (e.g. a missing API
            # key) from retrying forever without any visible error.
            if attempt == MAX_ATTEMPTS:
                raise
            tqdm.write(f"Request failed ({err!r}); retry {attempt}/{MAX_ATTEMPTS} in 10s")
            time.sleep(10)
        else:
            threeshot_results_2.append(response)
            break
    # Fixed pause between successful requests.
    time.sleep(2)

In [None]:

# Gold label ids of the evaluated subset and the run-2 3-shot predictions.
labels_selected = [features_valid[idx]['labels'][0] for idx in selected_subsubset]
threeshot_preds_2 = []
for response in threeshot_results_2:
    threeshot_preds_2.append(parse_result(response['choices'][0]['text'].strip()))

# Report macro-F1 followed by micro-F1.
macro_f1 = f1_score(labels_selected, threeshot_preds_2, average='macro')
micro_f1 = f1_score(labels_selected, threeshot_preds_2, average="micro")
print(macro_f1, micro_f1)

# Save the predictions (a list of label ids) as JSON.
serialized_preds = json.dumps(threeshot_preds_2)
with open("results/template_2_threeshot_subsets_pred_2.json", "w") as writer:
    writer.write(serialized_preds)