- Explicit-Explicit Discovery: predict Arg2 sent-initial connective
- Explicit-Explicit PDTB: predict subsequent connective
- Explicit-Implicit: predict implicit 

In [85]:
import openai
import pandas as pd
import os
from tqdm import tqdm
from IPython.display import display, HTML

In [46]:
introduce_marker_symbol = '''\n  - Take note of the existing discourse marker enclosed by the character [ and ]'''

def prepare_system_prompt(type):
    prompt_file = ""
    if type == "free-insert":
        prompt_file = "free-insert_system_prompt.txt"
    elif type == "free-insert-co-occur":
        prompt_file = "free-insert-coo_system_prompt.txt"

    elif type == "mask-fill" or type == "mask-fill-with-marker-symbol":
        prompt_file = "mask-fill_system_prompt.txt"

    res = ""
    with open(prompt_file) as f:
        res = f.read()
    if "with-marker-symbol" in type:
        res = res.replace("<introduce_marker_symbol>", introduce_marker_symbol).replace("are curly and ___ are more", "are curly [and] ___ are more")
    else:
        res = res.replace("<introduce_marker_symbol>", "")
    return res

def prepare_user_prompt(arg1, conn1, arg2, type):
    res = ""
    with open("user_prompt.txt") as f:
        res = f.read()

    should_capitalize = arg1.rstrip().endswith(('.', '!', '?'))
    conn1 = conn1.capitalize() if should_capitalize else conn1

    if type == "free-insert" or type == "free-insert-co-occur":
        res = res.replace("<passage>", arg1 + " [" + conn1 + "] " + arg2)
    elif type == "mask-fill-with-marker-symbol":
        res = res.replace("<passage>", arg1 + " [" + conn1 + "] ___ " + arg2)
    elif type == "mask-fill":
        res = res.replace("<passage>", arg1 + " " + conn1 + " ___ " + arg2)
    return res


In [None]:
## CREATE PROMPT SAMPLE

# arg1 = '''Jacobs may have gotten a temporary reprieve in the eyes of many rank-and-file fans with the hiring of Coach Malzahn, who has made all the right moves so far.'''
# conn1 = "ultimately,"
# arg2 = '''jacobs is the problem - or, at least, the public face of the problem.'''

# for type in ["mask-fill", "mask-fill-with-marker-symbol", "free-insert", "free-insert-co-occur"]:
    
#     print(f"*************** {type} ***************")
#     system_prompt = prepare_system_prompt(type)
#     user_prompt = prepare_user_prompt(arg1, conn1, arg2, type)

#     # with open(f"samples/{type}.txt", "w") as f:
#     #     f.write(system_prompt + "\n\n" + user_prompt)
#     print(system_prompt + "\n\n" + user_prompt)
#     print("\n")

In [None]:
client = openai.OpenAI(
    api_key="",

)

def get_response (arg1, conn1, arg2, type):
    response = client.chat.completions.create(
        model="gpt-4.1-2025-04-14",  # "gpt-4o-2024-08-06"
        messages = [
            {
                "role": "developer",
                "content": prepare_system_prompt(type)
            },
            {
                "role": "user",
                "content": prepare_user_prompt(arg1, conn1, arg2, type)
            }
        ]
    )
    return response.choices[0].message.content

def get_dm_for_test_set (df, type):
    result = []
    for i, r in tqdm(df.iterrows(), total=df.shape[0]):
        sentence1, sentence2, dm1 = r["sentence1"], r["sentence2"], r["dm1"]
        response = get_response(sentence1, sentence2, dm1, type)
        response = response.split(", ")
        result.append(response)
    return result

def format_dm(xss):
    return [[x.lower().strip() for x in xs] for xs in xss]

# Load Test Set

In [174]:
folder = "../../dataset"
os.chdir(folder)

In [175]:
test_set = pd.read_csv("../dataset/explicit-explicit/discovery/dm1_pdtb/test.csv")
test_set

Unnamed: 0,id,sentence1,sentence2,dm1,dm2
0,1502,You don't know if it's true or not as a matter...,no one should judge and say anything and let t...,however,therefore
1,2992,Bloomberg's sensitivity to the climate issue w...,climate change had the last laugh at our polit...,"in the end,","however,"
2,1588,If you recall this infomercial from December 2...,it's been a tough year for tv's pitchmen.,"overall,","though,"
3,2363,"If having specific fonts and layouts matter, P...",it's what you say and not how you say it.,"ultimately,","though,"
4,6241,Onclick attribute of an anchor that goes to th...,"return false, to avoid any confusion.","anyway,",and
...,...,...,...,...,...
253,1798,A device to illustrate silver can be considere...,i wouldn't want to start such a conversation r...,"although,","clearly,"
254,6206,Then I give em 60 days during which I'm not go...,i've found you've got to play a little hardball.,but,"after that,"
255,1430,About the bus ride there.,"this is the worst for me, realizing that somet...","or,",and
256,455,It has been suggested Horncastle was a storage...,why the need for such strong walls?,"again,","though,"


In [182]:
folder = "../model/prompt"
os.chdir(folder)

# Cloze

In [184]:
mask_fill = get_dm_for_test_set(test_set, 
                                "mask-fill")

100%|██████████| 258/258 [03:03<00:00,  1.41it/s]


In [185]:
mask_fill = format_dm(mask_fill)
test_set["cloze"] = mask_fill

In [None]:
len(test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") in r["cloze"], axis=1)])/ len(test_set)

0.32558139534883723

# Cloze DM marked

In [187]:
cloze_dm_marked = get_dm_for_test_set(test_set, 
                                "mask-fill-with-marker-symbol")

100%|██████████| 258/258 [03:30<00:00,  1.22it/s]


In [None]:
cloze_dm_marked = format_dm(cloze_dm_marked)
test_set["cloze_dm_marked"] = cloze_dm_marked

In [None]:
len(test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") in r["cloze_dm_marked"], axis=1)])/len(test_set)

# Free Insert

In [None]:
free_insert = get_dm_for_test_set(test_set, 
                                "free-insert")

In [198]:
free_insert = format_dm(free_insert)
test_set["free_insert"] = free_insert

In [207]:
len(test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") in r["free_insert"], axis=1)])/len(test_set)

0.34108527131782945

# Load Test Set with Predictions

In [None]:
# Save as Pickle as it contains List DataType
model_name = ""
data_split = ""
test_set.to_pickle(f"{model_name}_{data_split}_result.pkl")

# Load Test Set with Predictions

In [None]:
df = pd.read_pickle("")
df

# View Test Set

In [None]:
HTML(test_set.to_html())