- Explicit-Explicit Discovery: predict the sentence-initial connective of Arg2
- Explicit-Explicit PDTB: predict subsequent connective
- Explicit-Implicit: predict implicit 

In [1]:
import openai
import pandas as pd
import os
from tqdm import tqdm
from IPython.display import display, HTML
import re
import json

In [None]:
# Scratch check of the sentence-final pattern: it matches when the text ends
# with ".", "!" or "?", optionally followed by whitespace, one closing quote
# character and more whitespace. The last line displays the boolean result.
x = '''
If you recall this infomercial from December 2008, you probably remember it for lame double entendres like "You're gonna love my nuts."
'''
should_capitalize = bool(re.search(r'[.!?]\s*["\']?\s*$', x))
should_capitalize

In [None]:
introduce_marker_symbol = '''\n  - Take note of the existing discourse marker enclosed by the character [ and ]'''

# Maps every supported prompt type to the file that holds its system prompt.
_SYSTEM_PROMPT_FILES = {
    "free-insert": "free-insert_system_prompt.txt",
    "free-insert-co-occur": "free-insert-coo_system_prompt.txt",
    "mask-fill": "mask-fill_system_prompt.txt",
    "mask-fill-with-marker-symbol": "mask-fill_system_prompt.txt",
}


def prepare_system_prompt(type):
    """Load and finalise the system prompt for the given prompt ``type``.

    The prompt text is read from a file in the current working directory.
    For types containing ``"with-marker-symbol"`` the
    ``<introduce_marker_symbol>`` placeholder is replaced by the marker
    instruction and the example phrase is rewritten to show a bracketed
    connective; for every other type the placeholder is simply removed.

    Args:
        type: One of ``"free-insert"``, ``"free-insert-co-occur"``,
            ``"mask-fill"`` or ``"mask-fill-with-marker-symbol"``.

    Returns:
        The system prompt as a string.

    Raises:
        ValueError: If ``type`` is not a supported prompt type.
        OSError: If the prompt file cannot be opened.
    """
    try:
        prompt_file = _SYSTEM_PROMPT_FILES[type]
    except (KeyError, TypeError):
        # Fail early with a clear message instead of trying to open "".
        raise ValueError(
            f"Unknown prompt type {type!r}; expected one of "
            f"{sorted(_SYSTEM_PROMPT_FILES)}"
        ) from None

    with open(prompt_file) as f:
        res = f.read()

    if "with-marker-symbol" in type:
        res = res.replace("<introduce_marker_symbol>", introduce_marker_symbol).replace("are curly and ___ are more", "are curly [and] ___ are more")
    else:
        res = res.replace("<introduce_marker_symbol>", "")
    return res

def make_blank_for_prompt(arg1, conn1, arg2, marker_symbol=True, free_insert=False, after=True):
    """Assemble the passage shown to the model.

    ``conn1`` is placed between ``arg1`` and ``arg2``, padded with spaces and
    wrapped in ``[`` ``]`` when ``marker_symbol`` is true. Unless
    ``free_insert`` is set, a ``___`` blank is added either right after the
    connective (``after=True``) or right before it (``after=False``).
    """
    if marker_symbol:
        connective = " [" + conn1 + "] "
    else:
        connective = " " + conn1 + " "

    # Free insertion: no blank at all, the model chooses the position.
    if free_insert:
        return arg1 + connective + arg2

    if after:
        return arg1 + connective + "___ " + arg2
    return arg1 + " ___" + connective + arg2


def prepare_user_prompt(arg1, conn1, arg2, type, after=True):
    """Fill the ``<passage>`` slot of ``user_prompt.txt`` for one example.

    ``conn1`` is capitalised when ``arg1`` ends like a complete sentence
    (the connective then opens ``arg2``). For the mask-fill types a blank is
    placed after the connective, or before it when ``after`` is false, in
    which case the connective is forced to lowercase. An unrecognised
    ``type`` returns the template unchanged.
    """
    with open("user_prompt.txt") as template_file:
        template = template_file.read()

    # arg1 ending in . ! or ? (optionally followed by a quote) is a full sentence.
    if re.search(r'[.!?]\s*["\']?\s*$', arg1) is not None:
        conn1 = conn1.capitalize()

    if type in ("free-insert", "free-insert-co-occur"):
        passage = make_blank_for_prompt(arg1, conn1, arg2, free_insert=True)
    elif type in ("mask-fill-with-marker-symbol", "mask-fill"):
        if not after:
            # The blank precedes conn1, so conn1 is no longer sentence-initial.
            conn1 = conn1.lower()
        with_brackets = type == "mask-fill-with-marker-symbol"
        passage = make_blank_for_prompt(
            arg1, conn1, arg2, marker_symbol=with_brackets, after=after
        )
    else:
        return template

    return template.replace("<passage>", passage)


In [None]:
## CREATE PROMPT SAMPLE
# Prints the system and user prompt for one hand-picked example under each
# prompt type, so the rendered prompts can be inspected by eye.

arg1 = '''Jacobs may have gotten a temporary reprieve in the eyes of many rank-and-file fans with the hiring of Coach Malzahn, who has made all the right moves so far.'''
conn1 = "ultimately,"
arg2 = '''jacobs is the problem - or, at least, the public face of the problem.'''

# The loop variable is not called `type`: at notebook top level that would
# rebind the builtin `type` for every cell executed afterwards.
for prompt_type in ["mask-fill", "mask-fill-with-marker-symbol", "free-insert"]:

    print(f"*************** {prompt_type} ***************")
    system_prompt = prepare_system_prompt(prompt_type)
    user_prompt = prepare_user_prompt(arg1, conn1, arg2, prompt_type)

    # Optional: also dump the sample to disk.
    # with open(f"samples/{prompt_type}.txt", "w") as f:
    #     f.write(system_prompt + "\n\n" + user_prompt)
    print(system_prompt + "\n\n" + user_prompt)
    print("\n")

In [None]:
# Take the API key from the OPENAI_API_KEY environment variable rather than
# hard-coding it (or an empty placeholder) in the notebook.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def get_response(arg1, conn1, arg2, type, model="gpt-4.1-2025-04-14"):
    """Ask the chat model for connective predictions for one example.

    Args:
        arg1: Text of the first argument.
        conn1: The existing connective between the two arguments.
        arg2: Text of the second argument.
        type: Prompt type, forwarded to ``prepare_system_prompt`` and
            ``prepare_user_prompt``.
        model: Model identifier passed to the chat completions call, e.g.
            ``"gpt-4o-2024-08-06"`` or ``"gpt-4.1-2025-04-14"`` (the default,
            matching the previously hard-coded value).

    Returns:
        The content of the first choice's message.
    """
    messages = [
        {"role": "developer", "content": prepare_system_prompt(type)},
        {"role": "user", "content": prepare_user_prompt(arg1, conn1, arg2, type)},
    ]
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

def get_dm_for_test_set(df, type):
    """Collect model predictions for every row of ``df``.

    Each row's ``sentence1``, ``dm1`` and ``sentence2`` values are sent to
    ``get_response`` with the given prompt ``type``; the reply is split on
    ``", "``. Returns one list of strings per row, in row order.
    """
    predictions = []
    progress = tqdm(df.iterrows(), total=df.shape[0])
    for _, row in progress:
        sentence1 = row["sentence1"]
        sentence2 = row["sentence2"]
        dm1 = row["dm1"]
        reply = get_response(sentence1, dm1, sentence2, type)
        predictions.append(reply.split(", "))
    return predictions

def format_dm(xss):
    """Lowercase and strip every string in a list of lists of strings."""
    normalized = []
    for group in xss:
        cleaned = [item.lower().strip() for item in group]
        normalized.append(cleaned)
    return normalized

# Load Test Set

In [None]:
!pwd

In [None]:
# Switch the working directory to the dataset folder. The path is relative to
# the current directory, so running this cell twice moves a second time.
folder = "../../dataset"
os.chdir(folder)

In [None]:
# Load the test split CSV (path relative to the current working directory)
# and display it. Later cells read its sentence1, sentence2, dm1 and dm2 columns.
test_set = pd.read_csv("../dataset/explicit-explicit/discovery/dm1_other/test.csv")
test_set

In [None]:
# Switch to the prompt folder. The prompt helpers open their template files by
# bare file name — presumably those files live here; verify before running.
folder = "../model/prompt"
os.chdir(folder)

# Cloze

In [None]:
# Query the model once per test row using the plain "mask-fill" prompt.
mask_fill = get_dm_for_test_set(test_set, 
                                "mask-fill")

In [None]:
# Lowercase/strip the predictions and store them as a list-valued column.
mask_fill = format_dm(mask_fill)
test_set["cloze"] = mask_fill

In [None]:
# Fraction of rows whose gold dm2 (lowercased, trailing commas/spaces removed)
# appears among the "cloze" predictions.
len(test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") in r["cloze"], axis=1)])/ len(test_set)

# Cloze DM marked

In [None]:
# Query the model once per test row using the "mask-fill-with-marker-symbol"
# prompt, in which the existing connective is wrapped in [ ].
cloze_dm_marked = get_dm_for_test_set(test_set, 
                                "mask-fill-with-marker-symbol")

In [None]:
# Lowercase/strip the predictions and store them as a list-valued column.
cloze_dm_marked = format_dm(cloze_dm_marked)
test_set["cloze_dm_marked"] = cloze_dm_marked

In [None]:
# Fraction of rows whose gold dm2 (lowercased, trailing commas/spaces removed)
# appears among the "cloze_dm_marked" predictions.
len(test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") in r["cloze_dm_marked"], axis=1)])/len(test_set)

# Free Insert

In [None]:
# Query the model once per test row using the "free-insert" prompt (no blank
# is placed in the passage).
free_insert = get_dm_for_test_set(test_set, 
                                "free-insert")

In [None]:
# Lowercase/strip the predictions and store them as a list-valued column.
free_insert = format_dm(free_insert)
test_set["free_insert"] = free_insert

In [None]:
# Fraction of rows whose gold dm2 (lowercased, trailing commas/spaces removed)
# appears among the "free_insert" predictions.
len(test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") in r["free_insert"], axis=1)])/len(test_set)

In [None]:
# Error inspection: keep the rows whose gold dm2 is NOT among the
# "free_insert" predictions and render them as a full HTML table.
t = test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") not in r["free_insert"], axis=1)]
HTML(t.to_html())

# Save Test Set with Predictions

In [None]:
# Save as Pickle as it contains List DataType
model_name = "gpt4.1"
data_split = "discovery_other_dm"
test_set.to_pickle(f"{model_name}_{data_split}_result.pkl")

# Load Test Set with Predictions

In [3]:
# Move two levels up from the current working directory; the paths used in
# the following cells ("model/...", "dataset/...") are relative to that folder.
folder = "../../"
os.chdir(folder)

In [4]:
!pwd

/Users/davidliu/co-occur_dm


In [5]:
# Reload a previously saved prediction table and display it.
# Unpickling can execute arbitrary code, so only load files you created.
# NOTE(review): this reads a "gpt4o" file under result/, whereas the save cell
# writes "gpt4.1_..." into the working directory — confirm the intended file.
df = pd.read_pickle("model/prompt/result/gpt4o_discovery_other_dm_result.pkl")
df

Unnamed: 0,id,sentence1,sentence2,dm1,dm2,cloze,cloze_dm_marked,free_insert
0,3282,", and proudly continues that tradition of care...",i have been trying to drag myself out of this ...,"lately,","however,","[however, indeed, admittedly, recently, conseq...","[however, eventually, recently, instead, now]","[recently, currently, additionally, moreover, ..."
1,250,A victory?,i'm interested to see what the next couple of ...,"maybe,",but,"[but, yet, however, still, though]","[perhaps, possibly, potentially, but, still]","[perhaps, possibly, potentially, conceivably, ..."
2,3571,There does appear to be some real intelligence...,you can sort of separate some of the controver...,"secondly,","though,","[then, furthermore, consequently, meanwhile, t...","[additionally, furthermore, moreover, in addit...","[furthermore, additionally, moreover, besides,..."
3,199,"Is it a call to action by Powell, wanting peop...","whatever the case, it's powerful stuff.","perhaps,",but,"[regardless, in any event, anyway, nonetheless...","[in any case, regardless, ultimately, neverthe...","[alternatively, maybe, possibly, conceivably, ..."
4,1935,She then accepted employment in one of the mor...,she did not feel that she deserved this state ...,"apparently,","however,","[however, nevertheless, yet, still, surprisingly]","[however, nevertheless, yet, still, somehow]","[however, furthermore, moreover, yet, surprisi..."
...,...,...,...,...,...,...,...,...
927,6326,At least last night they started the show at t...,when the game's over and you're risking cuttin...,"really,","though,","[especially, typically, usually, particularly,...","[i mean, basically, ultimately, essentially, i...","[honestly, actually, frankly, indeed, truthfully]"
928,5705,Note that the eBook versions are typically ava...,the price is the same for both formats (save e...,"absolutely,",and,"[however, nonetheless, importantly, significan...","[in fact, indeed, furthermore, additionally, m...","[indeed, furthermore, additionally, importantl..."
929,3100,"In the good old days, which weren't actually a...",the chef shortage has turned his recruiting pr...,"lately,","though,","[however, unfortunately, though, nevertheless,...","[however, unfortunately, recently, increasingl...","[however, recently, consequently, furthermore,..."
930,296,But will that be Djokovic?,a djokovic in what condition?,"probably,",but,"[possibly, conceivably, perhaps, arguably, maybe]","[then, of course, however, still, perhaps]","[perhaps, maybe, possibly, conceivably, arguably]"


# Sense Evaluation

In [6]:
# Load the connective lookup table used by get_senses_for_conns.
# Presumably maps each connective to a dict keyed by sense label — verify
# against the JSON file.
with open('dataset/explicit_connective_senses.json', 'r') as f:
    explicit_sense = json.load(f)
def get_senses_for_conns(conns):
    """Return the distinct sense keys recorded for any connective in ``conns``.

    Connectives missing from ``explicit_sense`` contribute nothing. The result
    is a list built from a set, so its order carries no meaning.
    """
    collected = {
        sense
        for connective in conns
        for sense in explicit_sense.get(connective, {}).keys()
    }
    return list(collected)

def sense_accuracy(df, B, A="dm2_sense"):
    """Return the share of rows where columns ``A`` and ``B`` share an element.

    Both columns are expected to hold iterables of hashable items; a row is a
    hit when the two collections have at least one item in common.
    """
    hits = 0
    for gold, predicted in zip(df[A], df[B]):
        if set(gold) & set(predicted):
            hits += 1
    return hits / len(df)

In [None]:
# Gold senses: look up the single gold connective after trimming trailing
# commas/spaces.
df["dm2_sense"] = df["dm2"].apply(lambda conn: get_senses_for_conns([conn.rstrip(", ")]))

# Predicted senses: union over each row's predicted connectives, stored in a
# "<column>_sense" column per prompt type.
for _pred_col in ("cloze", "cloze_dm_marked", "free_insert"):
    df[_pred_col + "_sense"] = df[_pred_col].apply(get_senses_for_conns)

In [None]:
# Sense-level accuracy against the gold dm2 senses, as a tuple in the order
# (cloze, cloze_dm_marked, free_insert).
sense_accuracy(df, "cloze_sense"), sense_accuracy(df, "cloze_dm_marked_sense"), sense_accuracy(df, "free_insert_sense")

# View Test Set

In [None]:
# Render the whole test_set frame as an HTML table.
# NOTE(review): test_set is created in the "Load Test Set" cell; the reloaded
# predictions live in `df` — confirm which frame is meant to be shown here.
HTML(test_set.to_html())