- Explicit-Explicit Discovery: predict the sentence-initial connective of Arg2
- Explicit-Explicit PDTB: predict the subsequent connective
- Explicit-Implicit: predict the implicit connective

In [262]:
import openai
import pandas as pd
import os
from tqdm import tqdm
from IPython.display import display, HTML
import re


In [266]:
# Scratch check of the sentence-final pattern that prepare_user_prompt applies to arg1:
# the text must end in . ! or ?, optionally followed by a closing quote and
# trailing whitespace (here: a closing double quote and a newline).
x = '''
If you recall this infomercial from December 2008, you probably remember it for lame double entendres like "You're gonna love my nuts."
'''
should_capitalize = bool(re.search(r'[.!?]\s*["\']?\s*$', x))
should_capitalize

True

In [264]:
# Extra instruction line spliced into the system prompt when the existing
# connective is wrapped in [ ] in the passage.
introduce_marker_symbol = '''\n  - Take note of the existing discourse marker enclosed by the character [ and ]'''

def prepare_system_prompt(type):
    """Load and finalise the system prompt for one prompt type.

    The template file is read from the current working directory. Its
    ``<introduce_marker_symbol>`` placeholder is replaced by the bracket
    instruction for types containing ``"with-marker-symbol"`` (and the worked
    example in the template gets its existing connective bracketed as
    ``[and]``); for every other type the placeholder is simply removed.

    Args:
        type: One of ``"free-insert"``, ``"free-insert-co-occur"``,
            ``"mask-fill"`` or ``"mask-fill-with-marker-symbol"``.

    Returns:
        The system prompt text.

    Raises:
        ValueError: If ``type`` is not a known prompt type. (Previously this
            surfaced as a ``FileNotFoundError`` for the empty path ``""``.)
        OSError: If the template file cannot be opened.
    """
    # Both mask-fill variants share one template; only the placeholder
    # substitution below differs between them.
    prompt_files = {
        "free-insert": "free-insert_system_prompt.txt",
        "free-insert-co-occur": "free-insert-coo_system_prompt.txt",
        "mask-fill": "mask-fill_system_prompt.txt",
        "mask-fill-with-marker-symbol": "mask-fill_system_prompt.txt",
    }
    if type not in prompt_files:
        raise ValueError(
            f"Unknown prompt type {type!r}; expected one of {sorted(prompt_files)}"
        )

    with open(prompt_files[type]) as f:
        res = f.read()

    if "with-marker-symbol" in type:
        res = res.replace("<introduce_marker_symbol>", introduce_marker_symbol).replace("are curly and ___ are more", "are curly [and] ___ are more")
    else:
        res = res.replace("<introduce_marker_symbol>", "")
    return res

def make_blank_for_prompt(arg1, conn1, arg2, marker_symbol=True, free_insert=False, after=True):
    """Assemble the passage shown to the model from its three parts.

    Args:
        arg1: Text preceding the existing connective.
        conn1: The existing connective.
        arg2: Text following the existing connective.
        marker_symbol: When true, wrap ``conn1`` in ``[`` and ``]``.
        free_insert: When true, emit no ``___`` blank at all.
        after: Position of the blank relative to ``conn1`` (ignored when
            ``free_insert`` is true): after it when true, before it otherwise.

    Returns:
        The assembled passage string.
    """
    # The connective is always padded with one space on each side.
    if marker_symbol:
        connective = "".join((" [", conn1, "] "))
    else:
        connective = "".join((" ", conn1, " "))

    if free_insert:
        return arg1 + connective + arg2
    if after:
        return arg1 + connective + "___ " + arg2
    return arg1 + " ___" + connective + arg2


def prepare_user_prompt(arg1, conn1, arg2, type, after=True):
    """Fill the ``<passage>`` slot of ``user_prompt.txt`` for one example.

    Args:
        arg1: Text preceding the existing connective.
        conn1: The existing connective.
        arg2: Text following the existing connective.
        type: Prompt type; selects how the passage is assembled.
        after: For the mask-fill types, whether the blank goes after
            ``conn1`` (true) or before it (false).

    Returns:
        The user prompt text. For a prompt type not handled below, the
        template is returned with ``<passage>`` left in place.
    """
    with open("user_prompt.txt") as f:
        template = f.read()

    # If arg1 is a complete sentence, conn1 is sentence-initial in arg2.
    arg1_is_complete = re.search(r'[.!?]\s*["\']?\s*$', arg1) is not None
    if arg1_is_complete:
        conn1 = conn1.capitalize()

    if type in ("free-insert", "free-insert-co-occur"):
        # NOTE(review): marker_symbol is left at its default (True), so the
        # free-insert passage still shows conn1 inside [ ] - confirm intended.
        passage = make_blank_for_prompt(arg1, conn1, arg2, free_insert = True)
    elif type in ("mask-fill-with-marker-symbol", "mask-fill"):
        if not after:
            # A blank placed before conn1 takes over the sentence-initial
            # slot, so conn1 is forced back to lowercase.
            conn1 = conn1.lower()
        if type == "mask-fill-with-marker-symbol":
            passage = make_blank_for_prompt(arg1, conn1, arg2, after=after)
        else:
            passage = make_blank_for_prompt(arg1, conn1, arg2, marker_symbol=False, after=after)
    else:
        return template

    return template.replace("<passage>", passage)


In [268]:
## CREATE PROMPT SAMPLE
# Print the complete prompt (system + user) for one hand-picked example under
# each prompt type, to eyeball what the model will actually receive.

arg1 = '''Jacobs may have gotten a temporary reprieve in the eyes of many rank-and-file fans with the hiring of Coach Malzahn, who has made all the right moves so far.'''
conn1 = "ultimately,"
arg2 = '''jacobs is the problem - or, at least, the public face of the problem.'''

# The loop variable is deliberately NOT called `type`: at cell level that name
# would rebind the builtin `type` for every cell executed afterwards.
for prompt_type in ["mask-fill", "mask-fill-with-marker-symbol", "free-insert"]:

    print(f"*************** {prompt_type} ***************")
    system_prompt = prepare_system_prompt(prompt_type)
    user_prompt = prepare_user_prompt(arg1, conn1, arg2, prompt_type)

    # with open(f"samples/{prompt_type}.txt", "w") as f:
    #     f.write(system_prompt + "\n\n" + user_prompt)
    print(system_prompt + "\n\n" + user_prompt)
    print("\n")

*************** mask-fill ***************
# Identity

You are a fluent and pragmatic English-speaker. In a given passage, there will be a blank denoted by ___.

# Instructions

  - Read the passage between the characters ```
  - Determine the 5 most likely English discourse markers in place of the blank ___ that is most appropriate for the context; Discourse markers you choose should be words or phrases that express the relation between adjacent parts of the passage. NO DUPLICATES IN THE LIST
  - Your response should only contain the 5 discourse markers separated by comma. It should not have additional formatting or commentary.

# Examples

```
More common chrysotile fibers are curly and ___ are more easily rejected by the body.
```

OUTPUT: 
therefore, subsequently, also, ultimately, often


Now you try:
```
Jacobs may have gotten a temporary reprieve in the eyes of many rank-and-file fans with the hiring of Coach Malzahn, who has made all the right moves so far. Ultimately, ___ jacob

In [None]:
# Take the API key from the OPENAI_API_KEY environment variable so that no
# secret is ever stored in (or has to be blanked out of) the notebook.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def get_response (arg1, conn1, arg2, type, model="gpt-4.1-2025-04-14"):
    """Query the chat model once and return its raw text reply.

    Args:
        arg1: Text preceding the existing connective.
        conn1: The existing connective.
        arg2: Text following the existing connective.
        type: Prompt type, forwarded to ``prepare_system_prompt`` and
            ``prepare_user_prompt``.
        model: Model identifier sent to the API, e.g. ``"gpt-4o-2024-08-06"``
            or ``"gpt-4.1-2025-04-14"`` (the default).

    Returns:
        The content of the first choice's message, or ``""`` when the API
        returned no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            # System-level instructions are sent under the "developer" role.
            {"role": "developer", "content": prepare_system_prompt(type)},
            {"role": "user", "content": prepare_user_prompt(arg1, conn1, arg2, type)},
        ],
    )
    # The message content may be None; normalise it to an empty string so
    # callers can always treat the result as text.
    return response.choices[0].message.content or ""

def get_dm_for_test_set (df, type):
    """Collect the model's predicted discourse markers for every row of ``df``.

    One ``get_response`` call is made per row, using the row's ``sentence1``,
    ``dm1`` and ``sentence2`` columns.

    Args:
        df: DataFrame with ``sentence1``, ``sentence2`` and ``dm1`` columns.
        type: Prompt type forwarded to ``get_response``.

    Returns:
        A list with one entry per row, each a list of the marker strings
        parsed from the model's reply (in reply order).
    """
    result = []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        response = get_response(row["sentence1"], row["dm1"], row["sentence2"], type)
        # Split on commas or line breaks with any surrounding whitespace, so
        # "a,b", "a, b" and one-marker-per-line replies all parse the same way;
        # a missing reply (None) or empty fragments yield no markers.
        markers = [m for m in re.split(r"\s*[,\n]\s*", (response or "").strip()) if m]
        result.append(markers)
    return result

def format_dm(xss):
    """Lower-case and whitespace-strip every string in a list of string lists.

    Args:
        xss: Iterable of iterables of strings (one inner iterable per example).

    Returns:
        A new list of lists with the same nesting and order.
    """
    cleaned = []
    for markers in xss:
        row = []
        for marker in markers:
            row.append(marker.lower().strip())
        cleaned.append(row)
    return cleaned

# Load Test Set

In [330]:
# Move the working directory to the dataset folder. The path is relative, so
# it is resolved against whatever the current directory is when this cell runs;
# running the cell a second time resolves it from the new location instead.
folder = "../../dataset"
os.chdir(folder)

In [331]:
# Load the test split. The relative path is resolved from the working
# directory set by the os.chdir cell that precedes this one.
test_set = pd.read_csv("../dataset/explicit-explicit/discovery/dm1_other/test.csv")
test_set

Unnamed: 0,id,sentence1,sentence2,dm1,dm2
0,3282,", and proudly continues that tradition of care...",i have been trying to drag myself out of this ...,"lately,","however,"
1,250,A victory?,i'm interested to see what the next couple of ...,"maybe,",but
2,3571,There does appear to be some real intelligence...,you can sort of separate some of the controver...,"secondly,","though,"
3,199,"Is it a call to action by Powell, wanting peop...","whatever the case, it's powerful stuff.","perhaps,",but
4,1935,She then accepted employment in one of the mor...,she did not feel that she deserved this state ...,"apparently,","however,"
...,...,...,...,...,...
927,6326,At least last night they started the show at t...,when the game's over and you're risking cuttin...,"really,","though,"
928,5705,Note that the eBook versions are typically ava...,the price is the same for both formats (save e...,"absolutely,",and
929,3100,"In the good old days, which weren't actually a...",the chef shortage has turned his recruiting pr...,"lately,","though,"
930,296,But will that be Djokovic?,a djokovic in what condition?,"probably,",but


In [332]:
# Move the working directory to the prompt folder; prepare_system_prompt and
# prepare_user_prompt open their template files by bare file name, i.e.
# relative to the current working directory.
folder = "../model/prompt"
os.chdir(folder)

# Cloze

In [338]:
# Query the model once per test row with the plain "mask-fill" (cloze) prompt.
mask_fill = get_dm_for_test_set(test_set, 
                                "mask-fill")

100%|██████████| 932/932 [09:20<00:00,  1.66it/s]


In [339]:
# Normalise the predictions (lower-case, stripped) and attach them to the
# test set as the "cloze" column; note this adds a column to test_set in place.
mask_fill = format_dm(mask_fill)
test_set["cloze"] = mask_fill

In [340]:
# Hit rate: fraction of rows whose gold dm2 (lower-cased, trailing commas and
# spaces removed) appears among that row's "cloze" predictions.
len(test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") in r["cloze"], axis=1)])/ len(test_set)

0.3723175965665236

# Cloze DM marked

In [341]:
# Query the model once per test row with the mask-fill prompt in which the
# existing connective is enclosed in [ ].
cloze_dm_marked = get_dm_for_test_set(test_set, 
                                "mask-fill-with-marker-symbol")

100%|██████████| 932/932 [09:46<00:00,  1.59it/s]


In [342]:
# Normalise the predictions (lower-case, stripped) and attach them to the
# test set as the "cloze_dm_marked" column (adds a column to test_set in place).
cloze_dm_marked = format_dm(cloze_dm_marked)
test_set["cloze_dm_marked"] = cloze_dm_marked

In [343]:
# Hit rate: fraction of rows whose gold dm2 (lower-cased, trailing commas and
# spaces removed) appears among that row's "cloze_dm_marked" predictions.
len(test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") in r["cloze_dm_marked"], axis=1)])/len(test_set)

0.2575107296137339

# Free Insert

In [344]:
# Query the model once per test row with the "free-insert" prompt, whose
# passage contains no ___ blank.
free_insert = get_dm_for_test_set(test_set, 
                                "free-insert")

100%|██████████| 932/932 [09:56<00:00,  1.56it/s]


In [345]:
# Normalise the predictions (lower-case, stripped) and attach them to the
# test set as the "free_insert" column (adds a column to test_set in place).
free_insert = format_dm(free_insert)
test_set["free_insert"] = free_insert

In [346]:
# Hit rate: fraction of rows whose gold dm2 (lower-cased, trailing commas and
# spaces removed) appears among that row's "free_insert" predictions.
len(test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") in r["free_insert"], axis=1)])/len(test_set)

0.17167381974248927

In [350]:
# Error analysis: keep only the rows where the gold dm2 is NOT among the
# free-insert predictions, and render that whole frame as an HTML table.
t = test_set[test_set.apply(lambda r: r["dm2"].lower().rstrip(", ") not in r["free_insert"], axis=1)]
HTML(t.to_html())

Unnamed: 0,id,sentence1,sentence2,dm1,dm2,cloze,cloze_dm_marked,free_insert
1,250,A victory?,i'm interested to see what the next couple of mondays reveal before making that determination.,"maybe,",but,"[however, still, though, nevertheless, yet]","[however, still, nevertheless, though, yet]","[however, still, in fact, nevertheless, thus]"
2,3571,There does appear to be some real intelligence forcing these closures.,you can sort of separate some of the controversies of the nsa.,"secondly,","though,","[thus, moreover, in addition, furthermore, alternatively]","[in addition, furthermore, moreover, likewise, besides]","[furthermore, moreover, in addition, also, additionally]"
3,199,"Is it a call to action by Powell, wanting people to educate themselves on the subject of mental illness and ensure that this won't happen to people in real life?","whatever the case, it's powerful stuff.","perhaps,",but,"[nevertheless, in any case, regardless, still, anyway]","[nevertheless, still, regardless, anyway, in any case]","[however, in any case, nevertheless, moreover, consequently]"
5,686,"That debate is constructive, as is Ben-Tor's contention that ideology is an agent of obscurity and special interests.",anti-ideology becomes an ideology itself.,"inevitably,","though,","[however, paradoxically, in fact, thus, ultimately]","[consequently, thus, in turn, as a result, similarly]","[consequently, furthermore, thus, in fact, moreover]"
6,4822,Our parsha deals with all three roles.,"the most attention-catching is the section on kings, for many reasons.","undoubtedly,","though,","[however, perhaps, arguably, indeed, unsurprisingly]","[however, indeed, perhaps, arguably, certainly]","[in fact, however, moreover, additionally, indeed]"
7,4013,Locational endurance entails multilocation: it has it that some material objects are exactly located at many different regions.,"mereological endurance, which merely rejects temporal parts, does not entail multilocation.","importantly,","however,","[in contrast, however, unlike, by comparison, whereas]","[by contrast, however, in comparison, whereas, conversely]","[furthermore, consequently, similarly, notably, in contrast]"
8,1086,Are there kids getting arrested for being disruptive who have behavioral health needs that must be addressed?,that costs something.,"absolutely,",and,"[however, of course, naturally, obviously, still]","[however, of course, naturally, inevitably, nonetheless]","[indeed, furthermore, in fact, certainly, moreover]"
9,5973,would we have been better with bennett and clayborn with bowers as depth?,"how long can we keep guys as ""depth"" or backups before we lose them.","probably,",but,"[however, besides, still, importantly, after all]","[however, on the other hand, but, still, yet]","[however, moreover, furthermore, in fact, alternatively]"
10,4281,Who is to define their souls??,"is just a maybe, granted that this came true if ever: to plan the path that they will folow according to preestablished criteria is futile.","maybe,",and,"[perhaps, of course, then, after all, indeed]","[however, after all, still, of course, ultimately]","[however, nevertheless, actually, perhaps, instead]"
11,3941,"With Founders Camp, he will be looking for business models that can be taken global.","he has become more focused on family, god, and politics.","increasingly,","however,","[however, meanwhile, additionally, in contrast, instead]","[furthermore, moreover, additionally, in fact, simultaneously]","[meanwhile, moreover, additionally, furthermore, recently]"


# Save Test Set with Predictions

In [347]:
# Save as pickle because the prediction columns hold Python lists.
# The file is written to the current working directory under the name
# "<model_name>_<data_split>_result.pkl".
model_name = "gpt4.1"
data_split = "discovery_other_dm"
test_set.to_pickle(f"{model_name}_{data_split}_result.pkl")

# Load Test Set with Predictions

In [307]:
# Load a previously saved result frame from the current working directory.
# NOTE(review): this file name (gpt4o / discovery_pdtb_dm) is not the one the
# save cell in this notebook writes (gpt4.1 / discovery_other_dm) - it
# presumably comes from an earlier run; confirm it is the intended file.
# Unpickling can execute arbitrary code, so only load files you produced.
df = pd.read_pickle("gpt4o_discovery_pdtb_dm_result.pkl")
df

Unnamed: 0,id,sentence1,sentence2,dm1,dm2,cloze,cloze_dm_marked,free_insert
0,1502,You don't know if it's true or not as a matter...,no one should judge and say anything and let t...,however,therefore,"[ultimately, consequently, accordingly, theref...","[ultimately, therefore, thus, consequently, ac...","[moreover, consequently, therefore, similarly,..."
1,2992,Bloomberg's sensitivity to the climate issue w...,climate change had the last laugh at our polit...,"in the end,","however,","[ironically, indeed, ultimately, unfortunately...","[ironically, ultimately, consequently, eventua...","[ultimately, consequently, eventually, finally..."
2,1588,If you recall this infomercial from December 2...,it's been a tough year for tv's pitchmen.,"overall,","though,","[however, unfortunately, ultimately, clearly, ...","[in any case, undeniably, ultimately, generall...","[in general, consequently, additionally, natur..."
3,2363,"If having specific fonts and layouts matter, P...",it's what you say and not how you say it.,"ultimately,","though,","[however, nevertheless, in the end, ultimately...","[however, nevertheless, still, yet, of course]","[however, indeed, moreover, additionally, none..."
4,6241,Onclick attribute of an anchor that goes to th...,"return false, to avoid any confusion.","anyway,",and,"[just, simply, often, instead, usually]","[simply, instead, alternatively, thus, in fact]","[therefore, thus, alternatively, additionally,..."
...,...,...,...,...,...,...,...,...
253,1798,A device to illustrate silver can be considere...,i wouldn't want to start such a conversation r...,"although,","clearly,","[personally, admittedly, frankly, honestly, ob...","[ultimately, admittedly, however, frankly, nat...","[however, incidentally, on the other hand, non..."
254,6206,Then I give em 60 days during which I'm not go...,i've found you've got to play a little hardball.,but,"after that,","[eventually, ultimately, however, sometimes, n...","[eventually, however, ultimately, nonetheless,...","[however, nonetheless, conversely, alternative..."
255,1430,About the bus ride there.,"this is the worst for me, realizing that somet...","or,",and,"[perhaps, maybe, sometimes, often, usually]","[alternatively, perhaps, maybe, possibly, conv...","[alternatively, however, instead, meanwhile, l..."
256,455,It has been suggested Horncastle was a storage...,why the need for such strong walls?,"again,","though,","[if so, then, thus, hence, but]","[however, nevertheless, nonetheless, still, then]","[furthermore, additionally, moreover, likewise..."


# View Test Set

In [None]:
# Render the entire test set (every row and column) as one HTML table.
HTML(test_set.to_html())