In [1]:
import pandas as pd
from IDRR_data import *

In [20]:
data_path = r'/data/whsun/idrr/data/raw/pdtb2.p1.csv'
df = IDRRDataFrames(
    data_name='pdtb2',
    data_level='second',
    data_relation='Implicit',
    data_path=data_path,
)

In [21]:
df.train_df.columns

Index(['arg1', 'arg2', 'conn1', 'conn2', 'conn1sense1', 'conn1sense2',
       'conn2sense1', 'conn2sense2', 'relation', 'split', 'Section',
       'FileNumber', 'label11', 'label11id', 'label12', 'label12id', 'label21',
       'label21id', 'label22', 'label22id', 'ans_word1', 'ans_word1id',
       'ans_word2', 'ans_word2id'],
      dtype='object')

In [22]:
from collections import Counter
Counter(df.train_df['label11'])

Counter({'Contingency.Cause': 3227,
         'Expansion.Conjunction': 2805,
         'Expansion.Restatement': 2376,
         'Comparison.Contrast': 1566,
         'Expansion.Instantiation': 1061,
         'Temporal.Asynchronous': 517,
         'Expansion.List': 330,
         'Comparison.Concession': 180,
         'Temporal.Synchrony': 147,
         'Expansion.Alternative': 146,
         'Contingency.Pragmatic cause': 51})

#### parquet格式化数据 (以qwen3为例)

In [5]:
def read_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    return content

prompt_tmeplate = read_txt("/data/whsun/idrr/prompts/rl_base.txt")
prompt_tmeplate

'### Task\nYou are an expert in the field of implicit discourse relations. Your task is to determine the semantic-logical relationship between two given text segments and select the most appropriate relation label. Output only one of A, B, C, or D, and enclose it in \\boxed{{}}.\n\n### Relations\n{relation_terms}\n\n### Segments\nText segment 1: {arg1}\nText segment 2: {arg2}\n\nYour answer:\n'

In [None]:
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/data/whsun/pretrained_models/Qwen/Qwen3-0.6B")

def get_rl_data(data_source: str, df, label_list):
    rl_data = []
    relation_terms = '\n'.join([f"{chr(65 + i)}. {label}" for i, label in enumerate(label_list)])
    label2alpha = {label: chr(65 + i) for i, label in enumerate(label_list)}
    for index, row in df.iterrows():
        prompt = prompt_tmeplate.format(
            relation_terms=relation_terms,
            arg1=row['arg1'],
            arg2=row['arg2'],
        )
        grounth_truth_alpha = label2alpha[row["label11"]]
        messages = [
            {"role": "user", "content": prompt}
        ]
        prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        rl_data.append(
            {
                "data_source": data_source,
                "prompt": [{"content": prompt_text, "role": "user"}],
                "reward_model": {"ground_truth": grounth_truth_alpha},
            }
        )
    return Dataset.from_list(rl_data, split="train")

train_rl_dataset = get_rl_data("pdtb", df.train_df, df.label_list)
train_rl_dataset[0]

{'data_source': 'pdtb',
 'prompt': [{'content': '<|im_start|>user\n### Task\nYou are an expert in the field of implicit discourse relations. Your task is to determine the semantic-logical relationship between two given text segments and select the most appropriate relation label. Output only one of A, B, C, or D, and enclose it in \\boxed{}.\n\n### Relations\nA. Comparison\nB. Contingency\nC. Expansion\nD. Temporal\n\n### Segments\nText segment 1: In an Oct. 19 review of "The Misanthrope" at Chicago\'s Goodman Theatre ("Revitalized Classics Take the Stage in Windy City," Leisure & Arts), the role of Celimene, played by Kim Cattrall, was mistakenly attributed to Christina Haag\nText segment 2: Ms. Haag plays Elianti\n\nYour answer:\n<|im_end|>\n<|im_start|>assistant\n',
   'role': 'user'}],
 'reward_model': {'ground_truth': 'A'}}

In [11]:
dev_rl_dataset = get_rl_data("pdtb", df.dev_df, df.label_list)
test_rl_dataset = get_rl_data("pdtb", df.test_df, df.label_list)
dev_rl_dataset, test_rl_dataset

(Dataset({
     features: ['data_source', 'prompt', 'reward_model'],
     num_rows: 1183
 }),
 Dataset({
     features: ['data_source', 'prompt', 'reward_model'],
     num_rows: 1046
 }))

In [12]:
train_rl_dataset.to_parquet("/data/whsun/idrr/data/rl/verl/pdtb2/top/qwen3_train.parquet")
dev_rl_dataset.to_parquet("/data/whsun/idrr/data/rl/verl/pdtb2/top/qwen3_dev.parquet")
test_rl_dataset.to_parquet("/data/whsun/idrr/data/rl/verl/pdtb2/top/qwen3_test.parquet")

Creating parquet from Arrow format: 100%|██████████| 13/13 [00:00<00:00, 80.08ba/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 496.96ba/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 543.34ba/s]


712503

#### 将label转化为关系定义进行sft
1. 将二级label转为关系定义
2. 模型1：论元对 -> 关系定义
3. 模型2：模型1给出文本 -> label

In [8]:
def read_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    return content
prompt_tmeplate = read_txt("/data/whsun/idrr/prompts/arg2def.txt")
prompt_tmeplate

'Briefly describe the relationship between two arguments.\nArg1: {arg1}\nArg2: {arg2}'

In [27]:
sense2df = {
    "Temporal": "The situations described in the arguments are related temporally.",
    "Temporal.Asynchronous": "One envent is described as preceding the other.",
    "Temporal.Synchrony": "There is some degree of temporal overlap between the events described by the arguments.",
    "Contingency": "One of the situations described in Arg1 and Arg2 causally influences the other.",
    "Contingency.Cause":"The situations described in the arguments are causally influenced and the two are not in a conditional relation.",
    "Contingency.Pragmatic cause":"Arg1 expresses a claim and Arg2 provides justification for this claim.",
    "Contingency.Condition": "The situation in Arg2 is taken to be the condition and the situation described in Arg1 is taken to be the consequence.",
    "Contingency.Pragmatic condition": "Used for instances of conditional constructions whose interpretation deviates from that of the semantics of “Condition”.",
    "Comparison": "A discourse relation is established between Arg1 and Arg2 in order to highlight prominent differences between the two situations.",
    "Comparison.Contrast": "Arg1 and Arg2 share a predicate or property and a difference is highlighted with respect to the values assigned to the shared property.",
    "Comparison.Pragmatic contrast": "A contrast between one of the arguments and an inference that can be drawn from the other, in many cases at the speech act level: The contrast is not between the situations described in Arg1 and Arg2.",
    "Comparison.Concession": "One argument denotes a fact that triggers a set of potential consequences, while the other denies one or more of them.",
    "Comparison.Pragmatic concession": "One argument denotes a fact that triggers a set of potential consequences, while the other denies one or more of them. The denial is not at the level of the situations described in Arg1 and Arg2, but rather at the level of inferences that can be drawn from them.",
    "Expansion": "Expanding the discourse and move its narrative or exposition forward.",
    "Expansion.Conjunction":"The situation described in Arg2 provides additional, discourse new, information that is related to the situation described in Arg1, but is not related to Arg1 in any of the ways described for other types of “EXPANSION”.",
    "Expansion.Instantiation":"Arg1 evokes a set and Arg2 describes it in further detail, It may be a set of events, a set of reasons, or a generic set of events, behaviors, attitudes, etc.",
    "Expansion.Restatement":"The semantics of Arg2 restates the semantics of Arg1. It is inferred that the situations described in Arg1 and Arg2 hold true at the same time.",
    "Expansion.Alternative":"Two arguments denote alternative situations.",
    "Expansion.Exception":"Arg2 specifies an exception to the generalization specified by Arg1. In other words, Arg1 is false because Arg2 is true, but if Arg2 were false, Arg1 would be true.",
    "Expansion.List":"Arguments are members of a list, defined in the prior discourse.“List”does not require the situations specified in Arg1 and Arg2 to be directly related."
}
def write_json(file, data):
    import json
    import os
    os.makedirs(os.path.dirname(file), exist_ok=True)
    with open(file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"Data saved to {file}")

def write_alpaca_format(df, file_path):
    alpaca_data = []
    for _, row in df.iterrows():
        prompt = prompt_tmeplate.format(
            arg1=row['arg1'],
            arg2=row['arg2'],
        )
        sense = row['conn1sense1']
        sense_lst = sense.split('.')
        if len(sense_lst) > 1:
            sense = '.'.join(sense_lst[:2])
        alpaca_data.append(
            {
                "instruction": prompt,
                "input": "",
                "output": sense2df[sense] + f' Relation: {sense}' if len(df) > 9999 else sense,
            }
        )
    write_json(file_path, alpaca_data)

write_alpaca_format(df.train_df, "/data/whsun/idrr/data/arg2def/pdtb2/aplaca/train.json")
write_alpaca_format(df.test_df, "/data/whsun/idrr/data/arg2def/pdtb2/aplaca/test.json")

Data saved to /data/whsun/idrr/data/arg2def/pdtb2/aplaca/train.json
Data saved to /data/whsun/idrr/data/arg2def/pdtb2/aplaca/test.json
