In [1]:
import os

import pandas as pd
from IDRR_data import *

In [2]:
# Absolute, machine-specific path to the raw CSV handed to IDRRDataFrames.
data_path = r'/data/whsun/idrr/data/raw/pdtb2.p1.csv'
# NOTE: despite its name, `df` is an IDRRDataFrames object, not a pandas DataFrame;
# later cells read its train_df / dev_df / test_df frames and its label_list.
df = IDRRDataFrames(
    data_name='pdtb2',
    data_level='top',
    data_relation='Implicit',
    data_path=data_path,
)

In [3]:
# List the columns available in the training split.
df.train_df.columns

Index(['arg1', 'arg2', 'conn1', 'conn2', 'conn1sense1', 'conn1sense2',
       'conn2sense1', 'conn2sense2', 'relation', 'split', 'Section',
       'FileNumber', 'label11', 'label11id', 'label12', 'label12id', 'label21',
       'label21id', 'label22', 'label22id', 'ans_word1', 'ans_word1id',
       'ans_word2', 'ans_word2id'],
      dtype='object')

In [4]:
# Inspect the first training example to see what each column holds.
df.train_df.iloc[0]

arg1           In an Oct. 19 review of "The Misanthrope" at C...
arg2                                      Ms. Haag plays Elianti
conn1                                                    however
conn2                                                        NaN
conn1sense1                    Comparison.Contrast.Juxtaposition
conn1sense2                                                  NaN
conn2sense1                                                  NaN
conn2sense2                                                  NaN
relation                                                Implicit
split                                                      train
Section                                                        2
FileNumber                                                     0
label11                                               Comparison
label11id                                                      0
label12                                                     <NA>
label12id                

#### parquet格式化数据 (以qwen3为例)

In [5]:
def read_txt(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

# Load the RL prompt template; get_rl_data fills its {relation_terms}, {arg1} and {arg2}
# placeholders via str.format.
# NOTE: the name is misspelled ("tmeplate"), but get_rl_data looks it up under this exact
# name, so it must not be renamed here alone.
prompt_tmeplate = read_txt("/data/whsun/idrr/prompts/rl_base.txt")
prompt_tmeplate

'### Task\nYou are an expert in the field of implicit discourse relations. Your task is to determine the semantic-logical relationship between two given text segments and select the most appropriate relation label. Output only one of A, B, C, or D, and enclose it in \\boxed{{}}.\n\n### Relations\n{relation_terms}\n\n### Segments\nText segment 1: {arg1}\nText segment 2: {arg2}\n\nYour answer:\n'

In [None]:
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer

# Tokenizer loaded from a local Qwen3-0.6B checkpoint directory; get_rl_data reads this
# module-level name to render the chat template (no tokenization is performed there).
tokenizer = AutoTokenizer.from_pretrained("/data/whsun/pretrained_models/Qwen/Qwen3-0.6B")

def get_rl_data(data_source: str, df, label_list, apply_chat_template: bool = True):
    """Turn one data split into a prompt/ground-truth dataset for RL training.

    Every row of ``df`` becomes one record with three fields: ``data_source``,
    ``prompt`` (a one-element list holding a ``role: "user"`` message) and
    ``reward_model`` (``{"ground_truth": <option letter>}``).

    The function reads two notebook-level globals: ``prompt_tmeplate`` (the
    ``str.format`` template) and, when ``apply_chat_template`` is true,
    ``tokenizer``.

    Args:
        data_source: Tag copied verbatim into each record's ``data_source``.
        df: DataFrame providing the ``arg1``, ``arg2`` and ``label11`` columns.
        label_list: Ordered relation labels; the i-th label is presented as
            option ``chr(65 + i)`` (A, B, C, ...).
        apply_chat_template: If True (default, the original behaviour) the
            filled-in prompt is rendered with ``tokenizer.apply_chat_template``
            (``tokenize=False, add_generation_prompt=True``) before being
            stored. If False the filled-in prompt text is stored unchanged.

    Returns:
        A ``datasets.Dataset`` created with ``Dataset.from_list(..., split="train")``.

    Raises:
        KeyError: If a row's ``label11`` value is not present in ``label_list``.
    """
    # Option letters A, B, C, ... in the same order as label_list.
    option_letters = [chr(65 + i) for i in range(len(label_list))]
    relation_terms = '\n'.join(
        f"{letter}. {label}" for letter, label in zip(option_letters, label_list)
    )
    label2alpha = dict(zip(label_list, option_letters))

    rl_data = []
    for _, row in df.iterrows():
        prompt = prompt_tmeplate.format(
            relation_terms=relation_terms,
            arg1=row['arg1'],
            arg2=row['arg2'],
        )
        ground_truth_alpha = label2alpha[row["label11"]]

        if apply_chat_template:
            # NOTE(review): the rendered string already contains the chat markup and the
            # generation prompt, yet it is stored below as a plain "user" message. A
            # consumer that applies the chat template to `prompt` again would wrap it
            # twice -- confirm against the training pipeline and pass
            # apply_chat_template=False if that is the case.
            messages = [{"role": "user", "content": prompt}]
            content = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        else:
            content = prompt

        rl_data.append(
            {
                "data_source": data_source,
                "prompt": [{"content": content, "role": "user"}],
                "reward_model": {"ground_truth": ground_truth_alpha},
            }
        )
    return Dataset.from_list(rl_data, split="train")

# Build the training-split RL dataset and show its first record as a sanity check.
train_rl_dataset = get_rl_data("pdtb", df.train_df, df.label_list)
train_rl_dataset[0]

{'data_source': 'pdtb',
 'prompt': [{'content': '<|im_start|>user\n### Task\nYou are an expert in the field of implicit discourse relations. Your task is to determine the semantic-logical relationship between two given text segments and select the most appropriate relation label. Output only one of A, B, C, or D, and enclose it in \\boxed{}.\n\n### Relations\nA. Comparison\nB. Contingency\nC. Expansion\nD. Temporal\n\n### Segments\nText segment 1: In an Oct. 19 review of "The Misanthrope" at Chicago\'s Goodman Theatre ("Revitalized Classics Take the Stage in Windy City," Leisure & Arts), the role of Celimene, played by Kim Cattrall, was mistakenly attributed to Christina Haag\nText segment 2: Ms. Haag plays Elianti\n\nYour answer:\n<|im_end|>\n<|im_start|>assistant\n',
   'role': 'user'}],
 'reward_model': {'ground_truth': 'A'}}

In [11]:
# Build the dev and test splits with the same data-source tag and label order used for
# the training split, then display both dataset summaries.
dev_rl_dataset = get_rl_data("pdtb", df.dev_df, df.label_list)
test_rl_dataset = get_rl_data("pdtb", df.test_df, df.label_list)
dev_rl_dataset, test_rl_dataset

(Dataset({
     features: ['data_source', 'prompt', 'reward_model'],
     num_rows: 1183
 }),
 Dataset({
     features: ['data_source', 'prompt', 'reward_model'],
     num_rows: 1046
 }))

In [12]:
# Write each split to its own parquet file under a single output directory.
rl_output_dir = "/data/whsun/idrr/data/rl/verl/pdtb2/top"
# Make sure the target directory exists so a fresh run does not fail on a missing folder.
os.makedirs(rl_output_dir, exist_ok=True)
train_rl_dataset.to_parquet(f"{rl_output_dir}/qwen3_train.parquet")
dev_rl_dataset.to_parquet(f"{rl_output_dir}/qwen3_dev.parquet")
# Kept as the last expression so the cell still displays this call's return value.
test_rl_dataset.to_parquet(f"{rl_output_dir}/qwen3_test.parquet")

Creating parquet from Arrow format: 100%|██████████| 13/13 [00:00<00:00, 80.08ba/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 496.96ba/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 543.34ba/s]


712503