# DATASET FOR CURRICULUM LEARNING.

In [2]:
from datasets import load_dataset

import spacy
import benepar
from tqdm import tqdm

# Load the multitask sentence-tagging dataset from the Hugging Face Hub.
data=load_dataset("haeunkim/sentence_tag_multitask_v2")


# Fetch the benepar_en3 parser model (the recorded output shows "already up-to-date" when it is installed).
benepar.download('benepar_en3')

# spaCy pipeline with the benepar component appended; get_parse() below reads this global `nlp`.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})

# 3) parse tree 
def get_parse(sentence):
    """Return the bracketed parse string of the first sentence in ``sentence``.

    Args:
        sentence: Raw text to run through the global ``nlp`` pipeline.
            ``None`` is treated as empty text.

    Returns:
        The ``_.parse_string`` of the first sentence span in the parsed doc,
        or ``""`` when the text is empty/whitespace-only or the doc yields no
        sentence spans. The original indexed ``list(doc.sents)[0]``
        unconditionally, which has no answer for such input.
    """
    # Nothing to parse: skip the (expensive) pipeline call entirely.
    if sentence is None or not str(sentence).strip():
        return ""

    doc = nlp(sentence)

    # Only the first sentence span is used, so take it lazily instead of
    # materialising every span into a list.
    first_sentence = next(iter(doc.sents), None)
    if first_sentence is None:
        return ""
    return first_sentence._.parse_string

# 4) depth
def get_tree_depth(parse_str: str) -> int:
    """Return the deepest parenthesis nesting level reached in ``parse_str``.

    Every "(" opens one level and every ")" closes one; all other characters
    are ignored. The result is the highest level seen while scanning left to
    right (0 when no "(" raises the level above zero).
    """
    step = {"(": 1, ")": -1}
    level = 0
    deepest = 0
    for symbol in parse_str:
        level += step.get(symbol, 0)
        if level > deepest:
            deepest = level
    return deepest

# 5) hypothesis parse & depth 
# 5) + 6) For each split: parse every input sentence, measure the depth of
# its parse string, then attach both lists as the "parse" and "hypo_depth"
# columns of that split.
# After the loop, `parses` and `depths` hold the values of the LAST split
# processed ("test"), exactly as in the unrolled version.
for split_name in ("train", "test"):
    parses = [get_parse(sent) for sent in tqdm(data[split_name]["input"])]
    depths = [get_tree_depth(parse_str) for parse_str in parses]

    data[split_name] = data[split_name].add_column("parse", parses)
    data[split_name] = data[split_name].add_column("hypo_depth", depths)

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     C:\Users\haeun\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


KeyboardInterrupt: 

In [None]:
# Print the name of every split in `old_data`.
# NOTE(review): `old_data` is not defined in any visible cell — confirm where it is loaded.
for split in old_data.keys():
    print(split)

In [None]:
# Correct the wrong instruction text
from datasets import DatasetDict
DC_TEXT = "Extract all DC clauses(dependent clauses) in the sentence. Return a list of clause spans. If no such clause exists, return []."


def rewrite_instr(ds):
    """Return ``ds`` mapped so that selected rows get the ``DC_TEXT`` instruction.

    Rows whose index satisfies ``index % 5 == 1`` have their "instruction"
    field replaced by ``DC_TEXT``; every other row keeps its own instruction.
    The map is run with ``load_from_cache_file=False``.
    """

    def _patch(example, position):
        if position % 5 == 1:
            return {"instruction": DC_TEXT}
        return {"instruction": example["instruction"]}

    return ds.map(_patch, with_indices=True, load_from_cache_file=False)

# Apply the instruction rewrite to every split of `old_data`, then show the result.
updated = DatasetDict()
for split, ds in old_data.items():
    updated[split] = rewrite_instr(ds)
print(updated)

In [None]:
import matplotlib.pyplot as plt

# Plot the TRAIN split's depth distribution.
# Read the depths from the dataset column, not from the `depths` list: the
# first cell rebuilds that list for the test split after the train columns
# are added, so by now it holds the test depths and the "train" histogram
# was showing the wrong split.
train_depths = data["train"]["hypo_depth"]
plt.hist(train_depths, bins=30, color="skyblue", edgecolor="black")
plt.xlabel("Hypothesis Parse Tree Depth")
plt.ylabel("Count")
plt.title("qnli train - Hypothesis Depth Distribution")
plt.show()

# Bucket the train split by parse depth: <= 9, 10-14, >= 15.
low = data["train"].filter(lambda x: x["hypo_depth"] <= 9)
medium = data["train"].filter(lambda x: 10 <= x["hypo_depth"] <= 14)
high = data["train"].filter(lambda x: x["hypo_depth"] >= 15)


In [None]:
import pandas as pd
from datasets import Dataset,DatasetDict
# Load the test split (with its precomputed hypo_depth column) from a local CSV
# and wrap it as a datasets.Dataset.
test=pd.read_csv("test_with_parse_depth.csv",encoding='utf-8-sig')
test = Dataset.from_pandas(test)
# Depth buckets for the test split, using the same thresholds as the train buckets (<= 9, 10-14, >= 15).
# NOTE(review): test_low / test_medium / test_high are not used in any visible cell.
test_low = test.filter(lambda x: x["hypo_depth"] <= 9)
test_medium =test.filter(lambda x: 10 <= x["hypo_depth"] <= 14)
test_high = test.filter(lambda x: x["hypo_depth"] >= 15)
# Histogram of the test depths. `plt` is not imported in this cell; it comes from an earlier cell.
test_dist = test['hypo_depth']
plt.hist(test_dist , bins=30, color="skyblue", edgecolor="black")
plt.xlabel("Hypothesis Parse Tree Depth")
plt.ylabel("Count")
plt.title("qnli test- Hypothesis Depth Distribution")
plt.show()
# Bundle the three train buckets (`low`, `medium`, `high` from an earlier cell) with the
# full test split and upload them to the Hub.
dataset_dict = DatasetDict({
    "train_low": low,
    "train_medium": medium,
    "train_high": high,
    "test": test
})
dataset_dict.push_to_hub("haeunkim/curriculum_learning")

In [10]:
# Load the train split of GLUE QNLI; the next cell looks up a sentence in it.
d=load_dataset("glue","qnli",split='train')

In [16]:
# Membership check: is this exact sentence one of the values in QNLI's `sentence` column?
string = (
    'When talking about the German language, the term German dialects '
    'is only used for the traditional regional varieties.'
)
string in d['sentence']

True

In [None]:
from datasets import concatenate_datasets
# Pull the three train buckets out of `updated` and merge them back into one train set.
low, medium, high = (
    updated[bucket_name]
    for bucket_name in ('train_low', 'train_medium', 'train_high')
)
train_all = concatenate_datasets([low, medium, high])
print(len(train_all))  # check the total number of examples
test = updated['test']

In [None]:
# Sizes of the three train buckets, in (low, medium, high) order.
tuple(len(bucket) for bucket in (low, medium, high))

In [None]:
# Re-bucket the merged train set by parse depth with new thresholds: <= 8, 9-11, >= 12.
low = train_all.filter(lambda x: x['hypo_depth'] <= 8)
medium = train_all.filter(lambda x: 9 <= x['hypo_depth'] <= 11)
high = train_all.filter(lambda x: x['hypo_depth'] >= 12)
# Display (total rows across the three buckets, rows in test).
len(low)+len(medium)+len(high),len(test)

In [None]:
# Bundle the three train buckets and the test split, then upload them to the Hub.
dataset_dict = DatasetDict(
    dict(
        zip(
            ("train_low", "train_medium", "train_high", "test"),
            (low, medium, high, test),
        )
    )
)
dataset_dict.push_to_hub("haeunkim/curriculum_learning")

In [None]:
import ast
import random

def _fix_orphan_quotes(s: str) -> str:
    s = "" if s is None else str(s).strip()
    if not s:
        return s
    
    for q in ['"', "'"]:
        starts = s.startswith(q)
        ends   = s.endswith(q)
        if starts and not ends:
            return s[1:].lstrip()
        if ends and not starts:
            return s[:-1].rstrip()
    return s

def parse_chosen_rejected(raw) -> str:
    """Normalise a raw output value into a cleaned string.

    * A list is cleaned element-wise with ``_fix_orphan_quotes`` and returned
      as ``str(list)``.
    * A string that starts with "[" and ends with "]" is parsed with
      ``ast.literal_eval``; if that yields a list it is handled like a list.
      Any parsing failure is ignored and the value falls through.
    * Everything else (including ``None``, treated as "") is stripped and
      passed through ``_fix_orphan_quotes``.
    """

    def _clean_all(items):
        return str([_fix_orphan_quotes(item) for item in items])

    if isinstance(raw, list):
        return _clean_all(raw)

    text = str(raw).strip() if raw is not None else ""
    looks_like_list = text.startswith("[") and text.endswith("]")
    if looks_like_list:
        try:
            parsed = ast.literal_eval(text)
            if isinstance(parsed, list):
                return _clean_all(parsed)
        except Exception:
            # Best effort: an unparsable literal is treated as plain text below.
            pass

    return _fix_orphan_quotes(text)

def parse_output(raw: str) -> str:
    """Normalise a raw output value into a single stripped string.

    Args:
        raw: The raw output. Usually a string; a list is also accepted.

    Returns:
        For a list: its elements converted with ``str``, stripped, and joined
        with ",". For anything else: the value cleaned by
        ``_fix_orphan_quotes`` and stripped.
    """
    # The list case must be checked on `raw` itself. The original tested the
    # result of _fix_orphan_quotes, which is always a str, so the join branch
    # could never run and a list came back as its repr instead.
    if isinstance(raw, list):
        return ",".join(str(x).strip() for x in raw)

    return _fix_orphan_quotes(raw).strip()

    
#df = pd.read_csv("train_no_dup.csv", encoding='utf-8')
def create_curriculum_dpo(df, seed=None):
    """Build DPO preference rows from a frame of instruction/input/output examples.

    Rows are grouped by their ``input`` value. For every row in a group:

    * ``prompt`` is ``"INSTRUCTION: <instruction>\\nINPUT: <input>"``.
    * ``chosen`` is the row's own output, normalised by ``parse_chosen_rejected``.
    * ``rejected`` is the normalised output of a randomly picked row from the
      same group that has a *different* instruction. Rows whose normalised
      output differs from ``chosen`` are preferred; if there are none, any
      row with a different instruction is used. If the group has no row with
      a different instruction, the row is skipped.

    Groups whose input is 3 characters or shorter (after stripping) are skipped.

    Args:
        df: DataFrame with "input", "instruction" and "output" columns.
        seed: Optional seed for picking the rejected output. With the default
            ``None`` the module-level ``random`` generator is used (the
            original behaviour); passing an int makes the result reproducible
            without touching global random state.

    Returns:
        A list of dicts with keys "prompt", "instruction", "input", "chosen"
        and "rejected".
    """
    rng = random.Random(seed) if seed is not None else random

    dpo_data = []
    for input_text, group in df.groupby("input"):
        # The length check depends only on the group key, so decide once per
        # group instead of once per row.
        if len(input_text.strip()) <= 3:
            continue

        # Every row in the group shares this input, so normalise it once.
        sentence = parse_chosen_rejected(input_text)

        for _, row in group.iterrows():
            instruction = row["instruction"]
            prompt = f"INSTRUCTION: {instruction.strip()}\nINPUT: {sentence.strip()}"

            chosen = parse_chosen_rejected(row["output"])

            # Negatives come from the same input but a different task.
            candidates = group[group["instruction"] != row["instruction"]]
            negatives = candidates[
                candidates["output"].apply(lambda x: parse_chosen_rejected(x) != chosen)
            ]

            # Fall back to any other-task output when none differs from `chosen`.
            if negatives.empty:
                negatives = candidates

            if negatives.empty:
                continue

            rejected = parse_chosen_rejected(rng.choice(negatives["output"].tolist()))

            dpo_data.append({
                "prompt": prompt,
                "instruction": instruction,
                "input": sentence,
                "chosen": chosen,
                "rejected": rejected
            })
    return dpo_data

In [247]:
from datasets import load_dataset,DatasetDict,Dataset, Features, Value
import pandas as pd
# Load the published curriculum dataset from the Hub and display its splits.
dataset=load_dataset("haeunkim/curriculum_learning")
dataset

DatasetDict({
    train_low: Dataset({
        features: ['instruction', 'input', 'output', 'parse', 'hypo_depth'],
        num_rows: 99450
    })
    train_medium: Dataset({
        features: ['instruction', 'input', 'output', 'parse', 'hypo_depth'],
        num_rows: 99195
    })
    train_high: Dataset({
        features: ['instruction', 'input', 'output', 'parse', 'hypo_depth'],
        num_rows: 96110
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'parse', 'hypo_depth'],
        num_rows: 30670
    })
})

In [267]:
def replace_special_tokens(text):
    """Replace bracket placeholder tokens in ``text`` with literal brackets.

    ``-LRB-``/``-RRB-`` become "(" / ")", ``-LSB-``/``-RSB-`` become "[" / "]"
    and ``-LCB-``/``-RCB-`` become "{" / "}". Non-string values are returned
    unchanged.
    """
    if not isinstance(text, str):
        return text

    substitutions = (
        ("-LRB-", "("),
        ("-RRB-", ")"),
        ("-LSB-", "["),
        ("-RSB-", "]"),
        ("-LCB-", "{"),
        ("-RCB-", "}"),
    )
    result = text
    for token, bracket in substitutions:
        result = result.replace(token, bracket)
    return result

In [415]:
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split

import random

df_low = dataset['train_low'].to_pandas()
df_me = dataset['train_medium'].to_pandas()
df_high = dataset['train_high'].to_pandas()

df_test = dataset['test'].to_pandas()

# Turn the -LRB-/-RRB- style placeholder tokens back into literal brackets
# in the text columns of every frame.
for df_ in (df_low, df_me, df_high, df_test):
    for col in ['input', 'output']:
        if col in df_:
            df_[col] = df_[col].apply(replace_special_tokens)

# Order-preserving de-duplication of the test inputs.
inputs = df_test['input'].tolist()
unique_inputs = list(dict.fromkeys(inputs))

# Split the unique inputs into eval/test. test_size=0.7 sends 70% of them to
# test and 30% to eval (the earlier comment claimed 50% / 50%). Splitting on
# unique inputs keeps all rows of one input in the same split.
eval_inputs, test_inputs = train_test_split(unique_inputs, test_size=0.7, random_state=42)
eval_df = df_test[df_test['input'].isin(eval_inputs)].copy()
test_df = df_test[df_test['input'].isin(test_inputs)].copy()

# create_curriculum_dpo picks rejected outputs with the `random` module.
# Seed it so the generated pairs are reproducible, matching the seeded split above.
random.seed(42)

dataset_low = create_curriculum_dpo(df_low)
dataset_me = create_curriculum_dpo(df_me)
dataset_high = create_curriculum_dpo(df_high)
dataset_test = create_curriculum_dpo(test_df)
dataset_eval = create_curriculum_dpo(eval_df)



In [416]:
# Wrap each list of DPO rows as a datasets.Dataset.
# Despite the `eval_` prefix, these three hold the TRAIN low/medium/high rows.
eval_dpo_low = Dataset.from_pandas(pd.DataFrame(dataset_low))
eval_dpo_medium = Dataset.from_pandas(pd.DataFrame(dataset_me))
eval_dpo_high = Dataset.from_pandas(pd.DataFrame(dataset_high))
# NOTE(review): the name `eval` shadows the Python builtin; a later cell reads it, so it is kept as-is.
eval=Dataset.from_pandas(pd.DataFrame(dataset_eval))
test=Dataset.from_pandas(pd.DataFrame(dataset_test))
# Create the new dataset dictionary
new_dataset = DatasetDict({
    'train_low':eval_dpo_low,
    'train_medium':eval_dpo_medium,
    'train_high':eval_dpo_high,
    'eval': eval ,
    'test': test
})


# Report how many DPO rows ended up in the eval and test splits.
print(f"Eval DPO samples: {len(eval)}")
print(f"Test DPO samples: {len(test)}")



Eval DPO samples: 9200
Test DPO samples: 21465


In [417]:
import random
import json
#negative sample

def left_trim(span: str):
    """Drop the first whitespace-separated token of ``span``.

    Returns the remaining tokens joined by single spaces, or "" when the span
    has fewer than two tokens.
    """
    words = span.split()
    if len(words) < 2:
        return ""
    return " ".join(words[1:])

def right_trim(span: str):
    """Drop the last whitespace-separated token of ``span``.

    Returns the remaining tokens joined by single spaces, or "" when the span
    has fewer than two tokens.
    """
    words = span.split()
    if len(words) < 2:
        return ""
    return " ".join(words[:-1])

def both_trim(span: str):
    """Drop the first and last whitespace-separated tokens of ``span``.

    Returns the inner tokens joined by single spaces, or "" when the span has
    fewer than three tokens.
    """
    words = span.split()
    if len(words) < 3:
        return ""
    return " ".join(words[1:-1])
# Sampling weights used when the caller passes no `mode_weights`.
_DEFAULT_MODE_WEIGHTS = {
    "sentence_repeat": 0.05,
    "trim": 0.05,
    "random": 0.05,
    "random_span": 0.05,
    "broken": 0.05,
    "original": 0.75,
}


def _coerce_spans(value):
    """Return ``value`` as a list of spans.

    Lists/tuples are copied into a list. A string is parsed as a Python or
    JSON list literal when possible (the DPO rows store span lists as
    ``str(list)``); any other non-empty string is treated as one span.
    ``None`` and blank strings give ``[]``.
    """
    import ast

    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)

    text = str(value).strip()
    if not text:
        return []
    for parser in (ast.literal_eval, json.loads):
        try:
            parsed = parser(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
    return [text]


def get_negative_sample(sentence, chosen, reject, mode_weights=None):
    """Produce one rejected (negative) output string for a DPO example.

    A corruption mode is drawn at random according to ``mode_weights``:

    * ``"sentence_repeat"``: JSON list holding the whole sentence 0-5 times.
    * ``"random"``: JSON object with random ``start``/``end`` offsets.
    * ``"random_span"``: JSON list of 0-5 random substrings of the sentence.
    * ``"broken"``: JSON list holding one fragment that carries a dangling
      ``["`` prefix or ``"]`` suffix.
    * ``"trim"``: JSON list with every gold span in ``chosen`` shortened by
      dropping its first token, its last token, or both.
    * ``"original"``: the existing ``reject`` value.

    Args:
        sentence: The input sentence (a string).
        chosen: The gold spans, as a list or as its string form.
        reject: The existing rejected output, as a string or a list.
        mode_weights: Mapping of mode name to sampling weight. Defaults to
            ``_DEFAULT_MODE_WEIGHTS`` when ``None`` or empty (the original
            crashed on ``None.items()``).

    Returns:
        The rejected output as a string.

    Raises:
        ValueError: if the drawn mode name is not one of the modes above
            (previously this silently returned ``None``).
    """
    if not mode_weights:
        mode_weights = _DEFAULT_MODE_WEIGHTS
    modes, weights = zip(*mode_weights.items())
    choice = random.choices(modes, weights, k=1)[0]

    len_sentence = len(sentence)

    if choice == "sentence_repeat":
        repeat = random.randint(0, 5)
        return json.dumps([sentence] * repeat, ensure_ascii=False)

    if choice == "random":
        # The offsets are drawn independently, so `end` may be smaller than `start`.
        start = random.randint(0, len_sentence)
        end = random.randint(0, len_sentence)
        return json.dumps({"start": start, "end": end}, ensure_ascii=False)

    if choice == "random_span":
        repeat = random.randint(0, 5)
        rejected = []
        for _ in range(repeat):
            start = random.randint(0, len_sentence)
            end = random.randint(start, len_sentence)
            rejected.append(sentence[start:end])
        return json.dumps(rejected, ensure_ascii=False)

    if choice == "broken":
        kind = random.choice(["broken_left", "broken_right"])
        # Offsets may run one past the end; slicing clamps them.
        start = random.randint(0, len_sentence + 1)
        end = random.randint(start, len_sentence + 1)
        frag = sentence[start:end]
        broken = f'["{frag}' if kind == 'broken_right' else f'{frag}"]'
        # NOTE(review): the broken fragment is wrapped in a well-formed JSON
        # list, as before — confirm a bare malformed string was not intended.
        return json.dumps([broken], ensure_ascii=False)

    if choice == "trim":
        rejected = []
        # `chosen` usually arrives as the string form of a list. Iterating it
        # directly walked over single characters, so parse it into spans first.
        for gold in _coerce_spans(chosen):
            rn = random.randint(0, 2)
            trimmer = (left_trim, right_trim, both_trim)[rn]
            rejected.append(trimmer(str(gold)))
        # Return after ALL spans are processed. The original returned inside
        # the loop (first span only) and fell through to None for empty input.
        return json.dumps(rejected, ensure_ascii=False)

    if choice == "original":
        # An already-serialised string is passed through untouched:
        # json.dumps on it would wrap it in an extra pair of quotes.
        if isinstance(reject, str):
            return reject if reject else "[]"
        return json.dumps(reject, ensure_ascii=False) if reject else "[]"

    raise ValueError(f"unknown negative-sampling mode: {choice!r}")
    
    

In [418]:
def add_negative_row(example):
    """Return a replacement "rejected" value for one DPO example.

    Calls ``get_negative_sample`` with the example's "input", "chosen" and
    "rejected" fields and the mode weights below, and returns the result as
    ``{"rejected": ...}`` for use with ``Dataset.map``.
    """
    # Alternative weighting kept for reference:
    # mode_weights = {"sentence_repeat":0.15,"trim": 0.15, "random": 0.20,"random_span":0.15, "broken": 0.15,"original":0.20}
    weights = {
        "sentence_repeat": 0.05,
        "trim": 0.05,
        "random": 0.05,
        "random_span": 0.05,
        "broken": 0.05,
        "original": 0.75,
    }
    new_rejected = get_negative_sample(
        sentence=example["input"],
        chosen=example["chosen"],
        reject=example["rejected"],
        mode_weights=weights,
    )
    return {"rejected": new_rejected}
    

# Regenerate the "rejected" column of every split, in the order
# train low -> train medium -> train high -> eval -> test.
processed_low, processed_medium, processed_high, processed_eval, processed_test = (
    split_ds.map(add_negative_row, desc="Generating negative samples")
    for split_ds in (eval_dpo_low, eval_dpo_medium, eval_dpo_high, eval, test)
)


Generating negative samples:   0%|          | 0/99395 [00:00<?, ? examples/s]

Generating negative samples:   0%|          | 0/99195 [00:00<?, ? examples/s]

Generating negative samples:   0%|          | 0/96110 [00:00<?, ? examples/s]

Generating negative samples:   0%|          | 0/9200 [00:00<?, ? examples/s]

Generating negative samples:   0%|          | 0/21465 [00:00<?, ? examples/s]

In [419]:
# Bundle the splits whose "rejected" column was regenerated by add_negative_row.
dataset_dict = DatasetDict({
    "train_low": processed_low,
    "train_medium": processed_medium,
    "train_high": processed_high,
    "eval": processed_eval,
    "test": processed_test
})
# Upload the processed splits. The original pushed `new_dataset` (the splits
# from BEFORE negative sampling), which left `dataset_dict` unused and the
# generated negatives unpublished.
# NOTE(review): confirm that "haeunkim/final_dataset" is meant to carry the
# processed negatives.
dataset_dict.push_to_hub("haeunkim/final_dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/97 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/883 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/haeunkim/final_dataset/commit/780a9feba569858109565d271388e31efd3f04af', commit_message='Upload dataset', commit_description='', oid='780a9feba569858109565d271388e31efd3f04af', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
#haeunkim/curriculum_learning -> normal sft
# haeunkim/curriculum_learning_sft_dpo_negative (curriculum+dpo)
# {"sentence_repeat":0.15,"trim": 0.15, "random": 0.20,"random_span":0.15, "broken": 0.15,"original":0.20} -> 
#haeunkim/curriculum_learning_dpo_negative(dpo only)
#{"sentence_repeat":0.05,"trim": 0.05, "random": 0.05,"random_span":0.05,"broken": 0.05,"original":0.75} ->
  
