# Dataset

In [None]:
from datasets import load_dataset

dataset_name = 'MRPC'
dataset = load_dataset("glue", dataset_name)
# you can use any of the following config names as a second argument:
"ax", "cola", "mnli", "mnli_matched", 
"mnli_mismatched", "mrpc", "qnli", "qqp", 
"rte", "sst2", "stsb", "wnli"

In [None]:
import jsonlines
import os
from tqdm.auto import trange
dataset_name = dataset_name.upper()
splits = dataset.keys()
# split = 'validation'
for split in splits:
    output_path = f"data/{dataset_name}/{split}"
    os.makedirs(output_path, exist_ok=True)
    with jsonlines.open(os.path.join(output_path, 'all.jsonl'), 'w') as jlWriter:
        objs = []
        keys = list(dataset[split].features.keys())
        data_dict = {key:dataset[split][key] for key in keys}
        for i in trange(dataset[split].num_rows):
            objs.append({key:data_dict[key][i] for key in keys})
        jlWriter.write_all(objs)


# Model

In [3]:
for x in range(1, 16):
    url = f"https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-000{x:02d}-of-00015.safetensors"
    print(url)

https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00001-of-00015.safetensors
https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00002-of-00015.safetensors
https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00003-of-00015.safetensors
https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00004-of-00015.safetensors
https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00005-of-00015.safetensors
https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00006-of-00015.safetensors
https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00007-of-00015.safetensors
https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00008-of-00015.safetensors
https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00009-of-00015.safetensors
https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00010-of-00015.safetensors
https://huggingface.co/Qwen/Qwen-14B-Chat/resolve/main/model-00011-of-00015.safetensors
https://huggingface.co/Qwen/Qwen

In [None]:
from huggingface_hub import hf_hub_download

model_name = "Qwen/Qwen-14B-Chat"
local_dir = f"../../pretrained/{model_name}"
hf_hub_download(
    repo_id=model_name,
    filename="model-00008-of-00015.safetensors",
    cache_dir="/data/jjwang/.cache/hf_cache",
    local_dir=local_dir,
    local_dir_use_symlinks=False,
)

In [None]:
model_name="Qwen/Qwen-14B-Chat"
local_dir="../../pretrained/${model_name}"
nohup huggingface-cli download $model_name --exclude ".safetensors" --local-dir $local_dir --local-dir-use-symlinks False --cache-dir /data/jjwang/.cache/hf_cache > ./downloading.log 2>&1 &

In [None]:
model_name="bert-base-chinese"
local_dir="../../pretrained/${model_name}"
huggingface-cli download $model_name --exclude "*.safetensors" --exclude "*.h5" --exclude "*.ot" --exclude "*.msgpack" --local-dir $local_dir --local-dir-use-symlinks False

# Preprocess and Concat

In [None]:
from pathlib import Path
import pandas as pd

dataset = "QQP"
split = "val"
dataset2lang = {"LCQMC": "zh", "QQP": "en", "BQ": "zh", "MRPC":"en", "QNLI":'en'}
lang = dataset2lang[dataset]
# file_dir = Path(f'generation/results/qwen-14b-chat/QQP/{split}/Rephrase this sentence./')
file_dir = Path(f'generation/results/qwen-14b-chat/{dataset}/{split}/rephrase')

df = pd.DataFrame()

for path in sorted(file_dir.glob("*.jsonl"), key=lambda path: int(path.name.split('-')[0])):
    part = pd.read_json(path, lines=True)
    df = pd.concat((df, part), axis=0)
df.reset_index(drop=True, inplace=True)
df.info()
# df.to_json('data/QQP/train/with_rephrase.jsonl', force_ascii=False, lines=True, orient='records')
# print(df['reason'].str.len().max())
# print(df['reason'].str.len().median())
# print(df['reason'].str.len().mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40430 entries, 0 to 40429
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   question1  40430 non-null  object
 1   question2  40430 non-null  object
 2   label      40430 non-null  int64 
 3   rephrase1  40430 non-null  object
 4   rephrase2  40430 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.5+ MB


In [None]:
import re
from load_data_fns import key_map
# from toolkit.nlp import word_count
# long_mask = ((df['rephrase1'].apply(word_count)>thr) | (df['rephrase2'].apply(word_count)>thr))

df['rephrase1'] = df['rephrase1'].str.strip()
df['rephrase2'] = df['rephrase2'].str.strip()
long_mask = ((df['rephrase1'].str.contains('\n')) | (df['rephrase2'].str.contains('\n')))
print(long_mask.sum())

if lang=='en':
    def postprogess(s: str):
        if '\n' in s:
            splited = s.split('\n')
            if 'rewritten as follows' in splited[0] or 'rephrased as follows' in splited[0] or 'rephrased sentence' in splited[0] or "rephrased version" in splited[0]:
                s =  splited[-1].strip()
                # print(s)
            # elif (m:=re.search(r'(^Rephrased: )(.*)', splited[-1])):
            #     print(s)
            #     s =  m.group(2)
            #     print(s)
            # elif (m:=re.search(r'(^改写后\：|改写为\：)(.*)', splited[-1])):
            #     print(m.group(2))
            #     return m.group(2)
        return s
    df.loc[long_mask, 'rephrase1'] = df[long_mask]['rephrase1'].apply(postprogess)
    df.loc[long_mask, 'rephrase2'] = df[long_mask]['rephrase2'].apply(postprogess)

    long_mask = ((df['rephrase1'].str.contains('\n')) | (df['rephrase2'].str.contains('\n')))
    print(long_mask.sum())

masked_df = df[long_mask]
df = df[~long_mask]

for idx, row in masked_df[:10].iterrows():
    print(row[key_map[dataset][0]], row['rephrase1'], sep='\n------------\n')
    print("===============================================")
    print(row[key_map[dataset][1]], row['rephrase2'], sep='\n------------\n')
    print("######################################################################")

94
35
Does Mike Wazowski blink or wink?
------------
Is Mike Wazowski known for blinking or winking?
Does Mike Wazowski blinks or wink?
------------
Does Mike Wazowski blink or wink? (The same meaning as the original sentence) 

OR

Is it Mike Wazowski blinking or winking? (Slightly different phrasing but with the same meaning)
######################################################################
What the purpose of life on earth?
------------
The intended meaning of the original sentence remains the same after rephrasing:

"What is the reason for our existence on Earth?"
What is ultimate purpose of life?
------------
Can you please provide me with the original sentence so I can rephrase it while maintaining its intended meaning?
######################################################################
My cat had miscarriage 2 days earlier and she is stil bleeding unable to sit properly, seems to be in pain?
------------
The original sentence: "My cat had a miscarriage 2 days earlier and

In [None]:
# pattern = r"[\w ']* original (sentence|text|meaning) .*"
# red_mask = df['rephrase1'].str.contains(pattern, regex=True) | df['rephrase2'].str.contains(pattern, regex=True)
# for idx, row in df[red_mask].iterrows():
#     print(row[key_map[dataset][0]], row['rephrase1'], sep='\n------------\n')
#     print("===============================================")
#     print(row[key_map[dataset][1]], row['rephrase2'], sep='\n------------\n')
#     print("######################################################################")

In [None]:
# if lang=='en':
#     def postprogess(s: str):
#         pattern = r"[\w ']* original (sentence|text|meaning) .*"
#         # if re.search(pattern, s, flags=re.IGNORECASE):
#             # print(s)
#         s = re.sub(pattern, '', s, flags=re.IGNORECASE)
#             # print(s)
#         return s
#     df['rephrase1'] = df['rephrase1'].apply(postprogess)
#     df['rephrase2'] = df['rephrase2'].apply(postprogess)
# for idx, row in df[red_mask].iterrows():
#     print(row[key_map[dataset][0]], row['rephrase1'], sep='\n------------\n')
#     print("===============================================")
#     print(row[key_map[dataset][1]], row['rephrase2'], sep='\n------------\n')
#     print("######################################################################")

In [None]:
import re
# def postprogess(s: str):
#     if re.search(r"[\w ']* original (sentence|text|meaning) .*", s, flags=re.IGNORECASE):
#         print('===============')
#         print(s)
#         print('----------')
#         s = re.sub(r"[\w ']* original (sentence|text|meaning) .*", '', s, flags=re.IGNORECASE)
#         print(s)
#         print('===============')
#     return s
if lang=='en':
    def postprogess(s: str):
        pattern = r"[\w ']* original (sentence|text|meaning) .*"
        # if re.search(pattern, s, flags=re.IGNORECASE):
            # print(s)
        s = re.sub(pattern, '', s, flags=re.IGNORECASE)
            # print(s)
        return s
    df['rephrase1'] = df['rephrase1'].apply(postprogess)
    df['rephrase2'] = df['rephrase2'].apply(postprogess)

    blank_mask = ((df['rephrase1']=='') | (df['rephrase2']==''))
    masked_df = df[blank_mask]
    df = df[~blank_mask]
    print(blank_mask.sum())


155


In [None]:

import re
pt = re.compile(r'rephrase|改写')
fail_mask = ((df['rephrase1'].str.contains(pt, regex=True)) | (df['rephrase2'].str.contains(pt, regex=True)))
print(fail_mask.sum())
masked_df = df[fail_mask]
df = df[~fail_mask]

for idx, row in masked_df[:10].iterrows():
    print(row[key_map[dataset][0]], row['rephrase1'], sep='\n------------\n')
    print("===============================================")
    print(row[key_map[dataset][1]], row['rephrase2'], sep='\n------------\n')
    print("######################################################################")

785
Why did the US invade Iraq in 2003?
------------
What was the reason for the United States' invasion of Iraq in 2003?
What led to the US invading Iraq in 2003?
------------
Can you provide a rephrased version of the sentence that conveys the same meaning as the original?
######################################################################
What is the issue in Baluchistan?
------------
Can you please clarify what specifically you would like me to rephrase?
What exactly is the issue of Baluchistan?
------------
Can you please clarify the specific problem in Baluchistan?
######################################################################
How did you make it into Stanford? What was your high school life like before Stanford?
------------
Can you tell me about your journey to Stanford and what your high school experience was like prior to attending? Please provide details on how you managed to gain admission to this prestigious university.
"""keep looking, don't settle"" its about 

In [None]:
from toolkit.nlp import punctuation_convert
import re
from load_data_fns import key_map
pattern = re.compile(r'^"|"$|^“|”$|^”|“$')

# key_map = {
#     "LCQMC": ("question1", "question2"),
#     "BQ": ("question1", "question2"),
#     "QQP": ("question1", "question2"),
#     "MRPC": ("sentence1", "sentence2"),
# }
df['rephrase1'] = df['rephrase1'].str.strip()
df['rephrase2'] = df['rephrase2'].str.strip()
df[key_map[dataset][0]] = df[key_map[dataset][0]].str.replace(pattern, '', regex=True)
df[key_map[dataset][1]] = df[key_map[dataset][1]].str.replace(pattern, '', regex=True)
df['rephrase1'] = df['rephrase1'].apply(lambda x: punctuation_convert(x, lang).strip()).str.replace(pattern, '', regex=True)
df['rephrase2'] = df['rephrase2'].apply(lambda x: punctuation_convert(x, lang).strip()).str.replace(pattern, '', regex=True)

if (fail_mask:=((df['rephrase1']=='<Failure>') | (df['rephrase2']=='<Failure>'))).any():
    print(fail_mask.sum())
    df = df[~fail_mask]
# if (df['rephrase1']!='<Failure>').all() and (df['rephrase2']!='<Failure>').all():
#     fail_mask = 

# print((df['rephrase1']!='<Failure>').all() and (df['rephrase2']!='<Failure>').all())

In [None]:
import re
if lang=='en':
    pattern = re.compile(r"language model|I'm sorry")
else:
    pattern = re.compile(r"语言模型|很抱歉")

reject_mask = df['rephrase1'].str.contains(pattern, regex=True) | df['rephrase2'].str.contains(pattern, regex=True)
print(reject_mask.sum())
# df[reject_mask].head()
df = df[~reject_mask]

109


In [None]:
from toolkit.nlp import contain_chinese

if lang=='en':
    chinese_mask = df['rephrase1'].apply(contain_chinese) | df['rephrase2'].apply(contain_chinese)
    print(chinese_mask.sum())
    masked_df = df[chinese_mask]
    df = df[~chinese_mask]
else:
    english_mask = ~(df['rephrase1'].apply(contain_chinese) & df['rephrase2'].apply(contain_chinese))
    print(english_mask.sum())
    masked_df = df[english_mask]
    df = df[~english_mask]

for idx, row in masked_df[:3].iterrows():
    print(row['rephrase1'], row['rephrase2'], sep='\n------\n')
    print("=========================")
# df[english_mask].head()

# for idx, row in df[chinese_mask].iterrows():
#     print(row['rephrase1'], row['rephrase2'], sep='\n')
#     print()

374
In Singapore,除了继承财富和中彩票,人们最常采用哪些方式致富?
------
How can one typically achieve wealth creation while working only four days a week?
What is the method for exiting a submerged submarine without allowing water to流入 its interior?
------
How come some marine creatures survive at the bottom of the deepest oceans without being crushed, while submarines implode at a specific depth?
What steps can I take to make my loose vagina tighter?
------
How can I effectively收紧loose vagina?


In [None]:
# # # tmp = df[df['rephrase1'].str.contains(r':|：', regex=True) | df['rephrase2'].str.contains(r':|：', regex=True)]
# # # print(len(tmp))
# long_mask = ((df['rephrase1'].str.contains('\n')) | (df['rephrase2'].str.contains('\n')))
# print(long_mask.sum())
# tmp = df[long_mask]
# for idx, row in tmp.iterrows():
#     print(idx)
#     print('origin')
#     print(row['question1'], row['question2'], sep='\n')
#     print('rephrase')
#     print(row['rephrase1'], row['rephrase2'], sep='\n')
#     print( '--------------------------------------------------------------------------------------------------------------')

In [None]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 38972 entries, 0 to 40429
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   question1  38972 non-null  object
 1   question2  38972 non-null  object
 2   label      38972 non-null  int64 
 3   rephrase1  38972 non-null  object
 4   rephrase2  38972 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.8+ MB


Unnamed: 0,question1,question2,label,rephrase1,rephrase2
0,Why are African-Americans so beautiful?,Why are hispanics so beautiful?,0,What makes African-Americans beautiful?,What makes Hispanics so aesthetically pleasing?
1,I want to pursue PhD in Computer Science about...,I handle social media for a non-profit. Should...,0,What are the current open problems in social n...,As someone responsible for managing social med...
2,Is there a reason why we should travel alone?,What are some reasons to travel alone?,1,What might be the motive for traveling solo?,Can you provide me with some justifications fo...
3,Why are people so obsessed with having a girlf...,How can a single male have a child?,0,What drives people's intense desire for a roma...,What is the way for an unmarried man to father...
4,What are some good baby girl names starting wi...,What are some good baby girl names starting wi...,0,Can you suggest some adorable baby girl names ...,Can you suggest a list of adorable baby girl n...


In [None]:
output_file = f'data/{dataset}/{split}/qwen_with_rephrase_clean.jsonl'
df.to_json(output_file, force_ascii=False, lines=True, orient='records')

In [None]:
# import pandas as pd
# # output_file = 'data/QQP/train/with_rephrase.jsonl'
# df = pd.read_json(output_file, lines=True)

# from toolkit.nlp import contain_chinese
# df[df['rephrase1'].apply(contain_chinese) | df['rephrase2'].apply(contain_chinese)]

In [None]:
# import pandas as pd
# from pathlib import Path

# splits = ['train', 'val', 'test']
# for split in splits:
#     data_file = f"data/BQ/clean/{split}_clean.txt"
#     with open(data_file, 'r') as f:
#         lines = f.readlines()
#     ret = {'question1':[], 'question2':[], 'label': []}
#     for line in lines:
#         q1, q2, label = line.split('\t')
#         q1, q2, label = q1.strip(), q2.strip(), int(label)
#         ret['question1'].append(q1)
#         ret['question2'].append(q2)
#         ret['label'].append(label)
#     df = pd.DataFrame(ret)
#     output_dir = Path(f"data/BQ/{split}")
#     output_dir.mkdir(exist_ok=True)
#     df.to_json(output_dir/"all.jsonl", orient='records', lines=True, force_ascii=False)
#     # break

### no drop

In [None]:
#########

In [None]:
from pathlib import Path
import pandas as pd

dataset = "QQP"
split = "val"
dataset2lang = {"LCQMC": "zh", "QQP": "en", "BQ": "zh", "MRPC":"en"}
lang = dataset2lang[dataset]
# file_dir = Path(f'generation/results/qwen-14b-chat/QQP/{split}/Rephrase this sentence./')
file_dir = Path(f'generation/results/qwen-14b-chat/{dataset}/{split}/rephrase')

df = pd.DataFrame()

for path in sorted(file_dir.glob("*.jsonl"), key=lambda path: int(path.name.split('-')[0])):
    part = pd.read_json(path, lines=True)
    df = pd.concat((df, part), axis=0)
df.reset_index(drop=True, inplace=True)
df.info()
# df.to_json('data/QQP/train/with_rephrase.jsonl', force_ascii=False, lines=True, orient='records')
# print(df['reason'].str.len().max())
# print(df['reason'].str.len().median())
# print(df['reason'].str.len().mean())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40430 entries, 0 to 40429
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   question1  40430 non-null  object
 1   question2  40430 non-null  object
 2   label      40430 non-null  int64 
 3   rephrase1  40430 non-null  object
 4   rephrase2  40430 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.5+ MB


In [None]:
import re
from load_data_fns import key_map
# from toolkit.nlp import word_count
# long_mask = ((df['rephrase1'].apply(word_count)>thr) | (df['rephrase2'].apply(word_count)>thr))

df['rephrase1'] = df['rephrase1'].str.strip()
df['rephrase2'] = df['rephrase2'].str.strip()
long_mask = ((df['rephrase1'].str.contains('\n')) | (df['rephrase2'].str.contains('\n')))
print(long_mask.sum())

if lang=='en':
    def postprogess(s: str):
        if '\n' in s:
            splited = s.split('\n')
            if 'rewritten as follows' in splited[0] or 'rephrased as follows' in splited[0] or 'rephrased sentence' in splited[0] or "rephrased version" in splited[0]:
                s =  splited[-1].strip()
                # print(s)
            # elif (m:=re.search(r'(^Rephrased: )(.*)', splited[-1])):
            #     print(s)
            #     s =  m.group(2)
            #     print(s)
            # elif (m:=re.search(r'(^改写后\：|改写为\：)(.*)', splited[-1])):
            #     print(m.group(2))
            #     return m.group(2)
        return s
    df.loc[long_mask, 'rephrase1'] = df[long_mask]['rephrase1'].apply(postprogess)
    df.loc[long_mask, 'rephrase2'] = df[long_mask]['rephrase2'].apply(postprogess)

    long_mask = ((df['rephrase1'].str.contains('\n')) | (df['rephrase2'].str.contains('\n')))
    print(long_mask.sum())
mask = long_mask

94
35


In [None]:
import re
# def postprogess(s: str):
#     if re.search(r"[\w ']* original (sentence|text|meaning) .*", s, flags=re.IGNORECASE):
#         print('===============')
#         print(s)
#         print('----------')
#         s = re.sub(r"[\w ']* original (sentence|text|meaning) .*", '', s, flags=re.IGNORECASE)
#         print(s)
#         print('===============')
#     return s
if lang=='en':
    def postprogess(s: str):
        s = re.sub(r"[\w ']* original (sentence|text|meaning) .*", '', s, flags=re.IGNORECASE)
        return s
    df['rephrase1'] = df['rephrase1'].apply(postprogess)
    df['rephrase2'] = df['rephrase2'].apply(postprogess)

    blank_mask = ((df['rephrase1']=='') | (df['rephrase2']==''))
    masked_df = df[blank_mask]
    # df = df[~blank_mask]
    print(blank_mask.sum())
    mask = mask|blank_mask
    print(mask.sum())

157
190


In [None]:
import re
pt = re.compile(r'rephrase|改写')
fail_mask = ((df['rephrase1'].str.contains(pt, regex=True)) | (df['rephrase2'].str.contains(pt, regex=True)))
print(fail_mask.sum())
mask = mask|fail_mask
print(mask.sum())

794
975


In [None]:
from toolkit.nlp import punctuation_convert
import re
from load_data_fns import key_map
pattern = re.compile(r'^"|"$|^“|”$|^”|“$')

df['rephrase1'] = df['rephrase1'].str.strip()
df['rephrase2'] = df['rephrase2'].str.strip()
df[key_map[dataset][0]] = df[key_map[dataset][0]].str.replace(pattern, '', regex=True)
df[key_map[dataset][1]] = df[key_map[dataset][1]].str.replace(pattern, '', regex=True)
df['rephrase1'] = df['rephrase1'].apply(lambda x: punctuation_convert(x, lang).strip()).str.replace(pattern, '', regex=True)
df['rephrase2'] = df['rephrase2'].apply(lambda x: punctuation_convert(x, lang).strip()).str.replace(pattern, '', regex=True)

if (fail_mask:=((df['rephrase1']=='<Failure>') | (df['rephrase2']=='<Failure>'))).any():
    print(fail_mask.sum())
    mask = mask|fail_mask
    print(mask.sum())

In [None]:
import re
if lang=='en':
    pattern = re.compile(r"language model|I'm sorry")
else:
    pattern = re.compile(r"语言模型|很抱歉")

reject_mask = df['rephrase1'].str.contains(pattern, regex=True) | df['rephrase2'].str.contains(pattern, regex=True)
print(reject_mask.sum())
mask = mask|reject_mask
print(mask.sum())

115
1084


In [None]:
from toolkit.nlp import contain_chinese

if lang=='en':
    chinese_mask = df['rephrase1'].apply(contain_chinese) | df['rephrase2'].apply(contain_chinese)
    print(chinese_mask.sum())
    masked_df = df[chinese_mask]
    mask = mask|chinese_mask
    print(mask.sum())
    # df = df[~chinese_mask]
else:
    english_mask = ~(df['rephrase1'].apply(contain_chinese) & df['rephrase2'].apply(contain_chinese))
    print(english_mask.sum())
    masked_df = df[english_mask]
    mask = mask|english_mask
    print(mask.sum())
    # df = df[~english_mask]

for idx, row in masked_df[:3].iterrows():
    print(row['rephrase1'], row['rephrase2'], sep='\n------\n')
    print("=========================")


382
1458
In Singapore,除了继承财富和中彩票,人们最常采用哪些方式致富?
------
How can one typically achieve wealth creation while working only four days a week?
What is the method for exiting a submerged submarine without allowing water to流入 its interior?
------
How come some marine creatures survive at the bottom of the deepest oceans without being crushed, while submarines implode at a specific depth?
What steps can I take to make my loose vagina tighter?
------
How can I effectively收紧loose vagina?


In [None]:
df.loc[mask, 'rephrase1'] = df.loc[mask, key_map[dataset][0]]
df.loc[mask, 'rephrase2'] = df.loc[mask, key_map[dataset][1]]
df[mask].head()

Unnamed: 0,question1,question2,label,rephrase1,rephrase2
21,"What is the Sahara, and how do the average tem...","What is the Sahara, and how do the average tem...",1,"What is the Sahara, and how do the average tem...","What is the Sahara, and how do the average tem..."
97,What are the most common ways people get rich ...,Wealth Creation: What are the most common ways...,0,What are the most common ways people get rich ...,Wealth Creation: What are the most common ways...
203,How does a pussy taste?,What does pussy smell like?,0,How does a pussy taste?,What does pussy smell like?
272,Does Mike Wazowski blink or wink?,Does Mike Wazowski blinks or wink?,1,Does Mike Wazowski blink or wink?,Does Mike Wazowski blinks or wink?
360,Why did the US invade Iraq in 2003?,What led to the US invading Iraq in 2003?,1,Why did the US invade Iraq in 2003?,What led to the US invading Iraq in 2003?


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40430 entries, 0 to 40429
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   question1  40430 non-null  object
 1   question2  40430 non-null  object
 2   label      40430 non-null  int64 
 3   rephrase1  40430 non-null  object
 4   rephrase2  40430 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.5+ MB


In [None]:
output_file = f'data/{dataset}/{split}/qwen_with_rephrase_clean_nodrop.jsonl'
df.to_json(output_file, force_ascii=False, lines=True, orient='records')

# Check

## diversity

### LCQMC

In [3]:
import pandas as pd

dataset = "LCQMC"
split = "train"

output_file = f"data/{dataset}/{split}/qwen_with_rephrase_clean_nodrop.jsonl"
df = pd.read_json(output_file, lines=True)
df.head()

Unnamed: 0,question1,question2,label,rephrase1,rephrase2
0,喜欢打篮球的男生喜欢什么样的女生,爱打篮球的男生喜欢什么样的女生,1,喜欢打篮球的男生通常喜欢什么样的女生？,喜欢打篮球的男生喜欢怎样的女生？
1,我手机丢了，我想换个手机,我想买个新手机，求推荐,1,由于我的手机丢失了，我打算更换一部新的手机。,请问有什么好的手机推荐吗？我打算购买一个新的。
2,大家觉得她好看吗,大家觉得跑男好看吗？,0,大家认为她长得漂亮吗？,大家认为《奔跑吧》这个节目怎么样？
3,求秋色之空漫画全集,求秋色之空全集漫画,1,请问哪里可以找到秋色之空漫画的全集？,请问哪里可以找到秋色之空全集的漫画？
4,晚上睡觉带着耳机听音乐有什么害处吗？,孕妇可以戴耳机听音乐吗?,0,戴着耳机在晚上睡觉听音乐会有哪些不良影响？,孕妇是否可以使用耳机听音乐？


In [4]:
from toolkit.metric import self_bleu

s = []
for _, row in df.iterrows():
    # print(row)
    s.append([row["question1"], row["rephrase1"]])
    s.append([row["question2"], row["rephrase2"]])

score = self_bleu(s, language="zh", smoothing_level=1)
score

Calculate self-bleu:   0%|          | 0/477532 [00:00<?, ?it/s]

{'self-bleu4': 0.2524336224560287}

In [1]:
import pandas as pd

dataset = "LCQMC"
split = "train"

output_file = f"data/{dataset}/{split}/qwen_with_rephrase_clean_nodrop.jsonl"
df = pd.read_json(output_file, lines=True)
df.head()

Unnamed: 0,question1,question2,label,rephrase1,rephrase2
0,喜欢打篮球的男生喜欢什么样的女生,爱打篮球的男生喜欢什么样的女生,1,喜欢打篮球的男生通常喜欢什么样的女生？,喜欢打篮球的男生喜欢怎样的女生？
1,我手机丢了，我想换个手机,我想买个新手机，求推荐,1,由于我的手机丢失了，我打算更换一部新的手机。,请问有什么好的手机推荐吗？我打算购买一个新的。
2,大家觉得她好看吗,大家觉得跑男好看吗？,0,大家认为她长得漂亮吗？,大家认为《奔跑吧》这个节目怎么样？
3,求秋色之空漫画全集,求秋色之空全集漫画,1,请问哪里可以找到秋色之空漫画的全集？,请问哪里可以找到秋色之空全集的漫画？
4,晚上睡觉带着耳机听音乐有什么害处吗？,孕妇可以戴耳机听音乐吗?,0,戴着耳机在晚上睡觉听音乐会有哪些不良影响？,孕妇是否可以使用耳机听音乐？


In [3]:
from toolkit.metric import rouge

preds = []
labels = []
for _, row in df.iterrows():
    preds.extend([row["rephrase1"], row["rephrase2"]])
    labels.extend([row["question1"], row["question2"]])
    # s.append([row["question1"], row["rephrase1"]])
    # s.append([row["question2"], row["rephrase2"]])

score = rouge(preds, labels, "zh")
score

Calculating rouge:   0%|          | 0/477532 [00:00<?, ?it/s]

{'rougeL': 0.5372641115171586}

In [2]:
from toolkit.metric import rouge
from tqdm.auto import tqdm

ret = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    ret.append({"ori": row["question1"], "rep": row["rephrase1"], "rougeL": rouge(row["rephrase1"], row["question1"], "zh", tqdm=False)["rougeL"]})
    ret.append({"ori": row["question2"], "rep": row["rephrase2"], "rougeL": rouge(row["rephrase2"], row["question2"], "zh", tqdm=False)["rougeL"]})
    # break

df_rougel = pd.DataFrame(ret)

[2024-04-13 01:16:25,961] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  0%|          | 0/238766 [00:00<?, ?it/s]

In [None]:
for _, row in df_rougel[df_rougel["rougeL"] < 0.3].iterrows():
    print(row["ori"], "\t", row["rep"], "\t", row["rougeL"])

In [None]:
怎么学会沟通 	 如何掌握沟通技巧？ 	 0.2666666805744171
什么鬼片好看 	 哪些恐怖电影值得一看？
怎么保养皮肤好 	 如何做好皮肤保养？ 	 0.25
房子装修流程是什么？ 	 装修房子需要遵循哪些步骤？ 	 0.260869562625885


### QQP

In [8]:
import pandas as pd

dataset = "QQP"
split = "train"

output_file = f"data/{dataset}/{split}/qwen_with_rephrase_clean_nodrop.jsonl"
df = pd.read_json(output_file, lines=True)
df.head()

Unnamed: 0,question1,question2,label,rephrase1,rephrase2
0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0,Can you share your experiences and describe wh...,What degree of preparation is necessary for pa...
1,How do I control my horny emotions?,How do you control your horniness?,1,What strategies can I use to manage my sexual ...,What methods do you use to manage your sexual ...
2,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0,Can you explain what factors might lead to a y...,What are the possible reasons for stool appear...
3,What can one do after MBBS?,What do i do after my MBBS ?,1,What are the post-MBBS options available?,What steps should I take post-completion of my...
4,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0,How can I locate an electrical socket for char...,If a high-speed rail link were established bet...


In [9]:
from toolkit.metric import self_bleu

# df = df[:10000]
s = []
for _, row in df.iterrows():
    # print(row)
    s.append([row["question1"], row["rephrase1"]])
    s.append([row["question2"], row["rephrase2"]])

score = self_bleu(s, language="en", smoothing_level=1)
score

Calculate self-bleu:   0%|          | 0/727692 [00:00<?, ?it/s]

{'self-bleu4': 0.11915685605405273}

In [10]:
from toolkit.metric import rouge
from tqdm.auto import tqdm

ret = []
df = df[:10000]
for _, row in tqdm(df.iterrows(), total=len(df)):
    ret.append({"ori": row["question1"], "rep": row["rephrase1"], "rougeL": rouge(row["rephrase1"], row["question1"], "en", tqdm=False)["rougeL"]})
    ret.append({"ori": row["question2"], "rep": row["rephrase2"], "rougeL": rouge(row["rephrase2"], row["question2"], "en", tqdm=False)["rougeL"]})
    # break

df_rougel = pd.DataFrame(ret)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [13]:
sum(df_rougel["rougeL"] < 0.2)

4216

In [None]:
for _, row in df_rougel[df_rougel["rougeL"] < 0.2].iterrows():
    print(row["ori"], "\t", row["rep"], "\t", row["rougeL"])

In [None]:
How do I lose weight fast? 	 What's a quick way to shed pounds? 	 0.0
What is the best way to reduce weight fast? 	 How can one quickly lose weight effectively? 	 0.125
How can I speak English fluently and fast? 	 What are the ways to achieve fluent and rapid speaking in English? 	 0.10000000149011612
What do I do to increase my height? 	 How can I grow taller? 	 0.1538461595773697

### BQ

In [12]:
import pandas as pd

dataset = "BQ"
split = "train"

output_file = f"data/{dataset}/{split}/qwen_with_rephrase_clean_nodrop.jsonl"
df = pd.read_json(output_file, lines=True)

from toolkit.metric import self_bleu

# df = df[:10000]
s = []
for _, row in df.iterrows():
    # print(row)
    s.append([row["question1"], row["rephrase1"]])
    s.append([row["question2"], row["rephrase2"]])

score = self_bleu(s, language="zh", smoothing_level=1)
score

Calculate self-bleu:   0%|          | 0/200000 [00:00<?, ?it/s]

{'self-bleu4': 0.213254426817174}

### MRPC

In [13]:
import pandas as pd

dataset = "MRPC"
split = "train"

output_file = f"data/{dataset}/{split}/qwen_with_rephrase_clean_nodrop.jsonl"
df = pd.read_json(output_file, lines=True)

from toolkit.metric import self_bleu

# df = df[:10000]
s = []
for _, row in df.iterrows():
    # print(row)
    s.append([row["sentence1"], row["rephrase1"]])
    s.append([row["sentence2"], row["rephrase2"]])

score = self_bleu(s, language="en", smoothing_level=1)
score

Calculate self-bleu:   0%|          | 0/7336 [00:00<?, ?it/s]

{'self-bleu4': 0.13904351701205758}

## hallucination

### LCQMC

In [35]:
# import json

# with open("outputs/LCQMC/bert-base-chinese/ORI/all/Baseline_nodrop_baseline/3/16/3e-05/62/evaluator/step=29846/ANY.json", "r") as f:
#     metric_dict_load, _, _, controversial_cases, confused_cases, definite_cases = json.load(f)

# all_cases = dict()
# all_cases.update(controversial_cases)
# all_cases.update(confused_cases)
# all_cases.update(definite_cases)
# from sklearn.metrics import accuracy_score

# all_labels = [value["labels"] for value in all_cases.values()]
# all_preds = [value["ori_labels"][0] for value in all_cases.values()]
# print(round(accuracy_score(all_labels, all_preds), 2))

# all_preds_paraphrase = [value["ori_labels"][-1] for value in all_cases.values()]
# print(round(accuracy_score(all_labels, all_preds_paraphrase), 2))

# print(round(accuracy_score(all_preds, all_preds_paraphrase), 2))

In [36]:
from load_data_fns import LOAD_DATA_FNS, DatasetName, TextType

# from torch.utils.data import DataLoader
from transformers import RobertaTokenizer
from toolkit.nlp import TextDataset
from toolkit.nlp import NLPTrainingConfig
from toolkit.enums import Split
from transformers import AutoTokenizer

NLPTrainingConfig.log_custom_param = False

split = "train"
model_path = "outputs/LCQMC/bert-base-chinese/ORI/all/Baseline_nodrop_baseline/3/16/3e-05/62/optimal_checkpoint"
# model_path = "outputs/LCQMC/bert-base-chinese/DATA_AUG_REP4/all/nodrop_single_model_hardcases_from_baseline_warmboost_fix_num_ratio=0.8/seed_of_stage1=109/1/16/2e-06/11/optimal_checkpoint"
# model_type = model_path.split("/")[2]
# dataset_name  = model_path.split("/")[1]
config = NLPTrainingConfig.load(model_path)
# config.text_type = "CHECK_HAL2"
config.text_type = "DATA_AUG_REP4"

# config.test_file_path="./data/LCQMC/test/qwen_with_rephrase_clean_nodrop.jsonl"
tokenizer = AutoTokenizer.from_pretrained(model_path)

dataset = TextDataset.from_file(
    config.train_file_path,
    tokenizer,
    split=Split.ANY,
    configs=config,
    load_data_fn=LOAD_DATA_FNS[DatasetName[config.dataset_name]],
    text_type=TextType[config.text_type],
    dataset_name=DatasetName[config.dataset_name],
    use_cache=True,
)
dataset.report()

2024-06-09 20:37:37,048 <DEBUG> TextDataset: ⏳ Loading ANY dataset ...[0m
2024-06-09 20:37:37,049 <DEBUG> TextDataset: 🔒 Applying for read lock ...[0m
2024-06-09 20:37:37,049 <DEBUG> TextDataset: 💿 Loading dataset from cache ...[0m
2024-06-09 20:38:04,297 <DEBUG> TextDataset: ✔️  Load successfully.[0m
2024-06-09 20:38:04,302 <DEBUG> TextDataset: ⌛ Loading ANY data takes 27.25 sec.[0m
2024-06-09 20:38:04,307[32m <INFO> [toolkit]: Total TRAINING data: 238766[0m
2024-06-09 20:38:04,313[32m <INFO> [toolkit]: Max length of input: 241[0m
2024-06-09 20:38:04,318[32m <INFO> [toolkit]: Max length of label: 1[0m
2024-06-09 20:38:07,961[32m <INFO> [toolkit]: ✂️  Truncating ANY data: cnt=0, input_len=241, label_len=1[0m
2024-06-09 20:38:07,975[32m <INFO> [toolkit]: Total TRAINING data: 238766[0m
2024-06-09 20:38:07,982[32m <INFO> [toolkit]: Max length of input: 241[0m
2024-06-09 20:38:07,989[32m <INFO> [toolkit]: Max length of label: 1[0m


In [37]:
# from toolkit.training import Evaluator
from model.MatchModel_binary_classification import BertModel_binary_classify, RobertaModel_binary_classify, RobertaModel_rephrase, BertModel_rephrase

from toolkit.enums import Split
from utils.evaluate import Evaluator1

if "roberta" in config.model_type:
    pass
else:
    model = BertModel_rephrase.from_pretrained(model_path)
config.batch_size_infer = 64
Evaluator1.confused_use_ot = True
Evaluator1.save_results = False
Evaluator1.just_return_metric = False
evaluator = Evaluator1(
    "classify",
    Split.ANY,
    config=config,
    model=model,
    tokenizer=tokenizer,
    dataset=dataset,
    # dataset_name=DatasetName[dataset_name],
    extral_args_evaluation={"is_train": False},
)
metric_dict, bad_cases, good_cases_idxs, controversial_cases, confused_cases, definite_cases, all_logits, all_labels = evaluator.eval(cuda_id=1)
print(metric_dict)

ANY:   0%|          | 0/3731 [00:00<?, ?batch/s]

Consistent:
171586/238766  71.86%
acc: 97.21%	f1: 96.90%

Inconsistent: 
67180/238766  28.14%
acc: 93.15%	f1: 96.25%

controversial:
33369/238766  13.98%
acc: 94.62%	f1: 97.14%

confused:
33811/238766  14.16%
acc: 91.70%	f1: 95.32%

definite:
171586/238766  71.86%
acc: 97.21%	f1: 96.90%

4602.0
94.62
{'accuracy': 96.0647663402662, 'F1-score': 96.6121251018598, 'loss': 1.687476546522824}


In [38]:
all_cases = dict()
all_cases.update(controversial_cases)
all_cases.update(confused_cases)
all_cases.update(definite_cases)
from sklearn.metrics import accuracy_score

all_labels = [value["labels"] for value in all_cases.values()]
all_preds = [value["ori_labels"][0] for value in all_cases.values()]
print(round(accuracy_score(all_labels, all_preds), 2))

all_preds_paraphrase = [value["ori_labels"][-1] for value in all_cases.values()]
print(round(accuracy_score(all_labels, all_preds_paraphrase), 2))

print(round(accuracy_score(all_preds, all_preds_paraphrase), 2))

0.96
0.82
0.83


### BQ

In [29]:
# import json

# with open("outputs/BQ/bert-base-chinese/ORI/all/Baseline_nodrop_baseline/3/16/3e-05/97/evaluator/step=12500/ANY.json", "r") as f:
#     metric_dict_load, _, _, controversial_cases, confused_cases, definite_cases = json.load(f)

# all_cases = dict()
# all_cases.update(controversial_cases)
# all_cases.update(confused_cases)
# all_cases.update(definite_cases)
# from sklearn.metrics import accuracy_score

# all_labels = [value["labels"] for value in all_cases.values()]
# all_preds = [value["ori_labels"][0] for value in all_cases.values()]
# print(round(accuracy_score(all_labels, all_preds), 2))

# all_preds_paraphrase = [value["ori_labels"][-1] for value in all_cases.values()]
# print(round(accuracy_score(all_labels, all_preds_paraphrase), 2))

# print(round(accuracy_score(all_preds, all_preds_paraphrase), 2))

In [32]:
from load_data_fns import LOAD_DATA_FNS, DatasetName, TextType

# from torch.utils.data import DataLoader
from transformers import RobertaTokenizer
from toolkit.nlp import TextDataset
from toolkit.nlp import NLPTrainingConfig
from toolkit.enums import Split
from transformers import AutoTokenizer

NLPTrainingConfig.log_custom_param = False

split = "train"
model_path = "outputs/BQ/bert-base-chinese/ORI/all/Baseline_nodrop_baseline/3/16/3e-05/97/optimal_checkpoint"
# model_path = "outputs/LCQMC/bert-base-chinese/DATA_AUG_REP4/all/nodrop_single_model_hardcases_from_baseline_warmboost_fix_num_ratio=0.8/seed_of_stage1=109/1/16/2e-06/11/optimal_checkpoint"
# model_type = model_path.split("/")[2]
# dataset_name  = model_path.split("/")[1]
config = NLPTrainingConfig.load(model_path)
# config.text_type = "CHECK_HAL2"
config.text_type = "DATA_AUG_REP4"

# config.test_file_path="./data/LCQMC/test/qwen_with_rephrase_clean_nodrop.jsonl"
tokenizer = AutoTokenizer.from_pretrained(model_path)

dataset = TextDataset.from_file(
    config.train_file_path,
    tokenizer,
    split=Split.ANY,
    configs=config,
    load_data_fn=LOAD_DATA_FNS[DatasetName[config.dataset_name]],
    text_type=TextType[config.text_type],
    dataset_name=DatasetName[config.dataset_name],
    use_cache=True,
)
dataset.report()

2024-06-09 20:30:09,473 <DEBUG> TextDataset: ⏳ Loading ANY dataset ...[0m
2024-06-09 20:30:09,475 <DEBUG> TextDataset: 🔒 Applying for read lock ...[0m
2024-06-09 20:30:09,475 <DEBUG> TextDataset: 💿 Loading dataset from cache ...[0m


2024-06-09 20:30:18,187 <DEBUG> TextDataset: ✔️  Load successfully.[0m
2024-06-09 20:30:18,195 <DEBUG> TextDataset: ⌛ Loading ANY data takes 8.72 sec.[0m
2024-06-09 20:30:18,200[32m <INFO> [toolkit]: Total ANY data: 100000[0m
2024-06-09 20:30:18,204[32m <INFO> [toolkit]: Max length of input: 237[0m
2024-06-09 20:30:18,209[32m <INFO> [toolkit]: Max length of label: 1[0m
2024-06-09 20:30:19,666[32m <INFO> [toolkit]: ✂️  Truncating ANY data: cnt=0, input_len=237, label_len=1[0m
2024-06-09 20:30:19,780[32m <INFO> [toolkit]: Total ANY data: 100000[0m
2024-06-09 20:30:19,786[32m <INFO> [toolkit]: Max length of input: 237[0m
2024-06-09 20:30:19,792[32m <INFO> [toolkit]: Max length of label: 1[0m


In [33]:
# from toolkit.training import Evaluator
from model.MatchModel_binary_classification import BertModel_binary_classify, RobertaModel_binary_classify, RobertaModel_rephrase, BertModel_rephrase

from toolkit.enums import Split
from utils.evaluate import Evaluator1

if "roberta" in config.model_type:
    pass
else:
    model = BertModel_rephrase.from_pretrained(model_path)
config.batch_size_infer = 64
Evaluator1.confused_use_ot = True
Evaluator1.save_results = False
Evaluator1.just_return_metric = False
evaluator = Evaluator1(
    "classify",
    Split.ANY,
    config=config,
    model=model,
    tokenizer=tokenizer,
    dataset=dataset,
    # dataset_name=DatasetName[dataset_name],
    extral_args_evaluation={"is_train": False},
)
metric_dict, bad_cases, good_cases_idxs, controversial_cases, confused_cases, definite_cases, all_logits, all_labels = evaluator.eval(cuda_id=1)
print(metric_dict)

ANY:   0%|          | 0/1563 [00:00<?, ?batch/s]

Consistent:
82560/100000  82.56%
acc: 98.51%	f1: 98.43%

Inconsistent: 
17440/100000  17.44%
acc: 94.66%	f1: 95.84%

controversial:
9044/100000  9.04%
acc: 95.62%	f1: 96.76%

confused:
8396/100000  8.40%
acc: 93.62%	f1: 94.74%

definite:
82560/100000  82.56%
acc: 98.51%	f1: 98.43%

932.0
96.68
{'accuracy': 97.83800000000001, 'F1-score': 97.85046728971962, 'loss': 0.9704271189615807}


In [34]:
all_cases = dict()
all_cases.update(controversial_cases)
all_cases.update(confused_cases)
all_cases.update(definite_cases)
from sklearn.metrics import accuracy_score

all_labels = [value["labels"] for value in all_cases.values()]
all_preds = [value["ori_labels"][0] for value in all_cases.values()]
print(round(accuracy_score(all_labels, all_preds), 2))

all_preds_paraphrase = [value["ori_labels"][-1] for value in all_cases.values()]
print(round(accuracy_score(all_labels, all_preds_paraphrase), 2))

print(round(accuracy_score(all_preds, all_preds_paraphrase), 2))

0.98
0.85
0.86


### QQP

In [39]:
# import json

# with open("outputs/QQP/roberta-base/ORI/all/Baseline_nodrop_baseline/3/16/3e-05/31/evaluator/step=68223/ANY.json", "r") as f:
#     metric_dict_load, _, _, controversial_cases, confused_cases, definite_cases = json.load(f)

# all_cases = dict()
# all_cases.update(controversial_cases)
# all_cases.update(confused_cases)
# all_cases.update(definite_cases)
# from sklearn.metrics import accuracy_score

# all_labels = [value["labels"] for value in all_cases.values()]
# all_preds = [value["ori_labels"][0] for value in all_cases.values()]
# print(round(accuracy_score(all_labels, all_preds), 2))

# all_preds_paraphrase = [value["ori_labels"][-1] for value in all_cases.values()]
# print(round(accuracy_score(all_labels, all_preds_paraphrase), 2))

# print(round(accuracy_score(all_preds, all_preds_paraphrase), 2))

In [46]:
from load_data_fns import LOAD_DATA_FNS, DatasetName, TextType

# from torch.utils.data import DataLoader
from transformers import RobertaTokenizer
from toolkit.nlp import TextDataset
from toolkit.nlp import NLPTrainingConfig
from toolkit.enums import Split
from transformers import AutoTokenizer

NLPTrainingConfig.log_custom_param = False
split = "train"
# model_path = "outputs/QQP/bert-base-uncased/ORI/all/Baseline_nodrop_baseline/3/16/3e-05/78/optimal_checkpoint"
model_path = "outputs/QQP/roberta-base/ORI/all/Baseline_nodrop_baseline/3/16/3e-05/23/optimal_checkpoint"

# model_path = "outputs/LCQMC/bert-base-chinese/DATA_AUG_REP4/all/nodrop_single_model_hardcases_from_baseline_warmboost_fix_num_ratio=0.8/seed_of_stage1=109/1/16/2e-06/11/optimal_checkpoint"
# model_type = model_path.split("/")[2]
# dataset_name  = model_path.split("/")[1]
config = NLPTrainingConfig.load(model_path)
# config.text_type = "CHECK_HAL2"
config.text_type = "DATA_AUG_REP4"
# config.test_file_path="./data/LCQMC/test/qwen_with_rephrase_clean_nodrop.jsonl"
tokenizer = AutoTokenizer.from_pretrained(model_path)

dataset = TextDataset.from_file(
    config.train_file_path,
    tokenizer,
    split=Split.ANY,
    configs=config,
    load_data_fn=LOAD_DATA_FNS[DatasetName[config.dataset_name]],
    text_type=TextType[config.text_type],
    dataset_name=DatasetName[config.dataset_name],
    use_cache=True,
)
dataset.report()

2024-06-09 21:05:51,563 <DEBUG> TextDataset: ⏳ Loading ANY dataset ...[0m
2024-06-09 21:05:51,564 <DEBUG> TextDataset: 🔒 Applying for read lock ...[0m
2024-06-09 21:05:51,570 <DEBUG> TextDataset: 💿 Loading dataset from cache ...[0m


2024-06-09 21:06:49,538 <DEBUG> TextDataset: ✔️  Load successfully.[0m
2024-06-09 21:06:49,548 <DEBUG> TextDataset: ⌛ Loading ANY data takes 57.98 sec.[0m
2024-06-09 21:06:49,554[32m <INFO> [toolkit]: Total TRAINING data: 363846[0m
2024-06-09 21:06:49,558[32m <INFO> [toolkit]: Max length of input: 331[0m
2024-06-09 21:06:49,564[32m <INFO> [toolkit]: Max length of label: 1[0m
2024-06-09 21:06:56,035[32m <INFO> [toolkit]: ✂️  Truncating ANY data: cnt=0, input_len=331, label_len=1[0m
2024-06-09 21:06:56,060[32m <INFO> [toolkit]: Total TRAINING data: 363846[0m
2024-06-09 21:06:56,070[32m <INFO> [toolkit]: Max length of input: 331[0m
2024-06-09 21:06:56,082[32m <INFO> [toolkit]: Max length of label: 1[0m


In [47]:
# from toolkit.training import Evaluator
from model.MatchModel_binary_classification import BertModel_binary_classify, RobertaModel_binary_classify, RobertaModel_rephrase, BertModel_rephrase

from toolkit.enums import Split
from utils.evaluate import Evaluator1

if "roberta" in config.model_type:
    pass
else:
    model = BertModel_rephrase.from_pretrained(model_path)
config.batch_size_infer = 64
Evaluator1.confused_use_ot = True
Evaluator1.save_results = False
Evaluator1.just_return_metric = False
evaluator = Evaluator1(
    "classify",
    Split.ANY,
    config=config,
    model=model,
    tokenizer=tokenizer,
    dataset=dataset,
    # dataset_name=DatasetName[dataset_name],
    extral_args_evaluation={"is_train": False},
)
metric_dict, bad_cases, good_cases_idxs, controversial_cases, confused_cases, definite_cases, all_logits, all_labels = evaluator.eval(cuda_id=1)
print(metric_dict)

ANY:   0%|          | 0/5686 [00:00<?, ?batch/s]

Consistent:
241383/363846  66.34%
acc: 78.94%	f1: 67.72%

Inconsistent: 
122463/363846  33.66%
acc: 50.39%	f1: 61.78%

controversial:
49773/363846  13.68%
acc: 48.24%	f1: 62.33%

confused:
72690/363846  19.98%
acc: 51.86%	f1: 61.37%

definite:
241383/363846  66.34%
acc: 78.94%	f1: 67.72%

60753.0
81.99
{'accuracy': 69.3320800558478, 'F1-score': 64.73592395014285, 'loss': 3.5427050873967403}


In [48]:
all_cases = dict()
all_cases.update(controversial_cases)
all_cases.update(confused_cases)
all_cases.update(definite_cases)
from sklearn.metrics import accuracy_score

all_labels = [value["labels"] for value in all_cases.values()]
all_preds = [value["ori_labels"][0] for value in all_cases.values()]
print(round(accuracy_score(all_labels, all_preds), 2))

all_preds_paraphrase = [value["ori_labels"][-1] for value in all_cases.values()]
print(round(accuracy_score(all_labels, all_preds_paraphrase), 2))

print(round(accuracy_score(all_preds, all_preds_paraphrase), 2))

0.69
0.7
0.82


### MRPC

In [28]:
# import json

# with open("outputs/MRPC/roberta-base/ORI/all/Baseline_nodrop_baseline/3/16/2e-05/13/evaluator/step=575/ANY.json", "r") as f:
#     metric_dict_load, _, _, controversial_cases, confused_cases, definite_cases = json.load(f)

# all_cases = dict()
# all_cases.update(controversial_cases)
# all_cases.update(confused_cases)
# all_cases.update(definite_cases)
# from sklearn.metrics import accuracy_score

# all_labels = [value["labels"] for value in all_cases.values()]
# all_preds = [value["ori_labels"][0] for value in all_cases.values()]
# print(round(accuracy_score(all_labels, all_preds), 2))

# all_preds_paraphrase = [value["ori_labels"][-1] for value in all_cases.values()]
# print(round(accuracy_score(all_labels, all_preds_paraphrase), 2))

# print(round(accuracy_score(all_preds, all_preds_paraphrase), 2))

In [43]:
from load_data_fns import LOAD_DATA_FNS, DatasetName, TextType

# from torch.utils.data import DataLoader
from transformers import RobertaTokenizer
from toolkit.nlp import TextDataset
from toolkit.nlp import NLPTrainingConfig
from toolkit.enums import Split
from transformers import AutoTokenizer

NLPTrainingConfig.log_custom_param = False

split = "train"
# model_path = "outputs/MRPC/bert-base-uncased/ORI/all/Baseline_nodrop_baseline/3/16/2e-05/43/optimal_checkpoint"
model_path = "outputs/MRPC/roberta-base/ORI/all/Baseline_nodrop_baseline/3/16/2e-05/13/optimal_checkpoint"

# model_path = "outputs/LCQMC/bert-base-chinese/DATA_AUG_REP4/all/nodrop_single_model_hardcases_from_baseline_warmboost_fix_num_ratio=0.8/seed_of_stage1=109/1/16/2e-06/11/optimal_checkpoint"
# model_type = model_path.split("/")[2]
# dataset_name  = model_path.split("/")[1]
config = NLPTrainingConfig.load(model_path)
# config.text_type = "CHECK_HAL2"
config.text_type = "DATA_AUG_REP4"
# config.test_file_path="./data/LCQMC/test/qwen_with_rephrase_clean_nodrop.jsonl"
tokenizer = AutoTokenizer.from_pretrained(model_path)

dataset = TextDataset.from_file(
    config.train_file_path,
    tokenizer,
    split=Split.ANY,
    configs=config,
    load_data_fn=LOAD_DATA_FNS[DatasetName[config.dataset_name]],
    text_type=TextType[config.text_type],
    dataset_name=DatasetName[config.dataset_name],
    use_cache=True,
)
dataset.report()

2024-06-09 21:05:16,346 <DEBUG> TextDataset: ⏳ Loading ANY dataset ...[0m
2024-06-09 21:05:16,347 <DEBUG> TextDataset: 🔒 Applying for read lock ...[0m
2024-06-09 21:05:16,348 <DEBUG> TextDataset: 💿 Loading dataset from cache ...[0m
2024-06-09 21:05:16,955 <DEBUG> TextDataset: ✔️  Load successfully.[0m
2024-06-09 21:05:16,960 <DEBUG> TextDataset: ⌛ Loading ANY data takes 0.61 sec.[0m
2024-06-09 21:05:16,961[32m <INFO> [toolkit]: Total ANY data: 3668[0m
2024-06-09 21:05:16,962[32m <INFO> [toolkit]: Max length of input: 112[0m
2024-06-09 21:05:16,962[32m <INFO> [toolkit]: Max length of label: 1[0m
2024-06-09 21:05:17,037[32m <INFO> [toolkit]: ✂️  Truncating ANY data: cnt=0, input_len=112, label_len=1[0m
2024-06-09 21:05:17,038[32m <INFO> [toolkit]: Total ANY data: 3668[0m
2024-06-09 21:05:17,039[32m <INFO> [toolkit]: Max length of input: 112[0m
2024-06-09 21:05:17,040[32m <INFO> [toolkit]: Max length of label: 1[0m


In [44]:
# from toolkit.training import Evaluator
from model.MatchModel_binary_classification import BertModel_binary_classify, RobertaModel_binary_classify, RobertaModel_rephrase, BertModel_rephrase

from toolkit.enums import Split
from utils.evaluate import Evaluator1

if "roberta" in config.model_type:
    model = RobertaModel_rephrase.from_pretrained(model_path)
else:
    model = BertModel_rephrase.from_pretrained(model_path)
config.batch_size_infer = 64
Evaluator1.confused_use_ot = True
Evaluator1.save_results = False
Evaluator1.just_return_metric = False
evaluator = Evaluator1(
    "classify",
    Split.ANY,
    config=config,
    model=model,
    tokenizer=tokenizer,
    dataset=dataset,
    # dataset_name=DatasetName[dataset_name],
    extral_args_evaluation={"is_train": False},
)
metric_dict, bad_cases, good_cases_idxs, controversial_cases, confused_cases, definite_cases, all_logits, all_labels = evaluator.eval(cuda_id=1)
print(metric_dict)

ANY:   0%|          | 0/58 [00:00<?, ?batch/s]

Consistent:
2891/3668  78.82%
acc: 96.58%	f1: 97.28%

Inconsistent: 
777/3668  21.18%
acc: 92.79%	f1: 95.58%

controversial:
333/3668  9.08%
acc: 92.79%	f1: 95.56%

confused:
444/3668  12.10%
acc: 92.79%	f1: 95.59%

definite:
2891/3668  78.82%
acc: 96.58%	f1: 97.28%

56.000000000000036
94.76
{'accuracy': 95.7742639040349, 'F1-score': 96.8386702019172, 'loss': 1.1851459392185868}


In [45]:
all_cases = dict()
all_cases.update(controversial_cases)
all_cases.update(confused_cases)
all_cases.update(definite_cases)
from sklearn.metrics import accuracy_score

all_labels = [value["labels"] for value in all_cases.values()]
all_preds = [value["ori_labels"][0] for value in all_cases.values()]
print(round(accuracy_score(all_labels, all_preds), 2))

all_preds_paraphrase = [value["ori_labels"][-1] for value in all_cases.values()]
print(round(accuracy_score(all_labels, all_preds_paraphrase), 2))

print(round(accuracy_score(all_preds, all_preds_paraphrase), 2))

0.96
0.84
0.86
