In [117]:
import re, typing, jieba, pypinyin, random

In [73]:
def print_dataset(dataset: typing.List[typing.Tuple[str, str]], n: int = 5):
    print(f"Data samples (length = {len(dataset)}): [")
    for i in range(n):
        print(f'\t{dataset[i]},')
    print(f', ...]')

## 压缩数据集

In [None]:
def parse_bznsyp_dataset(filename: str):
    dataset: typing.List[typing.Tuple[str, str]]  = []
    regex = re.compile('#\d+')
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):
            splitpos = lines[i].index('\t')
            text = lines[i][splitpos+1:].strip('\n')
            text = regex.sub("", text)

            phome = lines[i + 1].strip('\n').strip()
            dataset.append((text, phome))
    return dataset

bznsyp_ds = parse_bznsyp_dataset("assets/text/BZNSYP/000001-010000.txt")
print(f"BZNSYP has {len(bznsyp_ds)} datas.")
print_dataset(bznsyp_ds)

BZNSYP has 10000 datas.
Data samples: [
	('卡尔普陪外孙玩滑梯。', 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1'),
	('假语村言别再拥抱我。', 'jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3'),
	('宝马配挂跛骡鞍，貂蝉怨枕董翁榻。', 'bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4'),
	('邓小平与撒切尔会晤。', 'deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4'),
	('老虎幼崽与宠物犬玩耍。', 'lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3'),
, ...]


In [34]:
def analysis_token_frequency(dataset: typing.List[typing.Tuple[str, str]]):
    maps: typing.Dict[str, typing.Set[int]] = {} # { token: [line numbers] }
    for i, (text, phome) in enumerate(dataset):
        for token in phome.split(' '):
            if token in maps:
                maps[token].add(i)
            else:
                maps[token] = set([i])
    return maps

bznsyp_analysis = analysis_token_frequency(bznsyp_ds)
bznsyp_tokens_counts = sorted([ (key, len(value)) for key, value in bznsyp_analysis.items() ], key=lambda x: x[1])
print(f"BZNSYP has {len(bznsyp_analysis)} tokens.")
print(f"Most active tokens: {bznsyp_tokens_counts[-5:]}")
print(f"Deactive tokens: {bznsyp_tokens_counts[:5]}")

BZNSYP has 1607 tokens.
Most active tokens: [('zai4', 1428), ('le5', 1530), ('yi4', 1653), ('shi4', 2512), ('de5', 4219)]
Deactive tokens: [('wanr1', 1), ('jir1', 1), ('jiang5', 1), ('tie4', 1), ('zuo5', 1)]


In [5]:
def parse_token_file(filename: str):
    token_list: typing.List[str] = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            token = line.strip().strip('\n').split(' ')[0]
            token_list.append(token)
    return token_list

tokens = parse_token_file("assets/dataset/tokens.txt")
print(f"Got pretrained tokens: {len(tokens)}")

Got pretrained tokens: 2069


In [35]:
bznsyp_set = set(list(bznsyp_analysis.keys()))
tokens_set = set(tokens)

print(f"Tokens that BZNSYP has but `tokens` hasn't: {len(bznsyp_set - tokens_set)}")
print(f"Tokens that `tokens` has but BZNSYP hasn't: {len(tokens_set - bznsyp_set)}")
print(f"Tokens that both have: {len(bznsyp_set & tokens_set)}")

Tokens that BZNSYP has but `tokens` hasn't: 306
Tokens that `tokens` has but BZNSYP hasn't: 768
Tokens that both have: 1301


In [103]:
def fix_bznsyp_tokens(tokens: typing.List[str]):
    ret: typing.List[str] = []
    for token in tokens:
        if token.endswith('5'): # 5音是轻声
            ret.append(token[:-1])
        elif len(token) >= 3 and token[-2] == 'r' and token[:2] != "er" and not token.startswith('letter'): # 儿音， fur4 => fu4 er2
            ret.append(token[:-2])
            ret.append('er2')
        elif token in ["。", "，", "—", "“", "”", "？", "！", "：", "、", "；", "…"]:
            continue # ignore
        else:
            ret.append(token)
    return ret

In [61]:
def fit_dataset_to_token_set(dataset: typing.List[typing.Tuple[str, str]], tokens_set: typing.Set[str]):
    maps: typing.Dict[str, typing.Set[int]] = { x: set() for x in tokens_set } # { token: [line number] }
    abandon_token: typing.Set[str] = set()
    for i, (text, phome) in enumerate(dataset):
        prepare_to_add: typing.List[str] = []
        skip_this_row = False
        for token in fix_bznsyp_tokens(phome.split(' ')):
            if token not in tokens_set: 
                abandon_token.add(token)
                skip_this_row = True
                break
            else:
                prepare_to_add.append(token)
        if not skip_this_row:
            for token in prepare_to_add:
                maps[token].add(i)
    print(f"Abandon tokens:", abandon_token)
    return maps

token_to_bznsyp_line = fit_dataset_to_token_set(bznsyp_ds, tokens_set)
print(f"len: {len(token_to_bznsyp_line)}")

Abandon tokens: {'shei2', 'P', 'menr', 'zhei4', 'ng1', 'tei1', 'yir'}
len: 2069


In [62]:
def remove_non_character(text: str):
    return text.replace('，','').replace('。','').replace('—','').replace("“",'').replace("”",'').replace('？','').replace('！','').replace('：','').replace('！','').replace('、','').replace('；','').replace('…','')

In [87]:
def sort_by_length(dataset: typing.List[typing.Tuple[str, str]], fit_table: typing.Dict[str, typing.List[int]]):
    ret: typing.Dict[str, typing.List[int]] = {}
    for token, line_numbers in fit_table.items():
        scores = [ 0 for _ in range(len(line_numbers)) ]
        for i, line in enumerate(line_numbers):
            text = dataset[line][0]
            length = len(remove_non_character(text))
            scores[i] = length
        ret[token] = [ x[1] for x in sorted(zip(range(len(line_numbers)), line_numbers), key=lambda i: scores[i[0]], reverse=True) ]
    return ret

def sort_by_freq(dataset: typing.List[typing.Tuple[str, str]], fit_table: typing.Dict[str, typing.List[int]]):
    ret: typing.Dict[str, typing.List[int]] = {}
    for token, line_numbers in fit_table.items():
        scores = [ 0 for _ in range(len(line_numbers)) ]
        for i, line in enumerate(line_numbers):
            for token in fix_bznsyp_tokens(dataset[line][1].split(' ')):
                if token in fit_table:
                    if len(fit_table[token]) == 1: # 稀有的token需要更容易选中
                        scores[i] += 100
                    elif len(fit_table[token]) == 2:
                        scores[i] += 1
        ret[token] = [ x[1] for x in sorted(zip(range(len(line_numbers)), line_numbers), key=lambda i: scores[i[0]], reverse=True) ]
    return ret

def select_first_n(fit_table: typing.Dict[str, typing.List[int]], n: int):
    ret: typing.Dict[str, typing.List[int]] = {}
    for token, line_numbers in fit_table.items():
        for line in line_numbers[:n]:
            if token not in ret:
                ret[token] = [line]
            else:
                ret[token].append(line)
    return ret

def flatten_select_result(select_result: typing.Dict[str, typing.List[int]]):
    select_sets: typing.Set[int] = set()
    for key, value in select_result.items():
        for v in value:
            select_sets.add(v)
    return select_sets

In [88]:
sorted_bznsyp_ds = { key: list(value) for key, value in token_to_bznsyp_line.items() }
# sorted_bznsyp_ds = sort_by_length(bznsyp_ds, sorted_bznsyp_ds)
# sorted_bznsyp_ds = select_first_n(sorted_bznsyp_ds, 4)
sorted_bznsyp_ds = sort_by_freq(bznsyp_ds, sorted_bznsyp_ds)
sorted_bznsyp_ds = select_first_n(sorted_bznsyp_ds, 3)

selected_dataset = [ bznsyp_ds[i] for i in flatten_select_result(sorted_bznsyp_ds) ]
subset_set = set(list(analysis_token_frequency(selected_dataset).keys()))
print(f"New dataset token converage: {len(tokens_set & subset_set)} compare to original {len(tokens_set & bznsyp_set)}")
print(f"New dataset counts: {len(selected_dataset)} compare to original {len(bznsyp_ds)}")
print_dataset(selected_dataset)

New dataset token converage: 1215 compare to original 1301
New dataset counts: 950 compare to original 10000
Data samples (length = 950): [
	('卡尔普陪外孙玩滑梯。', 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1'),
	('苦涩的沙吹痛脸庞的感觉。', 'ku3 se4 de5 sha1 chui1 tong4 lian3 pang2 de5 gan3 jue2'),
	('宝马配挂跛骡鞍，貂蝉怨枕董翁榻。', 'bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4'),
	('在“三嬢”陈章淑眼里，媛媛懂事、嘴甜、聪明，见人就会喊。', 'zai4 san1 niang2 chen2 shu1 zhang1 yan2 li3 yuan2 yuan5 dong3 shi4 zui3 tian2 cong1 ming5 jian4 ren2 jiu4 hui4 han3'),
	('老虎幼崽与宠物犬玩耍。', 'lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3'),
, ...]


In [None]:
def add_extra_character_to_phomes(dataset: typing.List[typing.Tuple[str, str]]):
    modified_ds = dataset.copy()
    for k, (text, phome) in enumerate(dataset):
        newphome = []
        phomes = fix_bznsyp_tokens(phome.split(' '))
        debug_phomes = phomes.copy()
        skip_one = False
        unhandle = False
        for i, ch in enumerate(text):
            if skip_one:
                skip_one = False
                continue
            if ch in ["。", "，", "—", "“", "”", "？", "！", "：", "、", "；", "…"]:
                newphome.append(ch)
            else:
                if len(phomes) == 0:
                    unhandle = True
                    break
                p = phomes.pop(0)
                newphome.append(p)
        if len(phomes) == 0 and not unhandle:
            modified_ds[k] = (text, ' '.join(newphome))
        else:
            print(f"Unhandle: {k}\n== {text}\n== {debug_phomes}\nremain: {phomes}\n")
    return modified_ds

cleaned_ds = add_extra_character_to_phomes(selected_dataset)
print(f"Got final dataset.")
print_dataset(cleaned_ds)

Final dataset: [
	('卡尔普陪外孙玩滑梯。', 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1 。'),
	('苦涩的沙吹痛脸庞的感觉。', 'ku3 se4 de sha1 chui1 tong4 lian3 pang2 de gan3 jue2 。'),
	('宝马配挂跛骡鞍，貂蝉怨枕董翁榻。', 'bao2 ma3 pei4 gua4 bo3 luo2 an1 ， diao1 chan2 yuan4 zhen3 dong3 weng1 ta4 。'),
	('在“三嬢”陈章淑眼里，媛媛懂事、嘴甜、聪明，见人就会喊。', 'zai4 “ san1 niang2 ” chen2 shu1 zhang1 yan2 li3 ， yuan2 yuan dong3 shi4 、 zui3 tian2 、 cong1 ming ， jian4 ren2 jiu4 hui4 han3 。'),
	('老虎幼崽与宠物犬玩耍。', 'lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3 。'),
, ...]


In [109]:
def save_v2_dataset(filename: str, dataset: typing.List[typing.Tuple[str, str]]):
	with open(filename, 'w', encoding='utf-8') as f:
		for (text, phome) in dataset:
			f.write(f"{text}\n{phome}\n")

save_v2_dataset("assets/text/mandarin_v2_train.txt", cleaned_ds)
print("Done.")

Done.


## 融合v1数据集

In [80]:
def parse_v2_dataset(filename: str):
    dataset: typing.List[typing.Tuple[str, str]] = []
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):
            text = lines[i].strip().strip('\n')
            phome = lines[i+1].strip().strip('\n')
            dataset.append((text, phome))
    return dataset

cleaned_ds = parse_v2_dataset("assets/text/mandarin_v2_train.txt")
print_dataset(cleaned_ds)

Data samples (length = 950): [
	('卡尔普陪外孙玩滑梯。', 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1 。'),
	('苦涩的沙吹痛脸庞的感觉。', 'ku3 se4 de sha1 chui1 tong4 lian3 pang2 de gan3 jue2 。'),
	('宝马配挂跛骡鞍，貂蝉怨枕董翁榻。', 'bao2 ma3 pei4 gua4 bo3 luo2 an1 ， diao1 chan2 yuan4 zhen3 dong3 weng1 ta4 。'),
	('在“三嬢”陈章淑眼里，媛媛懂事、嘴甜、聪明，见人就会喊。', 'zai4 “ san1 niang2 ” chen2 shu1 zhang1 yan2 li3 ， yuan2 yuan dong3 shi4 、 zui3 tian2 、 cong1 ming ， jian4 ren2 jiu4 hui4 han3 。'),
	('老虎幼崽与宠物犬玩耍。', 'lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3 。'),
, ...]


In [74]:
def parse_v1_dataset(filename: str):
    dataset: typing.List[typing.Tuple[str, str]] = []
    with open(filename, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f.readlines()):
            text = line.strip().strip('\n')
            if text == "": continue
            cuts = list(jieba.cut(text))
            phome = []
            for cut in cuts:
                pinyins = pypinyin.lazy_pinyin(cut, style=pypinyin.Style.TONE3, tone_sandhi=True)
                for pinyin in pinyins:
                    phome.append(pinyin)
            phome = ' '.join(phome)
            dataset.append((text, phome))
    return dataset

v1_ds = parse_v1_dataset("assets/text/mandarin.txt")
print_dataset(v1_ds)

Data samples (length = 729): [
	('他在家里吃苹果，喝茶，打电话，唱歌。', 'ta1 zai4 jia1 li3 chi1 ping2 guo3 ， he1 cha2 ， da3 dian4 hua4 ， chang4 ge1 。'),
	('我去商店买水果和书，看到小猫在玩。', 'wo3 qu4 shang1 dian4 mai3 shui3 guo3 he2 shu1 ， kan4 dao4 xiao3 mao1 zai4 wan2 。'),
	('小明去北京吃炸酱面，他在公园里看见了许多种类的鸟，甚至还遇到了一只大猩猩。', 'xiao3 ming2 qu4 bei3 jing1 chi1 zha2 jiang4 mian4 ， ta1 zai4 gong1 yuan2 li3 kan4 jian4 le xu3 duo1 zhong3 lei4 de niao3 ， shen4 zhi4 hai2 yu4 dao4 le yi4 zhi1 da4 xing1 xing1 。'),
	('小猫学会了跳舞，爬上了高高的树。', 'xiao3 mao1 xue2 hui4 le tiao4 wu3 ， pa2 shang4 le gao1 gao1 de shu4 。'),
	('风筝在天空中飞得又高又快，真是太漂亮了。', 'feng1 zheng1 zai4 tian1 kong1 zhong1 fei1 de2 you4 gao1 you4 kuai4 ， zhen1 shi4 tai4 piao4 liang4 le 。'),
, ...]


In [75]:
v1_ds_set = set(list(analysis_token_frequency(v1_ds).keys()))
print(f"Token converage: {len(tokens_set & v1_ds_set)}")

Token converage: 779


In [81]:
fit_table_v1 = fit_dataset_to_token_set(v1_ds, tokens_set)
fit_table_cleaned_ds = fit_dataset_to_token_set(cleaned_ds, tokens_set)

Abandon tokens: {'5G', 'AI', 'GDPR'}
Abandon tokens: set()


In [90]:
append_v1_to_v2_tokens: typing.List[str] = []
for token, lines in fit_table_v1.items():
    if len(fit_table_cleaned_ds[token]) == 0:
        append_v1_to_v2_tokens.append(token)
print(f"Merge {len(append_v1_to_v2_tokens)} tokens from v1 to v2.")
print(f"Tokens: {append_v1_to_v2_tokens}")

Merge 641 tokens from v1 to v2.
Tokens: ['hei3', 'bin', 'hua3', 'ri3', 'nun1', 'dong2', 'wai', 'ha3', 'qiong3', 'zheng', 'ce1', 'kuan', 'reng', 'tu', 'han', 'gen2', 'zan', 'shai1', 'rui4', 'qun4', 'cen1', 'pie4', 'se2', 'zun3', 'm2', 'qiong1', 'rong1', 'za4', 'sen', 'jiong1', 'neng', 'nuo3', 'lve1', 'nian1', 'ga', 'hm2', 'run1', 'ping3', 'cang3', 'keng4', 'wu', 'gao', 'rua4', 'zeng4', 'dia4', 'mou4', 'niao', 'lan1', 'zan1', 'yo3', 'xiang', 'shuang3', 'cong4', 'nou3', 'hm4', 'lu', 'lu1', 'nin', 'men3', 'pang4', 'yue3', 'hong4', 'zei4', 'zuan', 'lai3', 'seng2', 'mie3', 'an', 'ruan1', 'cun', 'nu', 'nv1', 'yao', 'chui3', 'te', 'zen', 'nie2', 'cen2', 'rua2', 'eng1', 'kui', 'wai1', 'zhuo', 'mou1', 'bing', 'ge3', 'rong3', 'cou2', 'den1', 'mang', 'mie2', 'chun4', 'ken1', 'ceng', 'se3', 'n4', 'chuo2', 'nuo1', 'rang3', 'zun2', 'shuo3', 'qia', 'hen', 'duan2', 'cu1', 'teng4', 'hei', 'diu3', 'san', 'ce', 'kang', 'nang3', ':', 'shua4', 'de1', 'nie', 'rao1', 'chua1', 'qiao', 'zhun4', 'ang', 'xin3', '

In [93]:
sub_fit_table = { key: list(value) for key, value in fit_table_v1.items() if key in append_v1_to_v2_tokens }
sub_fit_table = sort_by_freq(v1_ds, sub_fit_table)
sub_fit_table = select_first_n(sub_fit_table, 3)
sub_v1_ds = [ v1_ds[i] for i in flatten_select_result(sub_fit_table) ]
print_dataset(sub_v1_ds)

Data samples (length = 24): [
	('明天我们会去爬山，打算带上水和干粮，准备好一整天的活动。', 'ming2 tian1 wo3 men hui4 qu4 pa2 shan1 ， da3 suan4 dai4 shang4 shui3 he2 gan1 liang2 ， zhun3 bei4 hao3 yi1 zheng3 tian1 de huo2 dong4 。'),
	('他们家的花园里种了很多蔬菜和水果，看起来非常绿意盎然。', 'ta1 men jia1 de hua1 yuan2 li3 zhong3 le hen3 duo1 shu1 cai4 he2 shui3 guo3 ， kan4 qi3 lai2 fei1 chang2 lv4 yi4 ang4 ran2 。'),
	('小猫学会了跳舞，爬上了高高的树。', 'xiao3 mao1 xue2 hui4 le tiao4 wu3 ， pa2 shang4 le gao1 gao1 de shu4 。'),
	('我真心希望你能够理解我，尽管我很难表达。', 'wo3 zhen1 xin1 xi1 wang4 ni3 neng2 gou4 li3 jie3 wo3 ， jin2 guan3 wo3 hen3 nan2 biao3 da2 。'),
	('心血管疾病是全球致死率最高的疾病之一，早期筛查至关重要。', 'xin1 xue4 guan3 ji2 bing4 shi4 quan2 qiu2 zhi4 si3 lv4 zui4 gao1 de ji2 bing4 zhi1 yi1 ， zao3 qi1 shai1 cha2 zhi4 guan1 zhong4 yao4 。'),
, ...]


In [None]:
save_v2_dataset("assets/text/mandarin_v2_noletters.txt", sub_v1_ds)

## 融合cross letters数据集

In [98]:
crossletters_ds = parse_v1_dataset("assets/text/crossletters.txt")
print_dataset(crossletters_ds)

Data samples (length = 21): [
	('从A到B的计划需要C和D的配合，以确保E可以成功执行。', 'cong2 A dao4 B de ji4 hua4 xu1 yao4 C he2 D de pei4 he2 ， yi3 que4 bao3 E ke2 yi3 cheng2 gong1 zhi2 xing2 。'),
	('你从F到G的过程中，H的支持是不可缺少的，尤其是在I阶段。', 'ni3 cong2 F dao4 G de guo4 cheng2 zhong1 ， H de zhi1 chi2 shi4 bu4 ke3 que1 shao3 de ， you2 qi2 shi4 zai4 I jie1 duan4 。'),
	('在项目进行中，J和K的合作将直接影响C的效率。', 'zai4 xiang4 mu4 jin4 xing2 zhong1 ， J he2 K de he2 zuo4 jiang1 zhi2 jie1 ying3 xiang3 C de xiao4 lv4 。'),
	('如果你想提高L到M的速度，N的调整将非常重要。', 'ru2 guo3 ni3 xiang3 ti2 gao1 L dao4 M de su4 du4 ， N de tiao2 zheng3 jiang1 fei1 chang2 zhong4 yao4 。'),
	('在这次活动中，O的出现为P带来了更大的机会。', 'zai4 zhe4 ci4 huo2 dong4 zhong1 ， O de chu1 xian4 wei4 P dai4 lai2 le geng4 da4 de ji1 hui4 。'),
, ...]


In [99]:
for i in range(len(crossletters_ds)):
    text = crossletters_ds[i][0]
    phome = crossletters_ds[i][1]
    newphome = []
    for x in phome.split(' '):
        if len(x) == 1 and ord('A') <= ord(x[0]) and ord(x[0]) <= ord('Z'):
            newphome.append(f'letter' + x)
        else:
            newphome.append(x)
    crossletters_ds[i] = (text, ' '.join(newphome))
print_dataset(crossletters_ds)

Data samples (length = 21): [
	('从A到B的计划需要C和D的配合，以确保E可以成功执行。', 'cong2 letterA dao4 letterB de ji4 hua4 xu1 yao4 letterC he2 letterD de pei4 he2 ， yi3 que4 bao3 letterE ke2 yi3 cheng2 gong1 zhi2 xing2 。'),
	('你从F到G的过程中，H的支持是不可缺少的，尤其是在I阶段。', 'ni3 cong2 letterF dao4 letterG de guo4 cheng2 zhong1 ， letterH de zhi1 chi2 shi4 bu4 ke3 que1 shao3 de ， you2 qi2 shi4 zai4 letterI jie1 duan4 。'),
	('在项目进行中，J和K的合作将直接影响C的效率。', 'zai4 xiang4 mu4 jin4 xing2 zhong1 ， letterJ he2 letterK de he2 zuo4 jiang1 zhi2 jie1 ying3 xiang3 letterC de xiao4 lv4 。'),
	('如果你想提高L到M的速度，N的调整将非常重要。', 'ru2 guo3 ni3 xiang3 ti2 gao1 letterL dao4 letterM de su4 du4 ， letterN de tiao2 zheng3 jiang1 fei1 chang2 zhong4 yao4 。'),
	('在这次活动中，O的出现为P带来了更大的机会。', 'zai4 zhe4 ci4 huo2 dong4 zhong1 ， letterO de chu1 xian4 wei4 letterP dai4 lai2 le geng4 da4 de ji1 hui4 。'),
, ...]


## 融合后再压缩

In [104]:
total_ds: typing.List[typing.Tuple[str, str]] = []
total_ds.extend(cleaned_ds)
total_ds.extend(sub_v1_ds)
total_ds.extend(crossletters_ds)
print_dataset(total_ds)

Data samples (length = 995): [
	('卡尔普陪外孙玩滑梯。', 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1 。'),
	('苦涩的沙吹痛脸庞的感觉。', 'ku3 se4 de sha1 chui1 tong4 lian3 pang2 de gan3 jue2 。'),
	('宝马配挂跛骡鞍，貂蝉怨枕董翁榻。', 'bao2 ma3 pei4 gua4 bo3 luo2 an1 ， diao1 chan2 yuan4 zhen3 dong3 weng1 ta4 。'),
	('在“三嬢”陈章淑眼里，媛媛懂事、嘴甜、聪明，见人就会喊。', 'zai4 “ san1 niang2 ” chen2 shu1 zhang1 yan2 li3 ， yuan2 yuan dong3 shi4 、 zui3 tian2 、 cong1 ming ， jian4 ren2 jiu4 hui4 han3 。'),
	('老虎幼崽与宠物犬玩耍。', 'lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3 。'),
, ...]


In [110]:
new_tokens_set = tokens_set.copy()
for i in range(ord('A'), ord('Z') + 1):
    new_tokens_set.add(f'letter' + chr(i)) # crossletters中新增的tokens
print(f"Token counts: {len(new_tokens_set)}")

Token counts: 2095


In [106]:
fit_table_final = fit_dataset_to_token_set(total_ds, new_tokens_set)
clean2_ds = { key: list(value) for key, value in fit_table_final.items() }
clean2_ds = sort_by_freq(total_ds, clean2_ds)
clean2_ds = select_first_n(clean2_ds, 3)
final_ds = [ total_ds[i] for i in flatten_select_result(clean2_ds) ]
print_dataset(final_ds)

Abandon tokens: set()
Data samples (length = 574): [
	('卡尔普陪外孙玩滑梯。', 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1 。'),
	('宝马配挂跛骡鞍，貂蝉怨枕董翁榻。', 'bao2 ma3 pei4 gua4 bo3 luo2 an1 ， diao1 chan2 yuan4 zhen3 dong3 weng1 ta4 。'),
	('在“三嬢”陈章淑眼里，媛媛懂事、嘴甜、聪明，见人就会喊。', 'zai4 “ san1 niang2 ” chen2 shu1 zhang1 yan2 li3 ， yuan2 yuan dong3 shi4 、 zui3 tian2 、 cong1 ming ， jian4 ren2 jiu4 hui4 han3 。'),
	('老虎幼崽与宠物犬玩耍。', 'lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3 。'),
	('莫里斯这番赤裸裸的种族主义言论遭到舆论痛批。', 'mo4 li3 si1 zhe4 fan1 chi4 luo2 luo3 de zhong3 zu2 zhu3 yi4 yan2 lun4 zao1 dao4 yu2 lun4 tong4 pi1 。'),
, ...]


In [108]:
final_ds_set = set(list(analysis_token_frequency(final_ds).keys()))
print(f"Token converage: {len(new_tokens_set & final_ds_set)}")

Token converage: 1393


In [112]:
save_v2_dataset("assets/text/mandarin_v2_train_withletters.txt", final_ds)

## 制作验证集

In [116]:
train_texts: typing.Set[str] = set()
for (text, phome) in final_ds:
    train_texts.add(text)
select_range: typing.List[int] = []
for i, (text, phome) in enumerate(bznsyp_ds):
    if text not in train_texts:
        select_range.append(i)
print(f"Selective range: {len(select_range)}")

Selective range: 9453


In [118]:
select_idx = random.sample(select_range, 30)
valid_ds = [ bznsyp_ds[i] for i in select_idx ]
print_dataset(valid_ds)

Data samples (length = 30): [
	('凶徒们将姜老板和女儿女婿捆了起来，逼他们交钱。', 'xiong1 tu2 men5 jiang1 jiang1 lao2 ban3 he2 nv3 er2 nv3 xu4 kun3 le5 qi3 lai2 bi1 ta1 men5 jiao1 qian2'),
	('警方初步调查，夫妻俩死于一氧化碳中毒。', 'jing3 fang1 chu1 bu4 diao4 cha2 fu1 qi1 lia2 si3 yu2 yi1 yang3 hua4 tan4 zhong4 du2'),
	('首先，记者拨打的是约车电话九六幺零六。', 'shou3 xian1 ji4 zhe3 bo1 da3 de5 shi4 yue1 che1 dian4 hua4 jiu3 liu4 yao1 ling2 liu4'),
	('而“苹果”等纸媒，冗员也不少。', 'er2 ping2 guo3 deng2 zhi3 mei2 rong3 yuan2 ye3 bu4 shao3'),
	('人聪明点，毕竟以后我家的生意，都是要交给女婿管理的。', 'ren2 cong1 ming5 dian3 bi4 jing4 yi3 hou4 wo3 jia1 de5 sheng1 yi4 dou1 shi4 yao4 jiao1 gei2 nv3 xu4 guan2 li3 de5'),
, ...]


In [119]:
save_v2_dataset("assets/text/mandarin_v2_valid.txt", valid_ds)