In [1]:
import os, re, typing, jieba, pypinyin, random

In [2]:
def print_dataset(dataset: typing.List[typing.Tuple[str, str]], n: int = 5):
    print(f"Data samples (length = {len(dataset)}): [")
    for i in range(min(len(dataset), n)):
        print(f'\t{dataset[i]},')
    print(f', ...]')

## 压缩数据集

In [6]:
def parse_bznsyp_dataset(filename: str):
    dataset: typing.List[typing.Tuple[str, str]]  = []
    regex = re.compile('#\d+')
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):
            splitpos = lines[i].index('\t')
            text = lines[i][splitpos+1:].strip('\n')
            text = regex.sub("", text)

            phome = lines[i + 1].strip('\n').strip()
            dataset.append((text, phome))
    return dataset

bznsyp_ds = parse_bznsyp_dataset("assets/text/BZNSYP/000001-010000.txt")
print(f"BZNSYP has {len(bznsyp_ds)} datas.")
print_dataset(bznsyp_ds)

BZNSYP has 10000 datas.
Data samples (length = 10000): [
	('卡尔普陪外孙玩滑梯。', 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1'),
	('假语村言别再拥抱我。', 'jia2 yu3 cun1 yan2 bie2 zai4 yong1 bao4 wo3'),
	('宝马配挂跛骡鞍，貂蝉怨枕董翁榻。', 'bao2 ma3 pei4 gua4 bo3 luo2 an1 diao1 chan2 yuan4 zhen3 dong3 weng1 ta4'),
	('邓小平与撒切尔会晤。', 'deng4 xiao3 ping2 yu3 sa4 qie4 er3 hui4 wu4'),
	('老虎幼崽与宠物犬玩耍。', 'lao2 hu3 you4 zai3 yu2 chong3 wu4 quan3 wan2 shua3'),
, ...]


In [5]:
def analysis_token_frequency(dataset: typing.List[typing.Tuple[str, str]]):
    maps: typing.Dict[str, typing.Set[int]] = {} # { token: [line numbers] }
    for i, (text, phome) in enumerate(dataset):
        for token in phome.split(' '):
            if token in maps:
                maps[token].add(i)
            else:
                maps[token] = set([i])
    return maps

bznsyp_analysis = analysis_token_frequency(bznsyp_ds)
bznsyp_tokens_counts = sorted([ (key, len(value)) for key, value in bznsyp_analysis.items() ], key=lambda x: x[1])
print(f"BZNSYP has {len(bznsyp_analysis)} tokens.")
print(f"Most active tokens: {bznsyp_tokens_counts[-5:]}")
print(f"Deactive tokens: {bznsyp_tokens_counts[:5]}")

BZNSYP has 1607 tokens.
Most active tokens: [('zai4', 1428), ('le5', 1530), ('yi4', 1653), ('shi4', 2512), ('de5', 4219)]
Deactive tokens: [('wanr1', 1), ('jir1', 1), ('jiang5', 1), ('tie4', 1), ('zuo5', 1)]


In [6]:
def parse_token_file(filename: str):
    token_list: typing.List[str] = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            token = line.strip().strip('\n').split(' ')[0]
            token_list.append(token)
    return token_list

tokens = parse_token_file("assets/dataset/tokens.txt")
print(f"Got pretrained tokens: {len(tokens)}")

Got pretrained tokens: 2069


In [7]:
bznsyp_set = set(list(bznsyp_analysis.keys()))
tokens_set = set(tokens)

print(f"Tokens that BZNSYP has but `tokens` hasn't: {len(bznsyp_set - tokens_set)}")
print(f"Tokens that `tokens` has but BZNSYP hasn't: {len(tokens_set - bznsyp_set)}")
print(f"Tokens that both have: {len(bznsyp_set & tokens_set)}")

Tokens that BZNSYP has but `tokens` hasn't: 306
Tokens that `tokens` has but BZNSYP hasn't: 768
Tokens that both have: 1301


In [11]:
def fix_bznsyp_tokens(tokens: typing.List[str]):
    ret: typing.List[str] = []
    for token in tokens:
        if token.endswith('5'): # 5音是轻声
            ret.append(token[:-1])
        elif len(token) >= 3 and token[-2] == 'r' and token[:2] != "er" and not token.startswith('letter'): # 儿音， fur4 => fu4 er2
            ret.append(token[:-2])
            ret.append('er2')
        elif token in ["。", "，", "—", "“", "”", "？", "！", "：", "、", "；", "…"]:
            continue # ignore
        else:
            ret.append(token)
    return ret

In [9]:
def fit_dataset_to_token_set(dataset: typing.List[typing.Tuple[str, str]], tokens_set: typing.Set[str]):
    maps: typing.Dict[str, typing.Set[int]] = { x: set() for x in tokens_set } # { token: [line number] }
    abandon_token: typing.Set[str] = set()
    for i, (text, phome) in enumerate(dataset):
        prepare_to_add: typing.List[str] = []
        skip_this_row = False
        for token in fix_bznsyp_tokens(phome.split(' ')):
            if token not in tokens_set: 
                abandon_token.add(token)
                skip_this_row = True
                break
            else:
                prepare_to_add.append(token)
        if not skip_this_row:
            for token in prepare_to_add:
                maps[token].add(i)
    print(f"Abandon tokens:", abandon_token)
    return maps

token_to_bznsyp_line = fit_dataset_to_token_set(bznsyp_ds, tokens_set)
print(f"len: {len(token_to_bznsyp_line)}")

Abandon tokens: {'P', 'shei2', 'menr', 'zhei4', 'tei1', 'yir', 'ng1'}
len: 2069


In [10]:
def remove_non_character(text: str):
    return text.replace('，','').replace('。','').replace('—','').replace("“",'').replace("”",'').replace('？','').replace('！','').replace('：','').replace('！','').replace('、','').replace('；','').replace('…','')

In [11]:
def sort_by_length(dataset: typing.List[typing.Tuple[str, str]], fit_table: typing.Dict[str, typing.List[int]]):
    ret: typing.Dict[str, typing.List[int]] = {}
    for token, line_numbers in fit_table.items():
        scores = [ 0 for _ in range(len(line_numbers)) ]
        for i, line in enumerate(line_numbers):
            text = dataset[line][0]
            length = len(remove_non_character(text))
            scores[i] = length
        ret[token] = [ x[1] for x in sorted(zip(range(len(line_numbers)), line_numbers), key=lambda i: scores[i[0]], reverse=True) ]
    return ret

def sort_by_freq(dataset: typing.List[typing.Tuple[str, str]], fit_table: typing.Dict[str, typing.List[int]]):
    ret: typing.Dict[str, typing.List[int]] = {}
    for token, line_numbers in fit_table.items():
        scores = [ 0 for _ in range(len(line_numbers)) ]
        for i, line in enumerate(line_numbers):
            for token in fix_bznsyp_tokens(dataset[line][1].split(' ')):
                if token in fit_table:
                    if len(fit_table[token]) == 1: # 稀有的token需要更容易选中
                        scores[i] += 100
                    elif len(fit_table[token]) == 2:
                        scores[i] += 1
        ret[token] = [ x[1] for x in sorted(zip(range(len(line_numbers)), line_numbers), key=lambda i: scores[i[0]], reverse=True) ]
    return ret

def select_first_n(fit_table: typing.Dict[str, typing.List[int]], n: int):
    ret: typing.Dict[str, typing.List[int]] = {}
    for token, line_numbers in fit_table.items():
        for line in line_numbers[:n]:
            if token not in ret:
                ret[token] = [line]
            else:
                ret[token].append(line)
    return ret

def flatten_select_result(select_result: typing.Dict[str, typing.List[int]]):
    select_sets: typing.Set[int] = set()
    for key, value in select_result.items():
        for v in value:
            select_sets.add(v)
    return select_sets

In [16]:
sorted_bznsyp_ds = { key: list(value) for key, value in token_to_bznsyp_line.items() }
sorted_bznsyp_ds = sort_by_length(bznsyp_ds, sorted_bznsyp_ds)
sorted_bznsyp_ds = select_first_n(sorted_bznsyp_ds, 6)
sorted_bznsyp_ds = sort_by_freq(bznsyp_ds, sorted_bznsyp_ds)
sorted_bznsyp_ds = select_first_n(sorted_bznsyp_ds, 3)

selected_dataset = [ bznsyp_ds[i] for i in flatten_select_result(sorted_bznsyp_ds) ]
subset_set = set(list(analysis_token_frequency(selected_dataset).keys()))
print(f"New dataset token converage: {len(tokens_set & subset_set)} compare to original {len(tokens_set & bznsyp_set)}")
print(f"New dataset counts: {len(selected_dataset)} compare to original {len(bznsyp_ds)}")
print_dataset(selected_dataset)

New dataset token converage: 1231 compare to original 1301
New dataset counts: 1038 compare to original 10000
Data samples (length = 1038): [
	('进入秋季寡雨季节以来，泉州、漳州、莆田等沿海地区旱情明显。', 'jin4 ru4 qiu1 ji4 gua2 yu3 ji4 jie2 yi3 lai2 quan2 zhou1 zhang1 zhou1 pu2 tian2 deng3 yan2 hai3 di4 qu1 han4 qing2 ming2 xian3'),
	('在远处依稀可见的沙漠植物映衬下，整个球场显的绿意盎然。', 'zai4 yuan3 chu4 yi1 xi1 ke3 jian4 de5 sha1 mo4 zhi2 wu4 ying4 chen4 xia4 zheng3 ge5 qiu2 chang3 xian3 de5 lv4 yi4 ang4 ran2'),
	('由此，方体忠想到了姑奶张淑云，多次请其跟王文利“说说”。', 'you2 ci3 fang1 ti3 zhong1 xiang3 dao4 le5 gu1 nai3 zhang1 shu1 yun2 duo1 ci4 qing3 qi2 gen1 wang2 wen2 li4 shuo1 shuo5'),
	('一路上，孙颖浩用自己的方式鼓舞着队员，其实他的心里也忐忑不安。', 'yi2 lu4 shang4 sun1 ying3 hao4 yong4 zi4 ji3 de5 fang1 shi4 gu2 wu3 zhe5 dui4 yuan2 qi2 shi2 ta1 de5 xin1 li3 ye2 tan3 te4 bu4 an1'),
	('写散文的人最多，人心却像他们的文章一样散，闹也闲不出气势。', 'xie2 san3 wen2 de5 ren2 zui4 duo1 ren2 xin1 que4 xiang4 ta1 men5 de5 wen2 zhang1 yi2 yang4 san3 nao4 ye3 xian2 bu4 chu1 qi4 shi4'),
, ...]


In [9]:
def add_extra_character_to_phomes(dataset: typing.List[typing.Tuple[str, str]]):
    modified_ds = dataset.copy()
    for k, (text, phome) in enumerate(dataset):
        newphome = []
        phomes = fix_bznsyp_tokens(phome.split(' '))
        debug_phomes = phomes.copy()
        skip_one = False
        unhandle = False
        for i, ch in enumerate(text):
            if skip_one:
                skip_one = False
                continue
            if ch in ["。", "，", "—", "“", "”", "？", "！", "：", "、", "；", "…"]:
                newphome.append(ch)
            else:
                if len(phomes) == 0:
                    unhandle = True
                    break
                p = phomes.pop(0)
                newphome.append(p)
        if len(phomes) == 0 and not unhandle:
            modified_ds[k] = (text, ' '.join(newphome))
        else:
            print(f"Unhandle: {k}\n== {text}\n== {debug_phomes}\nremain: {phomes}\n")
    return modified_ds

In [None]:
cleaned_ds = add_extra_character_to_phomes(selected_dataset)
print(f"Got final dataset.")
print_dataset(cleaned_ds)

Got final dataset.
Data samples (length = 1038): [
	('进入秋季寡雨季节以来，泉州、漳州、莆田等沿海地区旱情明显。', 'jin4 ru4 qiu1 ji4 gua2 yu3 ji4 jie2 yi3 lai2 ， quan2 zhou1 、 zhang1 zhou1 、 pu2 tian2 deng3 yan2 hai3 di4 qu1 han4 qing2 ming2 xian3 。'),
	('在远处依稀可见的沙漠植物映衬下，整个球场显的绿意盎然。', 'zai4 yuan3 chu4 yi1 xi1 ke3 jian4 de sha1 mo4 zhi2 wu4 ying4 chen4 xia4 ， zheng3 ge qiu2 chang3 xian3 de lv4 yi4 ang4 ran2 。'),
	('由此，方体忠想到了姑奶张淑云，多次请其跟王文利“说说”。', 'you2 ci3 ， fang1 ti3 zhong1 xiang3 dao4 le gu1 nai3 zhang1 shu1 yun2 ， duo1 ci4 qing3 qi2 gen1 wang2 wen2 li4 “ shuo1 shuo ” 。'),
	('一路上，孙颖浩用自己的方式鼓舞着队员，其实他的心里也忐忑不安。', 'yi2 lu4 shang4 ， sun1 ying3 hao4 yong4 zi4 ji3 de fang1 shi4 gu2 wu3 zhe dui4 yuan2 ， qi2 shi2 ta1 de xin1 li3 ye2 tan3 te4 bu4 an1 。'),
	('写散文的人最多，人心却像他们的文章一样散，闹也闲不出气势。', 'xie2 san3 wen2 de ren2 zui4 duo1 ， ren2 xin1 que4 xiang4 ta1 men de wen2 zhang1 yi2 yang4 san3 ， nao4 ye3 xian2 bu4 chu1 qi4 shi4 。'),
, ...]


In [14]:
def save_v2_dataset(filename: str, dataset: typing.List[typing.Tuple[str, str]]):
	with open(filename, 'w', encoding='utf-8') as f:
		for (text, phome) in dataset:
			f.write(f"{text}\n{phome}\n")

In [None]:
save_v2_dataset("assets/text/mandarin_v2_train.txt", cleaned_ds)
print("Done.")

Done.


## 融合v1数据集

In [4]:
def parse_v2_dataset(filename: str):
    dataset: typing.List[typing.Tuple[str, str]] = []
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):
            text = lines[i].strip().strip('\n')
            phome = lines[i+1].strip().strip('\n')
            dataset.append((text, phome))
    return dataset

def parse_v1_dataset(filename: str):
    dataset: typing.List[typing.Tuple[str, str]] = []
    with open(filename, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f.readlines()):
            text = line.strip().strip('\n')
            if text == "": continue
            cuts = list(jieba.cut(text))
            phome = []
            for cut in cuts:
                pinyins = pypinyin.lazy_pinyin(cut, style=pypinyin.Style.TONE3, tone_sandhi=True)
                for pinyin in pinyins:
                    phome.append(pinyin)
            phome = ' '.join(phome)
            dataset.append((text, phome))
    return dataset

In [None]:
cleaned_ds = parse_v2_dataset("assets/text/mandarin_v2_train.txt")
print_dataset(cleaned_ds)

FileNotFoundError: [Errno 2] No such file or directory: 'assets/text/mandarin_v2_train.txt'

In [None]:
v1_ds = parse_v1_dataset("assets/text/mandarin.txt")
print_dataset(v1_ds)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\CHENHA~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.423 seconds.
Prefix dict has been built successfully.


Data samples (length = 729): [
	('他在家里吃苹果，喝茶，打电话，唱歌。', 'ta1 zai4 jia1 li3 chi1 ping2 guo3 ， he1 cha2 ， da3 dian4 hua4 ， chang4 ge1 。'),
	('我去商店买水果和书，看到小猫在玩。', 'wo3 qu4 shang1 dian4 mai3 shui3 guo3 he2 shu1 ， kan4 dao4 xiao3 mao1 zai4 wan2 。'),
	('小明去北京吃炸酱面，他在公园里看见了许多种类的鸟，甚至还遇到了一只大猩猩。', 'xiao3 ming2 qu4 bei3 jing1 chi1 zha2 jiang4 mian4 ， ta1 zai4 gong1 yuan2 li3 kan4 jian4 le xu3 duo1 zhong3 lei4 de niao3 ， shen4 zhi4 hai2 yu4 dao4 le yi4 zhi1 da4 xing1 xing1 。'),
	('小猫学会了跳舞，爬上了高高的树。', 'xiao3 mao1 xue2 hui4 le tiao4 wu3 ， pa2 shang4 le gao1 gao1 de shu4 。'),
	('风筝在天空中飞得又高又快，真是太漂亮了。', 'feng1 zheng1 zai4 tian1 kong1 zhong1 fei1 de2 you4 gao1 you4 kuai4 ， zhen1 shi4 tai4 piao4 liang4 le 。'),
, ...]


In [19]:
v1_ds_set = set(list(analysis_token_frequency(v1_ds).keys()))
print(f"Token converage: {len(tokens_set & v1_ds_set)}")

Token converage: 779


In [20]:
fit_table_v1 = fit_dataset_to_token_set(v1_ds, tokens_set)
fit_table_cleaned_ds = fit_dataset_to_token_set(cleaned_ds, tokens_set)

Abandon tokens: {'AI', 'GDPR', '5G'}
Abandon tokens: set()


In [21]:
append_v1_to_v2_tokens: typing.List[str] = []
for token, lines in fit_table_v1.items():
    if len(fit_table_cleaned_ds[token]) == 0:
        append_v1_to_v2_tokens.append(token)
print(f"Merge {len(append_v1_to_v2_tokens)} tokens from v1 to v2.")
print(f"Tokens: {append_v1_to_v2_tokens}")

Merge 646 tokens from v1 to v2.
Tokens: ['pen', 'chun4', 'qiong1', 'bin', 'm', 'fo1', 'lei1', 'ri', 'nie', 'chun', 'o2', 'ce', 'eng', 'bi', 'juan3', 'lv1', 'nve1', 'rua', 'tu', 'nun4', 'dei4', 'lan1', 'kuan4', 'pian3', 'hang3', 'dun3', 'ruo1', 'xiu2', 'lin1', 'chuang', 'cuo', 'nve', 'n1', 'le2', 'lu1', 'neng4', 'gei', 'pou', 'kuo1', 'chua1', 'nong', 'se1', 'su3', 'gai2', 'ke', 'pie2', 'man', 'o3', 'pei3', 'jin', 'ran', 'ben', 'niang4', 'cun3', 'quan', 'xu', 'ping3', 'qiao3', 'niang1', 'nuan1', 'lin4', 'sun', 'lv', 'ping4', 'duan', 'pian', 'lia1', 'chuan', 'nou3', 'chi', 'bie', 'sao2', 'miu3', 'xiong4', 'miao', 'qiao', 'neng1', 'can', 'ruo2', 'chen1', 'cou3', 'hua3', 'jun2', 'nuo', 'diu2', 'shun2', 'chua4', 'xin2', 'cu', 'pin4', 'chen', 'yin', 'shai3', 'ca', 'shuang2', 'yue3', 'zen4', 'zen', 'ga2', 'tiao', 'chuo2', 'mu1', 'chuo3', 'piao', 'nin3', 'zhan', 'qia1', 'cen3', 'cao4', 'luan', 'cu3', 'cang', 'nuan4', 'heng3', 'jiong3', 'dun', 'jiang2', 'nang1', 'te1', 'kun', 'yao', 'nuan', 'nin

In [24]:
sub_fit_table = { key: list(value) for key, value in fit_table_v1.items() if key in append_v1_to_v2_tokens }
sub_fit_table = sort_by_freq(v1_ds, sub_fit_table)
sub_fit_table = select_first_n(sub_fit_table, 3)
sub_v1_ds = [ v1_ds[i] for i in flatten_select_result(sub_fit_table) ]
print_dataset(sub_v1_ds)

Data samples (length = 2): [
	('老师带我们参观了博物馆，讲解了许多古老的文物。', 'lao3 shi1 dai4 wo3 men can1 guan1 le bo2 wu4 guan3 ， jiang2 jie3 le xu3 duo1 gu3 lao3 de wen2 wu4 。'),
	('老师讲解了很多难题，大家都明白了。', 'lao3 shi1 jiang2 jie3 le hen3 duo1 nan2 ti2 ， da4 jia1 dou1 ming2 bai2 le 。'),
, ...]


In [None]:
save_v2_dataset("assets/text/mandarin_v2_noletters.txt", sub_v1_ds)

## 融合cross letters数据集

In [25]:
crossletters_ds = parse_v1_dataset("assets/text/crossletters.txt")
print_dataset(crossletters_ds)

Data samples (length = 21): [
	('从A到B的计划需要C和D的配合，以确保E可以成功执行。', 'cong2 A dao4 B de ji4 hua4 xu1 yao4 C he2 D de pei4 he2 ， yi3 que4 bao3 E ke2 yi3 cheng2 gong1 zhi2 xing2 。'),
	('你从F到G的过程中，H的支持是不可缺少的，尤其是在I阶段。', 'ni3 cong2 F dao4 G de guo4 cheng2 zhong1 ， H de zhi1 chi2 shi4 bu4 ke3 que1 shao3 de ， you2 qi2 shi4 zai4 I jie1 duan4 。'),
	('在项目进行中，J和K的合作将直接影响C的效率。', 'zai4 xiang4 mu4 jin4 xing2 zhong1 ， J he2 K de he2 zuo4 jiang1 zhi2 jie1 ying3 xiang3 C de xiao4 lv4 。'),
	('如果你想提高L到M的速度，N的调整将非常重要。', 'ru2 guo3 ni3 xiang3 ti2 gao1 L dao4 M de su4 du4 ， N de tiao2 zheng3 jiang1 fei1 chang2 zhong4 yao4 。'),
	('在这次活动中，O的出现为P带来了更大的机会。', 'zai4 zhe4 ci4 huo2 dong4 zhong1 ， O de chu1 xian4 wei4 P dai4 lai2 le geng4 da4 de ji1 hui4 。'),
, ...]


In [26]:
for i in range(len(crossletters_ds)):
    text = crossletters_ds[i][0]
    phome = crossletters_ds[i][1]
    newphome = []
    for x in phome.split(' '):
        if len(x) == 1 and ord('A') <= ord(x[0]) and ord(x[0]) <= ord('Z'):
            newphome.append(f'letter' + x)
        else:
            newphome.append(x)
    crossletters_ds[i] = (text, ' '.join(newphome))
print_dataset(crossletters_ds)

Data samples (length = 21): [
	('从A到B的计划需要C和D的配合，以确保E可以成功执行。', 'cong2 letterA dao4 letterB de ji4 hua4 xu1 yao4 letterC he2 letterD de pei4 he2 ， yi3 que4 bao3 letterE ke2 yi3 cheng2 gong1 zhi2 xing2 。'),
	('你从F到G的过程中，H的支持是不可缺少的，尤其是在I阶段。', 'ni3 cong2 letterF dao4 letterG de guo4 cheng2 zhong1 ， letterH de zhi1 chi2 shi4 bu4 ke3 que1 shao3 de ， you2 qi2 shi4 zai4 letterI jie1 duan4 。'),
	('在项目进行中，J和K的合作将直接影响C的效率。', 'zai4 xiang4 mu4 jin4 xing2 zhong1 ， letterJ he2 letterK de he2 zuo4 jiang1 zhi2 jie1 ying3 xiang3 letterC de xiao4 lv4 。'),
	('如果你想提高L到M的速度，N的调整将非常重要。', 'ru2 guo3 ni3 xiang3 ti2 gao1 letterL dao4 letterM de su4 du4 ， letterN de tiao2 zheng3 jiang1 fei1 chang2 zhong4 yao4 。'),
	('在这次活动中，O的出现为P带来了更大的机会。', 'zai4 zhe4 ci4 huo2 dong4 zhong1 ， letterO de chu1 xian4 wei4 letterP dai4 lai2 le geng4 da4 de ji1 hui4 。'),
, ...]


## 融合后再压缩

In [27]:
total_ds: typing.List[typing.Tuple[str, str]] = []
total_ds.extend(cleaned_ds)
total_ds.extend(sub_v1_ds)
total_ds.extend(crossletters_ds)
print_dataset(total_ds)

Data samples (length = 1061): [
	('进入秋季寡雨季节以来，泉州、漳州、莆田等沿海地区旱情明显。', 'jin4 ru4 qiu1 ji4 gua2 yu3 ji4 jie2 yi3 lai2 ， quan2 zhou1 、 zhang1 zhou1 、 pu2 tian2 deng3 yan2 hai3 di4 qu1 han4 qing2 ming2 xian3 。'),
	('在远处依稀可见的沙漠植物映衬下，整个球场显的绿意盎然。', 'zai4 yuan3 chu4 yi1 xi1 ke3 jian4 de sha1 mo4 zhi2 wu4 ying4 chen4 xia4 ， zheng3 ge qiu2 chang3 xian3 de lv4 yi4 ang4 ran2 。'),
	('由此，方体忠想到了姑奶张淑云，多次请其跟王文利“说说”。', 'you2 ci3 ， fang1 ti3 zhong1 xiang3 dao4 le gu1 nai3 zhang1 shu1 yun2 ， duo1 ci4 qing3 qi2 gen1 wang2 wen2 li4 “ shuo1 shuo ” 。'),
	('一路上，孙颖浩用自己的方式鼓舞着队员，其实他的心里也忐忑不安。', 'yi2 lu4 shang4 ， sun1 ying3 hao4 yong4 zi4 ji3 de fang1 shi4 gu2 wu3 zhe dui4 yuan2 ， qi2 shi2 ta1 de xin1 li3 ye2 tan3 te4 bu4 an1 。'),
	('写散文的人最多，人心却像他们的文章一样散，闹也闲不出气势。', 'xie2 san3 wen2 de ren2 zui4 duo1 ， ren2 xin1 que4 xiang4 ta1 men de wen2 zhang1 yi2 yang4 san3 ， nao4 ye3 xian2 bu4 chu1 qi4 shi4 。'),
, ...]


In [28]:
new_tokens_set = tokens_set.copy()
for i in range(ord('A'), ord('Z') + 1):
    new_tokens_set.add(f'letter' + chr(i)) # crossletters中新增的tokens
print(f"Token counts: {len(new_tokens_set)}")

Token counts: 2095


In [29]:
fit_table_final = fit_dataset_to_token_set(total_ds, new_tokens_set)
clean2_ds = { key: list(value) for key, value in fit_table_final.items() }
clean2_ds = sort_by_freq(total_ds, clean2_ds)
clean2_ds = select_first_n(clean2_ds, 3)
final_ds = [ total_ds[i] for i in flatten_select_result(clean2_ds) ]
print_dataset(final_ds)

Abandon tokens: set()
Data samples (length = 596): [
	('进入秋季寡雨季节以来，泉州、漳州、莆田等沿海地区旱情明显。', 'jin4 ru4 qiu1 ji4 gua2 yu3 ji4 jie2 yi3 lai2 ， quan2 zhou1 、 zhang1 zhou1 、 pu2 tian2 deng3 yan2 hai3 di4 qu1 han4 qing2 ming2 xian3 。'),
	('在远处依稀可见的沙漠植物映衬下，整个球场显的绿意盎然。', 'zai4 yuan3 chu4 yi1 xi1 ke3 jian4 de sha1 mo4 zhi2 wu4 ying4 chen4 xia4 ， zheng3 ge qiu2 chang3 xian3 de lv4 yi4 ang4 ran2 。'),
	('同时，义工们也正在筹备经费购买御寒衣物派送给孤寡老人。', 'tong2 shi2 ， yi4 gong1 men ye3 zheng4 zai4 chou2 bei4 jing1 fei4 gou4 mai3 yu4 han2 yi1 wu4 pai4 song4 gei3 gu1 gua2 lao3 ren2 。'),
	('但如果按车队规模，一嗨数千辆车的量级绝对算不上最大。', 'dan4 ru2 guo3 an4 che1 dui4 gui1 mo2 ， yi4 hai1 shu4 qian1 liang4 che1 de liang4 ji2 jue2 dui4 suan4 bu2 shang4 zui4 da4 。'),
	('回到家林妈不住催问，他说还可以，林母拍腿而起：“你说可以就是不好！', 'hui2 dao4 jia1 lin2 ma1 bu2 zhu4 cui1 wen4 ， ta1 shuo1 hai2 ke2 yi3 ， lin2 mu3 pai1 tui3 er2 qi3 ： “ ni3 shuo1 ke2 yi3 jiu4 shi4 bu4 hao3 ！'),
, ...]


In [30]:
final_ds_set = set(list(analysis_token_frequency(final_ds).keys()))
print(f"Token converage: {len(new_tokens_set & final_ds_set)}")

Token converage: 1407


In [33]:
save_v2_dataset("assets/text/mandarin_v2_train_withletters.txt", final_ds)

## 制作验证集

In [5]:
final_ds = parse_v2_dataset("assets/text/mandarin_v2_train_withletters.txt")

In [7]:
train_texts: typing.Set[str] = set()
for (text, phome) in final_ds:
    train_texts.add(text)
select_range: typing.List[int] = []
for i, (text, phome) in enumerate(bznsyp_ds):
    if text not in train_texts:
        select_range.append(i)
print(f"Selective range: {len(select_range)}")

Selective range: 9423


In [12]:
select_idx = random.sample(select_range, 30)
valid_ds = [ bznsyp_ds[i] for i in select_idx ]
valid_ds = add_extra_character_to_phomes(valid_ds)
print_dataset(valid_ds)

Data samples (length = 30): [
	('今年在外包装上还首次有了防伪记号。', 'jin1 nian2 zai4 wai4 bao1 zhuang1 shang4 hai2 shou3 ci4 you3 le fang2 wei3 ji4 hao4 。'),
	('既想让马儿跑，又不给马儿足够的草，其结果也就在意料之中。', 'ji4 xiang3 rang4 ma3 er2 pao3 ， you4 bu4 gei2 ma3 er2 zu2 gou4 de cao3 ， qi2 jie2 guo2 ye3 jiu4 zai4 yi4 liao4 zhi1 zhong1 。'),
	('最后，张金中在漆黑的夜色下涮洗。', 'zui4 hou4 ， zhang1 jin1 zhong1 zai4 qi1 hei1 de ye4 se4 xia4 shuan4 xi3 。'),
	('赵云的朋友郝超英建议先带董静去海口躲躲。', 'zhao4 yun2 de peng2 you hao3 chao1 ying1 jian4 yi4 xian1 dai4 dong3 jing4 qu4 hai2 kou3 duo2 duo3 。'),
	('救援队正在加快安装固定泵，通过抽水稳定水位。', 'jiu4 yuan2 dui4 zheng4 zai4 jia1 kuai4 an1 zhuang1 gu4 ding4 beng4 ， tong1 guo4 chou1 shui2 wen3 ding4 shui3 wei4 。'),
, ...]


In [15]:
save_v2_dataset("assets/text/mandarin_v2_valid.txt", valid_ds)

## 生成tokens文件

In [4]:
from utils.tokens import generate_token_list
token_filename = os.path.join("assets/text/mandarin_v2_train_withletters_tokens.txt")
with open(token_filename, "w", encoding="utf-8") as f:
	tokens = generate_token_list()
	for indx, token in enumerate(tokens):
		f.write(f"{token} {indx}\n")
	cnt = len(tokens)
	for i in range(ord('A'), ord('Z') + 1):
		f.write(f'letter{chr(i)} {cnt}\n')
		cnt += 1
print(f"Tokens saved to {token_filename}")

Tokens saved to assets/text/mandarin_v2_train_withletters_tokens.txt
