In [1]:
import os, re, typing, jieba, pypinyin

In [2]:
# Load the corpus as (text, phome) pairs: each even-indexed line of the file is
# taken as the text and the line right after it as its transcription.
dataset: typing.List[typing.Tuple[str, str]] = []
with open("assets/text/full_mandarin.txt", "r", encoding="utf-8") as f:
	lines = f.readlines()
# zip() stops at the shorter slice, so a trailing unpaired line (for example a
# final blank line) is dropped instead of raising IndexError on lines[i + 1].
for raw_text, raw_phome in zip(lines[0::2], lines[1::2]):
	# A bare str.strip() already removes newlines together with other whitespace.
	dataset.append((raw_text.strip(), raw_phome.strip()))
print(f"len: {len(dataset)}")

len: 1414


In [3]:
# Display the first five (text, phome) pairs of the loaded dataset.
dataset[:5]

[('躺在急救担架上的男子双目紧闭，头发散发出一股烧焦的味道。',
  'tang3 zai4 ji2 jiu4 dan1 jia4 shang4 de5 nan2 zi3 shuang1 mu4 jin3 bi4 tou2 fa4 san4 fa1 chu1 yi4 gu3 shao1 jiao1 de5 wei4 dao4'),
 ('工业园区是承接产业转移、加速产业集聚、培育产业集群的主要载体。',
  'gong1 ye4 yuan2 qu1 shi4 cheng2 jie1 chan3 ye4 zhuan3 yi2 jia1 su4 chan3 ye4 ji2 ju4 pei2 yu4 chan3 ye4 ji2 qun2 de5 zhu3 yao4 zai4 ti3'),
 ('那一刻，我才真正的懂你，就像懂我现在的自己。',
  'na4 yi2 ke4 wo3 cai2 zhen1 zheng4 de5 dong2 ni3 jiu4 xiang4 dong2 wo3 xian4 zai4 de5 zi4 ji3'),
 ('由于列车长时间停靠，车厢内的空气越来越“闷”。',
  'you2 yu2 lie4 che1 zhang3 shi2 jian1 ting2 kao4 che1 xiang1 nei4 de5 kong1 qi4 yue4 lai2 yue4 men1'),
 ('但如果按车队规模，一嗨数千辆车的量级绝对算不上最大。',
  'dan4 ru2 guo3 an4 che1 dui4 gui1 mo2 yi4 hai1 shu4 qian1 liang4 che1 de5 liang4 ji2 jue2 dui4 suan4 bu2 shang4 zui4 da4')]

In [4]:
# Read the token vocabulary: the first space-separated field of every line
# in the token file is collected as one token.
token_filename = "assets/dataset/tokens.txt"
token_set: typing.Set[str] = set()
with open(token_filename, 'r', encoding='utf-8') as f:
	token_set.update(raw.strip('\n').strip().split(' ')[0] for raw in f)
print(f"Got token set {len(token_set)}")

Got token set 2069


In [5]:
# Inverted index { token: {line number} }: for every vocabulary token, the
# indices of the dataset entries whose transcription contains it.
maps: typing.Dict[str, typing.Set[int]] = {}
for known_token in token_set:
	maps[known_token] = set()
# Tokens found in the transcriptions that are not part of the vocabulary.
abandon_token: typing.Set[str] = set()
for line_no, (sentence, phonemes) in enumerate(dataset):
	for tok in phonemes.split(' '):
		bucket = maps.get(tok)
		if bucket is None:
			# Not a vocabulary token: remember it, but do not index it.
			abandon_token.add(tok)
			continue
		bucket.add(line_no)
print(f"Abandon tokens:", abandon_token)

Abandon tokens: {'cha5', 'wan5', 'ger4', 'yanr2', 'da5', 'ren5', 'ye5', 'xi5', 'nang5', 'huan5', 'teng5', 'duo5', 'de5', 'yir4', 'ling5', 'shuan5', 'lai5', 'wa5', 'hou5', 'qin5', 'xie5', 'yang5', 'sha5', 'hu5', 'pi5', 'zi5', 'tou5', 'ji5', 'bian5', 'sheng5', 'ju5', 'die5', 'ya5', 'nai5', 'hair2', 'dou5', 'ter4', 'jiu5', 'qing5', 'di5', 'he5', 'guo5', 'tang5', 'ng1', 'wu5', 'shuo5', 'mei5', 'kan5', 'jinr4', 'bu5', 'heng5', 'kou5', 'you5', 'gu5', 'cao5', 'pa5', 'dianr3', 'peng5', 'qu5', 'qiu5', 'ne5', 'chan5', 'kuair4', 'mao5', 'ba5', 'fa5', 'ying5', 'tun5', 'er5', 'yong5', 'pai5', 'na5', 'fang5', 'lan5', 'luo5', 'hao5', 'tuo5', 'sou5', 'niang5', 'jie5', 'liang5', 'cai5', 'la5', 'sa5', 'xing5', 'zhi5', 'suo5', 'menr5', 'se5', 'shang5', 'tao5', 'long5', 'rang5', 'zhou5', 'gua5', 'nan5', 'bao5', 'bai5', 'sao5', 'ge5', 'jia5', 'le5', 'po5', 'kuai5', 'ha5', 'lu5', 'yi5', 'hua5', 'qie5', 'ti5', 'tan5', 'jun5', 'zha5', 'huir4', 'mi5', 'pan5', 'dao5', 'nar4', 'ma5', 'zu5', 'su5', 'tanr1', 'zhe5

In [6]:
# Score every dataset entry by the rarity of its tokens: each token occurrence
# that is indexed under exactly one entry adds 100, each one indexed under
# exactly two entries adds 1, anything more common adds nothing.
rarity_bonus = {1: 100, 2: 1}
scores = [0] * len(dataset)
for line_no, (sentence, phonemes) in enumerate(dataset):
	scores[line_no] = sum(
		rarity_bonus.get(len(maps[tok]), 0)
		for tok in phonemes.split(' ')
		if tok in maps
	)
scores[-5:]

[2, 1, 100, 100, 100]

In [7]:
# { token: [line number] } with each token's entry indices ordered from the
# highest-scoring entry to the lowest-scoring one.
agresive_maps: typing.Dict[str, typing.List[int]] = {
	tok: sorted(indices, key=lambda line_no: scores[line_no], reverse=True)
	for tok, indices in maps.items()
}

In [9]:
# For every token keep at most n_select of its best-ranked entries; the union
# of those entry indices is the selected subset.
n_select = 3
select_set: typing.Set[int] = set()
for ranked_lines in agresive_maps.values():
	select_set.update(ranked_lines[:n_select])
outputs = [dataset[idx] for idx in select_set]
print(f"Got outputs: {len(outputs)}")

Got outputs: 1205


In [10]:
# Display the first five selected (text, phome) pairs.
outputs[:5]

[('躺在急救担架上的男子双目紧闭，头发散发出一股烧焦的味道。',
  'tang3 zai4 ji2 jiu4 dan1 jia4 shang4 de5 nan2 zi3 shuang1 mu4 jin3 bi4 tou2 fa4 san4 fa1 chu1 yi4 gu3 shao1 jiao1 de5 wei4 dao4'),
 ('工业园区是承接产业转移、加速产业集聚、培育产业集群的主要载体。',
  'gong1 ye4 yuan2 qu1 shi4 cheng2 jie1 chan3 ye4 zhuan3 yi2 jia1 su4 chan3 ye4 ji2 ju4 pei2 yu4 chan3 ye4 ji2 qun2 de5 zhu3 yao4 zai4 ti3'),
 ('那一刻，我才真正的懂你，就像懂我现在的自己。',
  'na4 yi2 ke4 wo3 cai2 zhen1 zheng4 de5 dong2 ni3 jiu4 xiang4 dong2 wo3 xian4 zai4 de5 zi4 ji3'),
 ('由于列车长时间停靠，车厢内的空气越来越“闷”。',
  'you2 yu2 lie4 che1 zhang3 shi2 jian1 ting2 kao4 che1 xiang1 nei4 de5 kong1 qi4 yue4 lai2 yue4 men1'),
 ('但如果按车队规模，一嗨数千辆车的量级绝对算不上最大。',
  'dan4 ru2 guo3 an4 che1 dui4 gui1 mo2 yi4 hai1 shu4 qian1 liang4 che1 de5 liang4 ji2 jue2 dui4 suan4 bu2 shang4 zui4 da4')]

In [14]:
# Distinct tokens occurring in the selected entries, reported as the number of
# vocabulary tokens they cover out of the whole vocabulary.
output_set: typing.Set[str] = {
	tok for sentence, phonemes in outputs for tok in phonemes.split(' ')
}
print(f"converage: {len(token_set & output_set)} / {len(token_set)}")
print(f"dataset: {len(outputs)}")

converage: 1312 / 2069
dataset: 1205


In [15]:
# Vocabulary coverage of the full dataset: collect every distinct token found
# in its transcriptions and count how many vocabulary tokens are present.
dataset_set: typing.Set[str] = set()
# Scan `dataset` here (not `outputs`) so the coverage figure describes the
# same collection whose size is printed on the last line.
for text, phomes in dataset:
	dataset_set.update(phomes.split(' '))
print(f"converage: {len(token_set) - len(token_set - dataset_set)} / {len(token_set)}")
print(f"dataset: {len(dataset)}")

converage: 1312 / 2069
dataset: 1414


In [None]:
# Write the selected entries to the training file, two lines per entry:
# the text first, then its transcription.
with open("assets/text/mandarin_train.txt", "w", encoding="utf-8") as f:
	f.writelines(f"{text}\n{phome}\n" for text, phome in outputs)
print("done.")

done.


In [23]:
# Indices (and the matching entries) of the dataset lines that were not selected.
unselected_set = set(range(len(dataset))).difference(select_set)
unselected_dataset = [dataset[idx] for idx in unselected_set]

In [24]:
# Number of entries to sample from the unselected pool: one quarter of it
# (floor division), used later as the sample size for valid_dataset.
select_range = len(unselected_set) // 4
select_range

52

In [22]:
import random

In [26]:
# Draw the validation entries from the unselected pool with a dedicated seeded
# generator: for the same unselected_dataset and select_range, re-running the
# notebook yields the same split (an unseeded random.sample picks a different
# subset on every run).
VALID_SAMPLE_SEED = 0
valid_dataset = random.Random(VALID_SAMPLE_SEED).sample(unselected_dataset, k=select_range)
valid_dataset[:5]

[('他决定利用业余时间宣传脆骨病，让大家关注脆弱的“瓷娃娃”。',
  'ta1 jue2 ding4 li4 yong4 ye4 yu2 shi2 jian1 xuan1 chuan2 cui4 gu3 bing4 rang4 da4 jia1 guan1 zhu4 cui4 ruo4 de5 ci2 wa2 wa5'),
 ('此间，内陆各省为了争上内陆第一核电站而拼得头破血流。',
  'ci3 jian1 nei4 lu4 ge4 sheng3 wei4 le5 zheng1 shang4 nei4 lu4 di4 yi1 he2 dian4 zhan4 er2 pin1 de5 tou2 po4 xue4 liu2'),
 ('最后，他的目光停留在那几个大字上：南无阿弥陀佛。',
  'zui4 hou4 ta1 de5 mu4 guang1 ting2 liu2 zai4 na4 ji3 ge5 da4 zi4 shang4 nan2 wu2 e1 mi5 tuo2 fo2'),
 ('早上在楼下汤粉店吃早餐，碰见我们领导，并热情的打招呼。',
  'zao3 shang4 zai4 lou2 xia4 tang1 fen3 dian4 chi1 zao3 can1 peng4 jian4 wo3 men5 ling2 dao3 bing4 re4 qing2 de5 da3 zhao1 hu5'),
 ('只要不引起笑场，快男、超女式的造型唱京剧也无妨，观众才是上帝。',
  'zhi3 yao4 bu4 yin2 qi3 xiao4 chang3 kuai4 nan2 chao1 nv3 shi4 de5 zao4 xing2 chang4 jing1 ju4 ye3 wu2 fang2 guan1 zhong4 cai2 shi4 shang4 di4')]

In [27]:
# Write the sampled validation entries to disk, two lines per entry:
# the text first, then its transcription.
with open("assets/text/mandarin_valid.txt", "w", encoding="utf-8") as f:
	f.writelines(f"{text}\n{phome}\n" for text, phome in valid_dataset)
print("done.")

done.
