## Pinyin Transformation

In [23]:
import pandas as pd
import numpy as np
from pypinyin import pinyin, lazy_pinyin, Style
from random import randint

In [603]:
# Probe: with heteronym=True, pinyin() lists every candidate reading per character.
pinyin('中心', heteronym=True)

[['zhōng', 'zhòng'], ['xīn']]

In [604]:
# Load the name table from the workbook.
# No `encoding` argument: .xlsx is not a plain-text format, so the keyword
# had no effect on the data, and current pandas releases reject it.
people = pd.read_excel('1_answers.xlsx')

In [606]:
# Per-column summary (count / unique / top / freq) of the loaded table.
people.describe()

Unnamed: 0,en,ch
count,58320,58320
unique,58320,42924
top,Krey,莫尔
freq,1,18


In [607]:
# Keep the first two readings pypinyin lists for '勒'; lei1 is later swapped
# for le4 in names that contain the character.
# heteronym takes a boolean; the string 'True' only worked because any
# non-empty string is truthy.
le_readings = pinyin('勒', heteronym=True)[0]
lei1 = le_readings[0]
le4 = le_readings[1]
print(lei1, le4)

lēi lè


In [608]:
# Spot-check 30 randomly drawn rows (the same row may be drawn more than once).
# randint() includes both end points, so the upper bound has to be
# len(people) - 1; len(people) itself is not a valid position for iloc.
rl = [randint(0, len(people) - 1) for _ in range(30)]
for index, dp in people.iloc[rl].iterrows():
    print(dp['en'], dp['ch'], pinyin(dp['ch'], errors='ignore'))

Arminel 阿米妮尔 [['ā'], ['mǐ'], ['nī'], ['ěr']]
Brank 布兰克 [['bù'], ['lán'], ['kè']]
Hutchings 哈钦斯 [['hā'], ['qīn'], ['sī']]
Nightingale 奈廷格尔 [['nài'], ['tíng'], ['gé'], ['ěr']]
Thapar 撒帕尔 [['sā'], ['pà'], ['ěr']]
Graue 格劳厄 [['gé'], ['láo'], ['è']]
Sigerith 西格里思 [['xī'], ['gé'], ['lǐ'], ['sī']]
Schmidlapp 施米德拉普 [['shī'], ['mǐ'], ['dé'], ['lā'], ['pǔ']]
Robie 罗比 [['luó'], ['bǐ']]
Wahlquist 瓦尔奎斯特 [['wǎ'], ['ěr'], ['kuí'], ['sī'], ['tè']]
Hinsley 欣斯利 [['xīn'], ['sī'], ['lì']]
Fonda 方达 [['fāng'], ['dá']]
Ritchford 里奇福德 [['lǐ'], ['qí'], ['fú'], ['dé']]
Vezian 韦齐恩 [['wéi'], ['qí'], ['ēn']]
Kunsemiller 孔斯米勒 [['kǒng'], ['sī'], ['mǐ'], ['lēi']]
Edey 埃迪 [['āi'], ['dí']]
Brilliant 布里连特 [['bù'], ['lǐ'], ['lián'], ['tè']]
Eudaly 尤德利 [['yóu'], ['dé'], ['lì']]
Griesemer 格里塞默 [['gé'], ['lǐ'], ['sāi'], ['mò']]
Dearmore 迪尔莫尔 [['dí'], ['ěr'], ['mò'], ['ěr']]
Rindskopf 林德斯科普夫 [['lín'], ['dé'], ['sī'], ['kē'], ['pǔ'], ['fū']]
Starey 斯塔里 [['sī'], ['tǎ'], ['lǐ']]
Boyse 博伊斯 [['bó'], ['yī'], ['sī']]
Scobey 斯科比 [['

In [610]:
# Check that pinyin() returns one entry per character of every Chinese name,
# and print any name for which it returns nothing at all.
for en_name, ch_name in zip(people['en'], people['ch']):
    readings = pinyin(ch_name, errors='ignore')
    assert len(readings) == len(ch_name)
    if not readings:
        print(en_name, ch_name, readings)

所有字都有拼音！

In [609]:
# Tone-marked pinyin for every name: the first reading of each character,
# space-separated. Names containing '勒' get lei1 replaced by le4.
pys = []
for ch_name in people['ch']:
    syllables = [readings[0] for readings in pinyin(ch_name, errors='ignore')]
    joined = ' '.join(syllables)
    pys.append(joined.replace(lei1, le4) if '勒' in ch_name else joined)
        

In [611]:
# Attach the tone-marked pinyin strings as a new column.
people['pinyin'] = pys

In [612]:
# Summary again, now including the new 'pinyin' column.
people.describe()

Unnamed: 0,en,ch,pinyin
count,58320,58320,58320
unique,58320,42924,42052
top,Krey,莫尔,xī ěr
freq,1,18,25


寻找跟"勒"一样可能出问题的字, 起码要防止高频字出问题。

In [613]:
# Count how many times each character occurs across all Chinese names.
ch_dict = {}
for name in people['ch']:
    for char in name:
        ch_dict[char] = ch_dict.get(char, 0) + 1

In [614]:
# Number of distinct characters seen.
len(ch_dict)

445

In [615]:
# Order the characters from most to least frequent, as (char, count) pairs.
# Passing through dict() keeps the cell re-runnable: it accepts both the
# original mapping and the list of pairs this cell leaves behind in ch_dict
# (a bare list has no .items(), so a second run used to fail).
ch_dict = sorted(dict(ch_dict).items(), key=lambda item: item[1], reverse=True)

In [618]:
# Characters in the order given by ch_dict (a list of (char, count) pairs here).
ch_chars = [ch[0] for ch in ch_dict]

# One example row per character, labelled by the character's position in ch_chars.
inst_445 = pd.DataFrame(columns=['en', 'ch', 'char', 'pinyin'])


def find_inst(pointer):
    """Record the first name containing each character from ``pointer`` onward.

    For every character in ``ch_chars[pointer:]``, the first row of ``people``
    whose 'ch' value contains that character is stored in the global
    ``inst_445`` under the character's position, and the position and
    character are printed.

    This is a plain loop bounded by ``len(ch_chars)``. The earlier version
    made one recursive call per character and stopped only at a hard-coded
    row count of 445, so it could index past the end of ``ch_chars`` and its
    call depth grew with the number of characters.

    Args:
        pointer: Position in ``ch_chars`` at which to start.

    Returns:
        None. Results are written to ``inst_445``.
    """
    global inst_445
    total = len(ch_chars)
    for rank in range(pointer, total):
        if len(inst_445) >= total:
            # Every character already has an example row.
            return
        char = ch_chars[rank]
        # Column-wise iteration avoids building a Series per row.
        for en_name, ch_name, py in zip(people['en'], people['ch'], people['pinyin']):
            if char in ch_name:
                print('[' + str(rank) + ']', char)
                inst_445.loc[rank] = [en_name, ch_name, char, py]
                break


find_inst(0)

In [498]:
# Scratch check of how `.loc[label] = [...]` adds a row to an empty frame.
# It uses its own name: rebinding inst_445 here would replace the collected
# example rows with this placeholder when the notebook is run top to bottom,
# and the following export cell would then write the placeholder to disk.
inst_scratch = pd.DataFrame(columns=['en', 'ch', 'char', 'pinyin'])
inst_scratch.loc[0] = [1, 2, 3, 4]

In [503]:
# Write inst_445 to an Excel workbook, leaving out the row index.
inst_445.to_excel('instance_445.xlsx', index=False)

In [223]:
# Selecting according to criterion
# people.loc[people.index.map(lambda ind: '勒' in people.loc[ind]['ch'])] 

In [619]:
# Readings of '塞': swap sai1 for sai4 in every name containing the character.
sai_readings = pinyin('塞', heteronym=True)[0]
sai4 = sai_readings[2]
sai1 = sai_readings[0]
print(sai1, sai4)

# Assign through .loc on the frame itself. Rows yielded by iterrows() may be
# copies, so writing to them is not guaranteed to change `people`.
has_sai = people['ch'].str.contains('塞', regex=False, na=False)
people.loc[has_sai, 'pinyin'] = people.loc[has_sai, 'pinyin'].str.replace(sai1, sai4, regex=False)

sāi sài


In [620]:
# Readings of '什': swap shen2 for shi2 in every name containing the character.
shi_readings = pinyin('什', heteronym=True)[0]
shen2 = shi_readings[0]
shi2 = shi_readings[1]
print(shen2, shi2)

# Assign through .loc on the frame itself. Rows yielded by iterrows() may be
# copies, so writing to them is not guaranteed to change `people`.
has_shi = people['ch'].str.contains('什', regex=False, na=False)
people.loc[has_shi, 'pinyin'] = people.loc[has_shi, 'pinyin'].str.replace(shen2, shi2, regex=False)

shén shí


In [621]:
# Readings of '耶': swap ye2 for ye1 in every name containing the character.
ye_readings = pinyin('耶', heteronym=True)[0]
ye2 = ye_readings[0]
ye1 = ye_readings[2]
print(ye2, ye1)

# Assign through .loc on the frame itself. Rows yielded by iterrows() may be
# copies, so writing to them is not guaranteed to change `people`.
has_ye = people['ch'].str.contains('耶', regex=False, na=False)
people.loc[has_ye, 'pinyin'] = people.loc[has_ye, 'pinyin'].str.replace(ye2, ye1, regex=False)

yé yē


In [622]:
# Readings of '肖': swap xiao4 for xiao1 in every name containing the character.
xiao_readings = pinyin('肖', heteronym=True)[0]
xiao4 = xiao_readings[0]
xiao1 = xiao_readings[1]
print(xiao4, xiao1)

# Assign through .loc on the frame itself. Rows yielded by iterrows() may be
# copies, so writing to them is not guaranteed to change `people`.
has_xiao = people['ch'].str.contains('肖', regex=False, na=False)
people.loc[has_xiao, 'pinyin'] = people.loc[has_xiao, 'pinyin'].str.replace(xiao4, xiao1, regex=False)

xiào xiāo


In [623]:
# Readings of '曾': swap ceng2 for zeng1 in every name containing the character.
zeng_readings = pinyin('曾', heteronym=True)[0]
ceng2 = zeng_readings[0]
zeng1 = zeng_readings[1]
print(ceng2, zeng1)

# Assign through .loc on the frame itself. Rows yielded by iterrows() may be
# copies, so writing to them is not guaranteed to change `people`.
has_zeng = people['ch'].str.contains('曾', regex=False, na=False)
people.loc[has_zeng, 'pinyin'] = people.loc[has_zeng, 'pinyin'].str.replace(ceng2, zeng1, regex=False)

céng zēng


In [624]:
# Readings of '佛': swap fu2 for fo1 in every name containing the character.
fo_readings = pinyin('佛', heteronym=True)[0]
fu2 = fo_readings[0]
fo1 = fo_readings[3]
print(fu2, fo1)

# Assign through .loc on the frame itself. Rows yielded by iterrows() may be
# copies, so writing to them is not guaranteed to change `people`.
has_fo = people['ch'].str.contains('佛', regex=False, na=False)
people.loc[has_fo, 'pinyin'] = people.loc[has_fo, 'pinyin'].str.replace(fu2, fo1, regex=False)

fú fó


In [625]:
# Readings of '兴': swap xing4 for xing1 in every name containing the character.
xing_readings = pinyin('兴', heteronym=True)[0]
xing4 = xing_readings[0]
xing1 = xing_readings[1]
print(xing4, xing1)

# Assign through .loc on the frame itself. Rows yielded by iterrows() may be
# copies, so writing to them is not guaranteed to change `people`.
has_xing = people['ch'].str.contains('兴', regex=False, na=False)
people.loc[has_xing, 'pinyin'] = people.loc[has_xing, 'pinyin'].str.replace(xing4, xing1, regex=False)

xìng xīng


In [634]:
# Show every row whose Chinese name contains '柏'.
people.loc[people['ch'].map(lambda name: '柏' in name)]

Unnamed: 0,en,ch,pinyin
27292,Homburg,杭柏格,háng bǎi gé


In [633]:
# Row 21885: swap the first listed reading of '藏' for the second.
# One .iloc[row, column] assignment is used because
# people.iloc[row]['pinyin'] = ... writes to a temporary row object and is
# not guaranteed to reach `people`.
cang_readings = pinyin('藏', heteronym=True)[0]
pinyin_col = people.columns.get_loc('pinyin')
people.iloc[21885, pinyin_col] = people.iloc[21885, pinyin_col].replace(cang_readings[0], cang_readings[1])

In [635]:
# Row 27292: swap the first listed reading of '柏' for the second.
# One .iloc[row, column] assignment is used because
# people.iloc[row]['pinyin'] = ... writes to a temporary row object and is
# not guaranteed to reach `people`.
bai_readings = pinyin('柏', heteronym=True)[0]
pinyin_col = people.columns.get_loc('pinyin')
people.iloc[27292, pinyin_col] = people.iloc[27292, pinyin_col].replace(bai_readings[0], bai_readings[1])

In [546]:
# Probe: list the candidate readings pypinyin has for '晤'.
pinyin('晤', heteronym=True)

[['wù']]

In [549]:
# Probe: toneless output for a name containing '什'.
lazy_pinyin('保罗什')

['bao', 'luo', 'shen']

In [637]:
# Toneless pinyin for every name, space-separated. The assert guards that
# lazy_pinyin() returns one item per character of the name.
pys = []
for ch_name in people['ch']:
    assert len(lazy_pinyin(ch_name)) == len(ch_name)
    pys.append(' '.join(lazy_pinyin(ch_name, errors='ignore')))

In [638]:
# Attach the toneless pinyin strings as a new column.
people['pinyin_no_tone'] = pys

# Characters whose readings are corrected by cells in this notebook.
# NOTE(review): this list is not referenced by any cell visible here.
errs = ['勒', '塞', '什', '耶', '曾', '佛', '兴', '藏', '柏', '肖']

In [652]:
# Toneless counterpart of the '勒' correction: 'lei' -> 'le' in every name
# containing the character.
# Assign through .loc on the frame itself. Rows yielded by iterrows() may be
# copies, so writing to them is not guaranteed to change `people`.
has_le = people['ch'].str.contains('勒', regex=False, na=False)
people.loc[has_le, 'pinyin_no_tone'] = people.loc[has_le, 'pinyin_no_tone'].str.replace('lei', 'le', regex=False)

In [655]:
# Show every row whose Chinese name contains '耶'.
people.loc[people['ch'].map(lambda name: '耶' in name)]

Unnamed: 0,en,ch,pinyin,pinyin_no_tone
193,Achtemeier,阿赫特迈耶,ā hè tè mài yē,a he te mai ye
510,Ahier,阿耶,ā yē,a ye
988,Allgaier,奥尔盖耶,ào ěr gài yē,ao er gai ye
989,Allgeier,奥尔盖耶,ào ěr gài yē,ao er gai ye
1135,Altemeier,阿尔特迈耶,ā ěr tè mài yē,a er te mai ye
1159,Altmeyer,奥特迈耶,ào tè mài yē,ao te mai ye
1549,Anglemyer,安格尔迈耶,ān gé ěr mài yē,an ge er mai ye
2189,Aschemeyer,阿谢迈耶,ā xiè mài yē,a xie mai ye
2382,Atiyah,阿提耶,ā tí yē,a ti ye
2383,Atiyeh,阿提耶,ā tí yē,a ti ye


In [658]:
# Save the describe() summary of the final table to an Excel workbook.
people.describe().to_excel('summary.xlsx')

In [659]:
# Save the full table, without the row index, to an Excel workbook.
# No `encoding` argument: it does not apply to .xlsx output, and
# DataFrame.to_excel no longer accepts it in current pandas releases.
people.to_excel('people.xlsx', index=False)