### Create a dataframe containing pinyin of each character

In [7]:
import pandas as pd

with open('Unihan_Readings.txt') as file:
    lines = file.read().splitlines()

languages = ['kHanyuPinlu', 'kHanyuPinyin',
             'kMandarin', 'kTGHZ2013', 'kXHC1983']
cols = ['Unicode', 'Char', 'Language', 'Pinyin']

pron_list = [line.strip().split('\t') for line in lines]
pron_list = [[item.strip() for item in line] for line in pron_list]
pron_list = [line for line in pron_list if line[1] in languages]
pron_list = [[line[0],chr(eval('0x'+line[0][2:])), line[1], line[2]] for line in pron_list]

df = pd.DataFrame(pron_list,columns=cols)
#drop duplicate characters

df.sample(10)

Unnamed: 0,Unicode,Char,Language,Pinyin
73470,U+21E03,𡸃,kMandarin,xiǎn
93745,U+2939C,𩎜,kMandarin,pēi
5780,U+42BE,䊾,kHanyuPinyin,53369.080:mù
80281,U+24B37,𤬷,kMandarin,nà
58881,U+8FDE,连,kTGHZ2013,219.020:lián
79413,U+24639,𤘹,kMandarin,pí
39342,U+746D,瑭,kXHC1983,1118.020:táng
52283,U+8655,處,kMandarin,chù
7823,U+478B,䞋,kMandarin,chèn
37582,U+7213,爓,kMandarin,yàn


### Prune the Pinyin column to only contain pronunciations

In [8]:
def map_to_dict1(pinyin):
    pin_dict = {pinyin:1}
    return pin_dict

def map_to_dict2(pinyin):
    pins = pinyin.split(':')[1]
    pins = pins.split(',')
    pin_dict = dict.fromkeys(pins,1)
    return pin_dict

def map_to_dict3(pinyin):
    dirty_pins = pinyin.split(' ')
    clean_pins = [pin.split(':')[1] for pin in dirty_pins]
    pin_dict = dict.fromkeys(clean_pins,1)
    return pin_dict

def pinlu(pinyin):
    pinlus = pinyin.split(',')
    pinlu_dict = {pinlu.split('(')[0]: pinlu.split('(')[1].strip(')') for pinlu in pinlus}
    return pinlu_dict

df.loc[(df['Language'] == 'kMandarin'), 'Pinyin'] = df.loc[(df['Language'] == 'kMandarin')]['Pinyin'].apply(map_to_dict1)
df.loc[(df['Language'] == 'kHanyuPinyin'), 'Pinyin'] = df.loc[(df['Language'] == 'kHanyuPinyin')]['Pinyin'].apply(map_to_dict2)
df.loc[(df['Language'] == 'kTGHZ2013'), 'Pinyin'] = df.loc[(df['Language'] == 'kTGHZ2013')]['Pinyin'].apply(map_to_dict3)
df.loc[(df['Language'] == 'kXHC1983'), 'Pinyin'] = df.loc[(df['Language'] == 'kXHC1983')]['Pinyin'].apply(map_to_dict3)
df.loc[(df['Language'] == 'kHanyuPinlu'), 'Pinyin'] = df.loc[(df['Language'] == 'kHanyuPinlu')]['Pinyin'].apply(pinlu)

df.sample(10)

Unnamed: 0,Unicode,Char,Language,Pinyin
55113,U+8A69,詩,kHanyuPinlu,{'shī': '152'}
84600,U+26369,𦍩,kXHC1983,{'gǔ': 1}
5426,U+41F2,䇲,kMandarin,{'cè': 1}
79068,U+2438B,𤎋,kMandarin,{'cuì': 1}
51877,U+85B1,薱,kMandarin,{'duì': 1}
93625,U+29322,𩌢,kMandarin,{'suō': 1}
21575,U+5C6F,屯,kXHC1983,"{'tún': 1, 'zhūn': 1}"
50294,U+836F,药,kHanyuPinlu,{'yào': '407'}
62904,U+95ED,闭,kTGHZ2013,{'bì': 1}
27980,U+64DA,據,kHanyuPinlu,{'jù': '805'}


### Drop duplicate character entries

In [9]:
df_cleaned = df.drop_duplicates(subset = 'Char',keep='first')

### Export table to file

In [10]:
df_cleaned.to_csv('Character_Pinyin_Table.tsv', sep='\t', index=False)

### Import Character_Pinyin Table

In [11]:
table = pd.read_csv('Character_Pinyin_Table.tsv', sep='\t').set_index('Unicode')
table.loc[table['Char'] == '然']

Unnamed: 0_level_0,Char,Language,Pinyin
Unicode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
U+7136,然,kHanyuPinlu,{'rán': '3549) ran'}


### Add pinyin to a text one character at a time

In [13]:
def add_pin_1char(text):
    text_pin = ""
   
    for i in text: 
        if i in table['Char'].values:
            char = table.loc[(table['Char'] == i)]
            pron = char['Pinyin'][0].split(':')[0].strip("{'")
            word_pin = i + "(" + pron + ")"
            text_pin += word_pin
        else:
            word_pin = i
            text_pin += word_pin
    return text_pin

# pass a text to the function
with open('Test.txt') as f:
    text = f.read()

print(add_pin_1char(text))

冰(bīng)雪(xuě)运(yùn)动(dòng)蔚(wèi)然(rán)成(chéng)风(fēng)，冬(dōng)奥(ào)助(zhù)推(tuī)习(xí)近(jìn)平(píng)体(tǐ)育(yù)强(qiáng)国(guó)梦(mèng)
AMY QIN
2022年(nián)2月(yuè)16日(rì)
在(zài)北(běi)京(jīng)的(de)一(yī)个(gè)公(gōng)园(yuán)上(shàng)速(sù)滑(huá)课(kè)，摄(shè)于(yú)上(shàng)月(yuè)。冰(bīng)雪(xuě)运(yùn)动(dòng)热(rè)潮(cháo)正(zhèng)在(zài)中(zhōng)国(guó)出(chū)现(xiàn)。
在(zài)北(běi)京(jīng)的(de)一(yī)个(gè)公(gōng)园(yuán)上(shàng)速(sù)滑(huá)课(kè)，摄(shè)于(yú)上(shàng)月(yuè)。冰(bīng)雪(xuě)运(yùn)动(dòng)热(rè)潮(cháo)正(zhèng)在(zài)中(zhōng)国(guó)出(chū)现(xiàn)。 KEVIN FRAYER/GETTY IMAGES
北(běi)京(jīng)——在(zài)中(zhōng)国(guó)南(nán)方(fāng)城(chéng)市(shì)广(guǎng)州(zhōu)，虽(suī)然(rán)一(yī)年(nián)中(zhōng)大(dà)部(bù)分(fēn)时(shí)候(hou)天(tiān)气(qì)闷(mèn)热(rè)，但(dàn)儿(r)童(tóng)们(men)正(zhèng)在(zài)甩(shuǎi)掉(diào)拖(tuō)鞋(xié)，穿(chuān)上(shàng)滑(huá)雪(xuě)板(bǎn)，在(zài)室(shì)内(nèi)雪(xuě)坡(pō)上(shàng)滑(huá)雪(xuě)。
远(yuǎn)在(zài)西(xi)部(bù)青(qīng)藏(cáng)高(gāo)原(yuán)的(de)青(qīng)海(hǎi)省(shěng)出(chū)人(rén)意(yì)料(liào)地(de)成(chéng)了(le)冰(bīng)壶(hú)运(yùn)动(d

### Import a phrase dictionary

In [14]:
phrase = pd.read_csv('phrase.txt', sep='=',names=['Char','Pron'])
phrase.sample(10)

Unnamed: 0,Char,Pron
27199,好客,"hào,kè"
2823,东北,"dōng,běi"
47370,无线电接收机,"wú,xiàn,diàn,jiē,shōu,jī"
15980,卤代烃,"lǔ,dài,tīng"
20359,和蔼,"hé,ǎi"
3830,丰姿,"fēng,zī"
70933,纵梁,"zòng,liáng"
22052,回馈,"huí,kuì"
68421,稻草人,"dào,cǎo,rén"
3725,中纪委,"zhōng,jì,wěi"


### Use the phrase dictionary to add pinyin to chracters two at a time

In [15]:
phrase = phrase.set_index('Char')

In [16]:
# define function to add pinyin to all characters in a text, two characters at a time
def add_pin_2char(text):
    text_pin = ""
    
    i=0
    while i < len(text)-1:
        if text[i:i+2] in phrase.index: 
            char = phrase.loc[text[i:i+2]]
            pron = char['Pron'].split(",")
            word_pin = text[i] + '(' + pron[0] + ')' + text[i+1] + '(' + pron[1] + ')' 
            text_pin += word_pin
            i += 2
        elif text[i] in table['Char'].values:
            char = table.loc['U+'+hex(ord(text[i])).upper()[2:]]
            pron = list(eval(char['Pinyin']).keys())[0]
            word_pin = text[i] + "(" + pron + ")"
            text_pin += word_pin
            i += 1
        else:
            word_pin = text[i]
            text_pin += word_pin
            i += 1

    return text_pin

# pass a text to the function
with open('Baidu.txt') as f:
    text = f.read()

print(add_pin_2char(text))

中(zhōng)国(guó)历(lì)史(shǐ)上(shàng)的(de)重(zhòng)大(dà)事(shì)件(jiàn)，你(nǐ)知(zhī)道(dào)几(jǐ)件(jiàn)？

如(rú)果(guǒ)直(zhí)留(liú)下(xià)妹(mèi)

发(fā)布(bù)时(shí)间(jiān): 2021-12-02 16:51
关(guān)注(zhù)
1、秦(qín)始(shǐ)皇(huáng)统(tǒng)一(yī)中(zhōng)国(guó)

秦(qín)始(shǐ)皇(huáng)是(shì)出(chū)生(shēng)于(yú)赵(zhào)国(guó)都(dū)城(chéng)邯(hán)郸(dān)，前(qián)247年(nián)，13岁(suì)时(shí)即(jí)王(wáng)位(wèi)。22岁(suì)时(shí)，开(kāi)始(shǐ)“亲(qīn)理(lǐ)朝(cháo)政(zhèng)”，自(zì)前(qián)230年(nián)至(zhì)前(qián)221年(nián)，先(xiān)后(hòu)灭(miè)韩(hán)、赵(zhào)、魏(wèi)、楚(chu)、燕(yàn)、齐(qí)六(liù)国(guó)，39岁(suì)时(shí)完(wán)成(chéng)了(le)统(tǒng)一(yī)中(zhōng)国(guó)大(dà)业(yè)，建(jiàn)立(lì)起(qǐ)一(yī)个(gè)以(yǐ)汉(hàn)族(zú)为(wéi)主(zhǔ)体(tǐ)统(tǒng)一(yī)的(de)中(zhōng)央(yāng)集(jí)权(quán)的(de)强(qiáng)大(dà)国(guó)家(jiā)——秦(qín)朝(cháo)，并(bìng)奠(diàn)定(dìng)中(zhōng)国(guó)本(běn)土(tǔ)的(de)疆(jiāng)域(yù)。秦(qín)王(wáng)政(zhèng)灭(miè)六(liù)国(guó)后(hòu)，认(rèn)为(wéi)自(zì)己(jǐ)“德(dé)兼(jiān)三(sān)皇(huáng)，功(gōng)过(guò)五(wǔ)帝(dì)”，遂(suì)采(cǎi)用(yòng)三(sān)皇(huáng)之(zhī)“皇(hu