In [2]:
import fileinput
import os
import numpy as np
from tqdm.notebook import tqdm
import pickle


In [4]:
# Number the characters: every character of the reference charset gets a
# dense integer index equal to its position in ref/charlist.txt (GBK-encoded).
# These indices are what the later cells use to address the count arrays.
num_dict = {}
num = 0
with fileinput.input(files='ref/charlist.txt',
                     openhook=fileinput.hook_encoded('gbk')) as f:
    for line in f:  # the charset is expected to sit on a single line
        # Strip line terminators first: otherwise a trailing newline would be
        # numbered as if it were a character of the charset and inflate `num`.
        for char in line.rstrip('\r\n'):
            num_dict[char] = num
            num += 1
print('All {} chars'.format(num))
# Spot-check a few well-known characters.
print(num_dict['中'])
print(num_dict['国'])
print(num_dict['家'])


All 6763 chars
3619
935
1177


In [5]:
# Map each word to the indices of its candidate characters.
# Every non-blank line of ref/word-chars.txt (GBK-encoded) is read as
# whitespace-separated tokens: the first token is the word (e.g. 'cao'),
# the remaining tokens are characters that must all be present in num_dict
# (an unknown character raises KeyError on purpose, so bad data is noticed).
map_word_num = {}
with fileinput.input(files='ref/word-chars.txt',
                     openhook=fileinput.hook_encoded('gbk')) as f:
    for line in f:
        fields = line.split()  # split once instead of once per use
        if not fields:
            # blank or trailing empty line: nothing to map
            continue
        word, chars = fields[0], fields[1:]
        # Built in place; the character counter `num` from the numbering cell
        # is intentionally left untouched.
        map_word_num[word] = [num_dict[char] for char in chars]

print(map_word_num['cao'])

# Make sure the output directory exists so a fresh checkout can run the cell.
os.makedirs('data', exist_ok=True)
with open('data/map_word_num', 'wb') as f:
    pickle.dump(map_word_num, f)

[244, 4960, 247, 4149, 248, 4554, 5064, 245, 6415, 6301, 246]


In [4]:
# Count characters and adjacent-character pairs over the corpus (time-hungry).
#   char_count[i]    : occurrences of character i
#   mesh_count[i][j] : occurrences of character i immediately followed by j
# Characters that are not in num_dict are not counted and break a pair.
# NOTE: counts are int32; a value above 2**31 - 1 would wrap around silently.
n_chars = max(num_dict.values()) + 1  # 6763 for the charlist used here
mesh_count = np.zeros(shape=(n_chars, n_chars), dtype=np.int32)
char_count = np.zeros(shape=(n_chars), dtype=np.int32)

# Only regular files, in a deterministic order (os.listdir order is arbitrary
# and may contain directories such as .ipynb_checkpoints).
res_files = sorted(
    path
    for path in (os.path.join('ref/corpus', x) for x in os.listdir('ref/corpus'))
    if os.path.isfile(path))
print('Resource files:', res_files)
if not res_files:
    # fileinput falls back to reading sys.stdin when it is given no files
    raise FileNotFoundError('no corpus files found in ref/corpus')

os.makedirs('data', exist_ok=True)
files_started = 0  # number of corpus files whose first line has been seen
with fileinput.input(files=res_files,
                     openhook=fileinput.hook_encoded('utf-8', 'ignore')) as f:
    for line in f:
        first_line = fileinput.isfirstline()
        if first_line:
            print('File {}'.format(fileinput.filename()))
            if files_started > 0:
                # Checkpoint the counts gathered from the files read so far.
                # The suffix is our own running file number:
                # fileinput.fileno() is the OS file descriptor, which is
                # normally reused from file to file, so using it made every
                # checkpoint overwrite the previous one. Nothing is saved
                # before the first file because the arrays are still zero.
                np.save('data/mesh_count' + str(files_started), mesh_count)
                np.save('data/char_count' + str(files_started), char_count)
            files_started += 1

        # Drop separators so the characters on either side become adjacent.
        line = line.replace(' ', '').replace('\t', '').replace('/', '')
        if first_line:
            print(line)

        prev = None  # index of the previous character, None if not in num_dict
        for char in line:
            cur = num_dict.get(char)
            if cur is not None:
                char_count[cur] += 1
                if prev is not None:
                    mesh_count[prev, cur] += 1
            prev = cur


# Sanity check: the pair '中国' and the single character '中'.
print(mesh_count[num_dict['中']][num_dict['国']])
print(char_count[num_dict['中']])
np.save('data/mesh_count', mesh_count)
np.save('data/char_count', char_count)

        



Resource files: ['ref/corpus/00-2016-02-utf.txt', 'ref/corpus/01-dgk_shooter_min.txt', 'ref/corpus/02-2016-11.txt', 'ref/corpus/03-dev.txt']
File ref/corpus/00-2016-02-utf.txt


{"html":"原标题：快讯：台湾高雄6.7级地震多幢大楼倒塌不断传出呼救声中国地震台网测定，今日3时57分在台湾高雄市发生6.7级地震。据台媒报道，地震释放能量相当于两颗原子弹。台南市多处楼房倾斜倒塌。其中，台南市永大路二段一栋住宅大楼倒塌，整栋建筑物倒榻在马路上，建筑物内不断传出呼救声。#高雄6.7级地震#","time":"2016-02-0606:45","title":"快讯：台湾高雄6.7级地震多幢大楼倒塌不断传出呼救声","url":"http:news.sina.com.cno2016-02-06doc-ifxpfhzk9008548.shtml"}



File ref/corpus/01-dgk_shooter_min.txt


M畹华吾侄



File ref/corpus/02-2016-11.txt


{"html":"新华社贝鲁特10月31日电（记者李良勇）黎巴嫩国民议会31日举行总统选举投票，基督教政党“自由爱国运动”创始人、议员米歇尔·奥恩当选总统。当天，除一名于数周前辞职的议员外，黎巴嫩议会其余127名议员全部参加投票，超过举行总统选举投票所需的法定人数。根据黎巴嫩宪法，议会只有在三分之二以上议员出席的情况下才能举行总统选举投票。而且，候选人需获得三分之二选票才能当选，否则需举行第二轮投票，在第二轮投票中赢得绝对多数选票者当选。经过两轮投票，奥恩当选新任总统。随后，他在议会宣誓就职。奥恩现年81岁，曾任黎巴嫩武装部队总司令、临时军政府总理。他于1991年至2005年流亡法国，后返回黎巴嫩，参加议会选举并当选议员。黎巴嫩宪法规定，总统由基督教马龙派人士担任，任期6年，不得连选连任。黎巴嫩前总统苏莱曼2014年5月卸任后，黎主要政治派别在总统人选问题上争执不休，导致议会45次推迟举行总统选举投票。（完）责任编辑：刘德宾SN222","time":"2016-11-0100:09","title":"黎巴嫩基督教政党议员米歇尔-奥恩当选总统","url":"http:news.sina.com.cnwzx2016-11-01doc-ifxxfysn8299529.shtml"}



File ref/corpus/03-dev.txt


1myhometown的中文歌词很感动英文版是小田亲自翻译的更加感动求v_url_thanks3



97050
461106


In [26]:
# Convert the raw counts into negative log-probabilities and save them:
#   mesh_prob_log[i][j] = -log(LAMBDA * count(i, j) / count(i)
#                              + (1 - LAMBDA) * count(j) / total)
#   char_prob_log[i]    = -log(count(i) / total)
# i.e. the pair estimate is linearly interpolated with the single-character
# frequency of the second character, weighted by LAMBDA.
char_count = np.load('data/char_count.npy').astype(np.float32)
mesh_count = np.load('data/mesh_count.npy').astype(np.float32)

# Tiny offset so characters that never occur do not cause a division by zero
# or log(0) below.
char_count += 0.000001
for first, second in (('清', '华'), ('氰', '化')):
    print(mesh_count[num_dict[first]][num_dict[second]])

LAMBDA = 0.934

# Column vector of per-character totals: divides each row i by count(i).
first_char_total = char_count.reshape(-1, 1)
mesh_prob_log = -np.log(LAMBDA * (mesh_count / first_char_total) +
                        (1 - LAMBDA) * char_count / char_count.sum())
char_prob_log = -np.log(char_count / char_count.sum())

np.save('data/mesh_prob_log', mesh_prob_log)
np.save('data/char_prob_log', char_prob_log)

# Spot checks: smaller values mean more likely pairs.
for first, second in (('一', '只'), ('颐', '指'), ('一', '直'),
                      ('清', '华'), ('氰', '化')):
    print(mesh_prob_log[num_dict[first]][num_dict[second]])


1019.0
129.0


5.312501
2.793043
3.5695128
4.050872
0.44474122
