In [22]:
import fileinput
import os
import numpy as np
from tqdm.notebook import tqdm
import pickle


In [23]:
# Number the characters: every character found in ref/charlist.txt (decoded
# as GBK) gets the next consecutive integer, starting at 0, in file order.
# `num_dict` maps character -> index; `num` ends up as the number of
# characters that were numbered.
num_dict = {}
num = 0
with fileinput.input(files='ref/charlist.txt',
                     openhook=fileinput.hook_encoded('gbk')) as f:
    for line in f: # only one line
        # Drop the line terminator first: otherwise a trailing '\n' would be
        # numbered like a real character and inflate the count.
        for char in line.rstrip('\r\n'):
            num_dict[char] = num
            num += 1
print('All {} chars'.format(num))
# Sanity check: indices of three well-known characters.
print(num_dict['中'])
print(num_dict['国'])
print(num_dict['家'])


All 6763 chars
3619
935
1177


In [24]:
# Map words to their characters' numbers.
# Each non-blank line of ref/word-chars.txt (decoded as GBK) is split on
# whitespace: the first field is the word (e.g. 'cao'), every remaining field
# is looked up in `num_dict`. A field missing from `num_dict` raises KeyError.
map_word_num = {}
with fileinput.input(files='ref/word-chars.txt',
                     openhook=fileinput.hook_encoded('gbk')) as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank line (e.g. trailing newline at end of file): nothing to map.
            continue
        word, chars = fields[0], fields[1:]
        # Built inline so the global counter `num` from the numbering cell is
        # not overwritten with a list.
        map_word_num[word] = [num_dict[char] for char in chars]

print(map_word_num['cao'])

# Persist the table; make sure the output directory exists first.
os.makedirs('data', exist_ok=True)
with open('data/map_word_num', 'wb') as f:
    pickle.dump(map_word_num, f)

[244, 4960, 247, 4149, 248, 4554, 5064, 245, 6415, 6301, 246]


In [27]:
# Count characters and adjacent-character pairs over the corpus (time-hungry).
#   char_count[i]    = occurrences of character i
#   mesh_count[i, j] = occurrences of character i immediately followed by j
#                      on the same line
# Characters that are not in `num_dict` are ignored and break a pair.

# One row/column per assigned character index (6763 for the recorded run).
n_chars = max(num_dict.values()) + 1
mesh_count = np.zeros(shape=(n_chars, n_chars), dtype=np.int32)
char_count = np.zeros(shape=(n_chars), dtype=np.int32)

# Sorted so the processing order does not depend on os.listdir() ordering.
res_files = sorted(os.listdir('ref/corpus'))
if 'README.txt' in res_files:
    res_files.remove('README.txt')
res_files = [os.path.join('ref/corpus', x) for x in res_files]
print('Resource files:', res_files)

# NOTE(review): the corpus is decoded with the platform default encoding,
# while the reference files above are read as GBK -- confirm this is intended.
files_done = 0  # number of corpus files (with at least one line) fully counted
with fileinput.input(files=res_files) as f:
    for line in f:
        if fileinput.isfirstline():
            print('File {}'.format(fileinput.filename()))
            # Checkpoint the counts accumulated over the files finished so
            # far. The suffix is the number of completed files rather than
            # fileinput.fileno(), which is an OS file descriptor and not a
            # file index. Nothing is written before the first file because
            # the counts would still be all zeros.
            if files_done > 0:
                np.save('data/mesh_count' + str(files_done), mesh_count)
                np.save('data/char_count' + str(files_done), char_count)
            files_done += 1

        # Walk the line once, remembering the index of the previous character
        # (None when it is not in num_dict) so each character needs only one
        # dictionary lookup.
        prev_num = None
        for char in line:
            cur_num = num_dict.get(char)
            if cur_num is not None:
                char_count[cur_num] += 1
                if prev_num is not None:
                    mesh_count[prev_num, cur_num] += 1
            prev_num = cur_num

# Sanity check on the pair '中' -> '国' and on the count of '中'.
print(mesh_count[num_dict['中'], num_dict['国']])
print(char_count[num_dict['中']])
np.save('data/mesh_count', mesh_count)
np.save('data/char_count', char_count)
    
        


Resource files: ['ref/corpus/2016-11.txt', 'ref/corpus/conversations.yml', 'ref/corpus/emotion.yml', 'ref/corpus/humor.yml', 'ref/corpus/psychology.yml']
File ref/corpus/2016-11.txt


File ref/corpus/conversations.yml


File ref/corpus/emotion.yml


File ref/corpus/humor.yml


File ref/corpus/psychology.yml


11961
48954


In [28]:
# Calc -log of the smoothed transition probability.
# With c = char_count + 1 and m = mesh_count, entry [i, j] is
#   -log( ALPHA * m[i, j] / c[i]  +  BETA * c[j] / sum(c) )
# i.e. a weighted mix of "j follows i" and the overall frequency of j.
mesh_count = np.load('data/mesh_count.npy')
char_count = np.load('data/char_count.npy')
char_count += 1  # add one so no row is divided by zero and no log(0) occurs

# Mixing weights; they do not sum to 1, so the mix is not a normalised
# probability.
ALPHA = 1.2
BETA = 0.1

# Vectorised total with an explicit 64-bit accumulator: the builtin sum()
# loops over the array in Python and may accumulate in int32.
total_chars = char_count.sum(dtype=np.int64)

# reshape(-1, 1) divides row i by c[i]; the 1-D term broadcasts across
# columns, so it is indexed by the second character j.
mesh_prob_log = -np.log(ALPHA * (mesh_count / char_count.reshape(-1, 1)) +
                        BETA * char_count / total_chars)


np.save('data/mesh_prob_log', mesh_prob_log)


In [29]:
# Spot-check the cost table. First line: costs of going from index 3619 ('中')
# to 935 ('国') and to 950. Second line: costs of going from each of those two
# indices to 1177 ('家').
costs_from_3619 = mesh_prob_log[3619]
print(costs_from_3619[935], costs_from_3619[950])
costs_into_1177 = mesh_prob_log[:, 1177]
print(costs_into_1177[935], costs_into_1177[950])



1.2245439035581083 6.93092951834797
1.8897885352198602 7.9638883506457505


In [30]:
# Compare the costs of two candidate successors (indices 3598 and 3597)
# following index 3222, one value per line.
for successor in (3598, 3597):
    print(mesh_prob_log[3222, successor])

5.5901944826186245
15.589776062881741
