In [1]:
import os
import pandas as pd
import pickle
from collections import Counter
import jieba.posseg as psg    # posseg可标注词语的词性
from cnradical import Radical, RunOption    # cnradical工具包可获取一个字的偏旁部首和拼音
import shutil
from random import shuffle

In [None]:
# Debug aid: print the location of the pandas package loaded by this kernel.
print(pd.__file__)
import sys
# Add the project folder to the import path so the local data_process module can be imported.
sys.path.append('C:/Users/25405/Desktop/KG')
from data_process import split_text

In [None]:
# Folder with the raw corpus; the code below reads `{idx}.txt` (text) and `{idx}.ann` (annotations) from it.
train_dir = 'C:/Users/25405/Desktop/KG/train'

In [None]:
def process_text(idx, split_method = None, split_name = 'train'):
    """
    Read one document, split it into samples, label every character and save
    the per-character feature table.

    Input is ``{train_dir}/{idx}.txt`` (raw text) and ``{train_dir}/{idx}.ann``
    (tab-separated annotations whose second column is
    ``"<class> <start> ... <end>"``; ``end`` is treated as exclusive).
    For every character the following columns are produced:

    * ``word``    - the character itself (normalised by ``clean_word`` below)
    * ``bound``   - position inside its jieba word: B / M / E / S
    * ``flag``    - part-of-speech tag of the jieba word containing it
    * ``label``   - ``B-<class>`` / ``I-<class>`` / ``O``
    * ``radical`` - radical from cnradical, ``UNK`` if there is none
    * ``pinyin``  - pinyin from cnradical, ``UNK`` if there is none

    Samples are separated by a row whose every column is ``sep``. The table is
    written space-separated to
    ``C:/Users/25405/Desktop/KG/data/prepare/{split_name}/{idx}.txt``.

    :param idx: file name (without extension) of the document to process
    :param split_method: optional callable turning the whole text into a list
        of samples; when ``None`` the file is split line by line
    :param split_name: name of the output sub-folder (e.g. ``'train'``/``'test'``)
    :return: None
    """
    import csv    # local import: only the QUOTE_NONE constant is needed

    data = {}

    # Load the samples.
    with open(f'{train_dir}/{idx}.txt', 'r', encoding = 'utf-8') as f:
        if split_method is None:
            texts = f.readlines()
        else:
            texts = split_method(f.read())
    data['word'] = texts

    # Build the character-level labels.
    tag_list = ['O' for s in texts for x in s]
    try:
        # The annotation file is plain tab-separated text, not quoted CSV:
        # disable quote handling so an entity text that starts with '"' cannot
        # swallow the following lines.
        tag = pd.read_csv(f'{train_dir}/{idx}.ann', header = None, sep = '\t',
                          quoting = csv.QUOTE_NONE)
        annotations = tag[1].tolist()
    except pd.errors.EmptyDataError:
        # An empty annotation file means the document has no entities.
        annotations = []
    for annotation in annotations:
        tag_item = annotation.split(' ')
        # First token is the class, second the start offset, last the end
        # offset (taking the last token also covers "s1 e1;s2 e2" spans).
        clas, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        tag_list[start] = 'B-' + clas
        for j in range(start + 1, end):
            tag_list[j] = 'I-' + clas
    assert len([x for s in texts for x in s]) == len(tag_list)

    # Part-of-speech and word-boundary features from jieba.
    word_bounds = ['M' for element in tag_list]
    word_flags = []    # one POS tag per character
    for text in texts:
        for word, flag in psg.cut(text):
            start = len(word_flags)
            word_flags += [flag] * len(word)
            if len(word) == 1:
                word_bounds[start] = 'S'
            else:
                word_bounds[start] = 'B'
                word_bounds[len(word_flags) - 1] = 'E'

    # Cut the flat feature lists back into one list per sample.
    tags = []
    bounds = []
    flags = []
    start = 0
    for s in texts:
        end = start + len(s)
        tags.append(tag_list[start : end])
        bounds.append(word_bounds[start : end])
        flags.append(word_flags[start : end])
        start = end
    data['bound'] = bounds
    data['flag'] = flags
    data['label'] = tags

    # Radical and pinyin features; characters without one are marked UNK.
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)

    def lookup(converter, ch):
        # Query the converter once per character instead of twice.
        value = converter.trans_ch(ch)
        return 'UNK' if value is None else value

    data['radical'] = [[lookup(radical, x) for x in s] for s in texts]
    data['pinyin'] = [[lookup(pinyin, x) for x in s] for s in texts]

    # Assemble the table: one row per character, a 'sep' row between samples.
    num_samples = len(texts)
    num_cols = len(data.keys())

    dataset = []
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))
        dataset += records + [['sep'] * num_cols]
    dataset = dataset[:-1]    # no separator after the last sample
    dataset = pd.DataFrame(dataset, columns = data.keys())
    save_path = f'C:/Users/25405/Desktop/KG/data/prepare/{split_name}/{idx}.txt'

    def clean_word(w):
        # Replace characters that would break the space-separated file, and
        # collapse all digits into a single token.
        if w == '\n':
            return 'LB'
        if w in [' ', '\t', '\u2003']:   # '\u2003' is the EM SPACE character
            return 'SPACE'
        if w.isdigit():
            return 'num'
        return w
    dataset['word'] = dataset['word'].apply(clean_word)
    # Make sure the target folder exists so the function also works when it is
    # called on its own.
    os.makedirs(os.path.dirname(save_path), exist_ok = True)
    dataset.to_csv(save_path, index = False, sep = ' ', encoding = 'utf-8')
    
def multi_process(split_method = None, train_ratio = 0.8, seed = None):
    """
    Rebuild the prepared data set: wipe the output folder, split the documents
    of ``train_dir`` randomly into train and test and run ``process_text`` on
    each of them (sequentially).

    :param split_method: forwarded to ``process_text``
    :param train_ratio: fraction of the documents assigned to the train split
    :param seed: optional seed for the shuffle; ``None`` (default) keeps the
        split random on every call, any other value makes it reproducible
    :return: None
    """
    from random import Random    # local import: only used for a seeded shuffle

    prepare_dir = 'C:/Users/25405/Desktop/KG/data/prepare/'
    # Start from a clean output folder (rmtree deletes it recursively).
    if os.path.exists(prepare_dir):
        shutil.rmtree(prepare_dir)
    for split_name in ('train', 'test'):
        os.makedirs(prepare_dir + split_name)

    # One id per text file. Going through the '.txt' files only ignores stray
    # or hidden files, and splitext keeps ids that contain dots intact.
    # Sorting gives a stable starting order for a seeded shuffle.
    idxs = sorted({os.path.splitext(name)[0]
                   for name in os.listdir(train_dir)
                   if name.endswith('.txt')})
    if seed is None:
        shuffle(idxs)
    else:
        Random(seed).shuffle(idxs)

    index = int(len(idxs) * train_ratio)    # first index of the test split
    train_ids = idxs[:index]
    test_ids = idxs[index:]

    for idx in train_ids:
        process_text(idx, split_method, 'train')
    for idx in test_ids:
        process_text(idx, split_method, 'test')
    
def mapping(l_data, threshold = 10, is_word = False, sep = 'sep', is_label = False):
    """
    Build an ``id -> item`` list and an ``item -> id`` dict from a flat list of
    tokens, ordered by descending frequency (ties keep first-seen order).

    * ``is_word=True``: ``PAD`` and ``UNK`` are added with artificially huge
      counts so they get ids 0 and 1, and items seen fewer than ``threshold``
      times are dropped.
    * ``is_label=True`` (and not ``is_word``): every item is kept and no
      special token is added.
    * otherwise: ``PAD`` is added (id 0) and every item is kept.

    :param l_data: iterable of hashable tokens
    :param threshold: minimum count for an item to be kept (``is_word`` only)
    :param is_word: treat the data as the word vocabulary
    :param sep: sample-separator token to exclude, or ``None`` to keep all
    :param is_label: treat the data as the label set
    :return: tuple ``(item, item2id)``
    """
    count = Counter(l_data)
    if sep is not None:
        # The separator may be absent (e.g. every file holds a single sample);
        # that must not raise.
        count.pop(sep, None)

    min_count = None    # None = keep every item
    if is_word:
        count['PAD'] = 100000001
        count['UNK'] = 100000000
        min_count = threshold
    elif not is_label:
        count['PAD'] = 100000001

    # sorted() is stable, so items with equal counts keep insertion order.
    ranked = sorted(count.items(), key = lambda kv: kv[1], reverse = True)
    item = [name for name, freq in ranked if min_count is None or freq >= min_count]
    item2id = {name: i for i, name in enumerate(item)}
    return item, item2id
    
    
def get_dict():
    """
    Build the vocabularies of all feature columns from the prepared train and
    test files and pickle them.

    Every ``*.txt`` file below ``.../data/prepare/train`` and
    ``.../data/prepare/test`` is read, the values of the columns ``word``,
    ``bound``, ``flag``, ``label``, ``radical`` and ``pinyin`` are collected
    and passed to ``mapping``. The result, a dict
    ``column -> (item list, item2id dict)``, is written to
    ``.../data/prepare/dict.pkl``.

    :return: None
    """
    from glob import glob    # glob.glob returns all paths matching a pattern

    prepare_dir = 'C:/Users/25405/Desktop/KG/data/prepare'
    columns = ['word', 'bound', 'flag', 'label', 'radical', 'pinyin']
    collected = {column: [] for column in columns}

    # glob order depends on the file system; sort so that ids of items with
    # equal frequency do not change between runs on the same files.
    files = sorted(glob(f'{prepare_dir}/train/*.txt')) + \
            sorted(glob(f'{prepare_dir}/test/*.txt'))
    for file in files:
        # Read every cell as a literal string: no NaN conversion of tokens
        # such as "NA"/"null" and no numeric type inference.
        df = pd.read_csv(file, sep = ' ', dtype = str, keep_default_na = False)
        for column in columns:
            collected[column] += df[column].tolist()

    map_dict = {}
    # mapping() returns a tuple (item list, item2id dict).
    map_dict['word'] = mapping(collected['word'], threshold = 20, is_word = True)
    map_dict['bound'] = mapping(collected['bound'])
    map_dict['flag'] = mapping(collected['flag'])
    map_dict['label'] = mapping(collected['label'], is_label = True)
    map_dict['radical'] = mapping(collected['radical'])
    map_dict['pinyin'] = mapping(collected['pinyin'])

    with open(f'{prepare_dir}/dict.pkl', 'wb') as f:
        pickle.dump(map_dict, f)    # serialise the mappings

    
if __name__ == '__main__':
    # Debugging snippets kept for reference: process a single document / list the document ids.
    #print(process_text('0', split_method = split_text, split_name = 'train'))
    #print(list(set([file.split('.')[0] for file in os.listdir(train_dir)])))
    # Step 1: write the per-document feature files; step 2: build dict.pkl from them.
    multi_process(split_text)
    get_dict()
    # Optional sanity check (disabled): reload the pickle and print one mapping.
#     with open(f'C:/Users/25405/Desktop/KG/data/prepare/dict.pkl', 'rb') as f:
#         map_dict = pickle.load(f)
#     print(map_dict['bound'])