In [None]:
import os
import pandas as pd
import pickle
from collections import Counter
import jieba.posseg as psg    # posseg可标注词语的词性
from cnradical import Radical, RunOption    # cnradical工具包可获取一个字的偏旁部首和拼音
import shutil
from random import shuffle

In [None]:
# Directory that holds the training corpus as paired '<name>.txt' / '<name>.ann' files.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
train_dir = 'C:/Users/25405/Desktop/KG/train'

### BIO标注及特征提取

In [None]:
# BIO tagging plus per-character feature extraction.
# Every character of every training document becomes one output row:
#     char  word-boundary(B/M/E/S)  POS-flag  radical  pinyin  BIO-label

# The lookup helpers do not depend on the document, so build them once.
radical = Radical(RunOption.Radical)    # radical of a character
pinyin = Radical(RunOption.Pinyin)    # pinyin of a character

# Each document is a '<name>.txt' / '<name>.ann' pair: de-duplicate the stems
# and sort them so the row order of the output is identical on every run
# (set iteration order is not stable between interpreter sessions, and the
# unshuffled K-fold split performed later depends on row order).
files = sorted(set(file.split('.')[0] for file in os.listdir(train_dir)))

# The output is opened exactly once and closed by the context manager, even on
# error. 'w' (not 'a') keeps the cell idempotent: re-running it no longer
# appends a second copy of the whole corpus to an existing train3.txt.
with open('C:/Users/25405/Desktop/KG/ttt/train3.txt', 'w', encoding = 'utf-8') as ff:
    for file in files:
        path = os.path.join(train_dir, file + '.txt')
        tag_path = os.path.join(train_dir, file + '.ann')

        with open(path, 'r', encoding = 'utf8') as f:
            texts = f.readlines()

        # Flat list of every character of the document (line breaks included);
        # all feature lists below are aligned with this list index by index.
        texts_list = [x for s in texts for x in s]
        n_chars = len(texts_list)

        # Word-boundary and part-of-speech features from jieba's tagger.
        word_bounds = ['M'] * n_chars
        word_flags = []
        for text in texts:
            for word, flag in psg.cut(text):
                start = len(word_flags)
                word_flags += [flag] * len(word)
                if len(word) == 1:
                    word_bounds[start] = 'S'
                else:
                    word_bounds[start] = 'B'
                    word_bounds[len(word_flags) - 1] = 'E'

        # Fail loudly instead of silently writing misaligned feature columns.
        if len(word_flags) != n_chars:
            raise ValueError(
                'segmentation of %s covers %d characters, expected %d'
                % (path, len(word_flags), n_chars))

        # Radical / pinyin features; characters without an entry get 'UNK'.
        word_radicals = []
        word_pinyins = []
        for x in texts_list:
            char_radical = radical.trans_ch(x)
            char_pinyin = pinyin.trans_ch(x)
            word_radicals.append('UNK' if char_radical is None else char_radical)
            word_pinyins.append('UNK' if char_pinyin is None else char_pinyin)

        # BIO labels from the tab-separated .ann file. The second field is
        # '<class> <start> ... <end>'; the first and last numeric tokens are
        # used as the span, and characters start .. end-1 are labelled.
        # The file is parsed by hand so that quote characters inside the
        # entity text cannot confuse a CSV parser and an empty .ann file
        # simply yields no entities.
        tag_list = ['O'] * n_chars
        with open(tag_path, 'r', encoding = 'utf8') as f2:
            for ann_line in f2:
                fields = ann_line.rstrip('\r\n').split('\t')
                if len(fields) < 2:    # blank or malformed line
                    continue
                tag_item = fields[1].split(' ')
                clas, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
                tag_list[start] = 'B-' + clas
                for j in range(start + 1, end):
                    tag_list[j] = 'I-' + clas

        for row in zip(texts_list, word_bounds, word_flags, word_radicals, word_pinyins, tag_list):
            ff.write(' '.join(row) + '\n')

### K折交叉验证

In [None]:
import numpy as np
from sklearn.model_selection import KFold

In [None]:
# Build 5 train / validation / test splits from the rows (lines) of train3.txt.
# Each outer fold keeps 4/5 of the rows for training; the held-out 1/5 is cut
# into a validation part followed by a test part.
with open('C:/Users/25405/Desktop/KG/ttt/train3.txt', 'r', encoding = 'utf-8') as ff3:
    texts = ff3.readlines()

ar_texts = np.array(texts)
New_texts = KFold(n_splits = 5)
split_vt = KFold(n_splits = 2)

for t, (train_index, rest_index) in enumerate(New_texts.split(ar_texts), start = 1):
    train_texts, rest_texts = ar_texts[train_index], ar_texts[rest_index]

    # Use the last of the two 2-fold splits: validation = leading part of the
    # held-out rows, test = trailing part.
    ver_index, test_index = list(split_vt.split(rest_texts))[-1]

    # These indices are relative to rest_texts, so they must index rest_texts.
    # Indexing ar_texts with them always picked rows from the start of the
    # corpus, which overlap the training rows for every fold but the first.
    ver_texts, test_texts = rest_texts[ver_index], rest_texts[test_index]

    # NOTE(review): the validation file is named 'var_<t>.txt' here, while the
    # later BIO cells read 'ver_...' names from tvt/. The name is left
    # unchanged so existing consumers of 'var_<t>.txt' keep working.
    with open('C:/Users/25405/Desktop/KG/ttt/train_' + str(t) + '.txt', 'w', encoding = 'utf-8') as fff:
        fff.writelines(train_texts)
    with open('C:/Users/25405/Desktop/KG/ttt/var_' + str(t) + '.txt', 'w', encoding = 'utf-8') as fff2:
        fff2.writelines(ver_texts)
    with open('C:/Users/25405/Desktop/KG/ttt/test_' + str(t) + '.txt', 'w', encoding = 'utf-8') as fff3:
        fff3.writelines(test_texts)

### 分句，取消空格对应的特征

In [None]:
import codecs

In [None]:
# Sentence splitting: copy the feature rows of every fold into tvt/, dropping
# rows whose character column is missing (the character was whitespace, so the
# row has fewer than six fields) and inserting a separator row after each "。".
#
# All three splits are processed because the later BIO cells read
# tvt/train_..., tvt/ver_... and tvt/test_... files. The validation split is
# read from 'var_<t>.txt' (the name the K-fold cell writes) and written under
# the 'ver_' prefix the later cells expect.
split_prefixes = (('train_', 'train_'), ('var_', 'ver_'), ('test_', 'test_'))

# Separator row: six single-space columns joined by spaces, i.e. a
# whitespace-only line that marks a sentence boundary.
sentence_break = ' '.join([" "] * 6) + '\n'

for t in range(1, 6):
    for src_prefix, dst_prefix in split_prefixes:
        src_path = 'C:/Users/25405/Desktop/KG/ttt/' + src_prefix + str(t) + '.txt'
        dst_path = 'C:/Users/25405/Desktop/KG/ttt/tvt/' + dst_prefix + 'chinese_no_space_' + str(t) + '.txt'

        # Both files are opened once and closed by the context manager; 'w'
        # keeps the cell idempotent (re-running does not append a second copy).
        with open(src_path, 'r', encoding = 'utf-8') as f, \
                open(dst_path, 'w', encoding = 'utf-8') as f1:
            for line in f:
                fields = line.split()
                # A complete row is: word bound flag radical pinyin label.
                # Blank lines and rows without a word column are skipped
                # instead of raising IndexError.
                if len(fields) < 6:
                    continue
                word, bound, flag, radical, pinyin, label = fields[:6]
                f1.write(word+' '+bound+' '+flag+' '+radical+' '+pinyin+' '+label+ '\n')
                if word == "。":
                    f1.write(sentence_break)

### 只保留BIO特征

In [None]:
import codecs

# Keep only the character and its BIO label (columns 0 and 5) of the training
# rows; whitespace-only rows are sentence separators and are carried over as a
# whitespace-only row.
for t in range(1, 6):
    src_path = 'C:/Users/25405/Desktop/KG/ttt/tvt/train_chinese_no_space_' + str(t) + '.txt'
    dst_path = 'C:/Users/25405/Desktop/KG/ttt/tvt_BIO/train_chinese_no_space_' + str(t) + '.txt'

    # Open input and output once per fold; the context manager closes both
    # even on error, and an empty input no longer ends in a NameError.
    # 'w' keeps the cell idempotent: re-running does not append a second copy.
    with open(src_path, 'r', encoding = 'utf-8') as f, \
            open(dst_path, 'w', encoding = 'utf-8') as f1:
        for line in f:
            fields = line.split()
            if fields:
                f1.write(fields[0] + ' ' + fields[5] + '\n')
            else:
                # Sentence separator: " " + ' ' + " ", i.e. three spaces.
                f1.write(" " + ' ' + " " + '\n')

In [None]:
import codecs

# Keep only the character and its BIO label (columns 0 and 5) of the
# validation rows; whitespace-only rows are sentence separators and are
# carried over as a whitespace-only row.
for t in range(1, 6):
    src_path = 'C:/Users/25405/Desktop/KG/ttt/tvt/ver_chinese_no_space_' + str(t) + '.txt'
    dst_path = 'C:/Users/25405/Desktop/KG/ttt/tvt_BIO/ver_chinese_no_space_' + str(t) + '.txt'

    # Open input and output once per fold; the context manager closes both
    # even on error, and an empty input no longer ends in a NameError.
    # 'w' keeps the cell idempotent: re-running does not append a second copy.
    with open(src_path, 'r', encoding = 'utf-8') as f, \
            open(dst_path, 'w', encoding = 'utf-8') as f1:
        for line in f:
            fields = line.split()
            if fields:
                f1.write(fields[0] + ' ' + fields[5] + '\n')
            else:
                # Sentence separator: " " + ' ' + " ", i.e. three spaces.
                f1.write(" " + ' ' + " " + '\n')

In [None]:
# Keep only the character and its BIO label (columns 0 and 5) of the test
# rows; whitespace-only rows are sentence separators and are carried over as a
# whitespace-only row.
for t in range(1, 6):
    src_path = 'C:/Users/25405/Desktop/KG/ttt/tvt/test_chinese_no_space_' + str(t) + '.txt'
    dst_path = 'C:/Users/25405/Desktop/KG/ttt/tvt_BIO/test_chinese_no_space_' + str(t) + '.txt'

    # Open input and output once per fold; the context manager closes both
    # even on error, and an empty input no longer ends in a NameError.
    # 'w' keeps the cell idempotent: re-running does not append a second copy.
    with open(src_path, 'r', encoding = 'utf-8') as f, \
            open(dst_path, 'w', encoding = 'utf-8') as f1:
        for line in f:
            fields = line.split()    # split() with no argument also discards the trailing newline
            if fields:
                f1.write(fields[0] + ' ' + fields[5] + '\n')
            else:
                # Sentence separator: " " + ' ' + " ", i.e. three spaces.
                f1.write(" " + ' ' + " " + '\n')