In [1]:
import codecs
import pandas as pd
import numpy as np
import re
import collections
import sklearn

In [2]:
def flatten(x): # 展平
    result = []
    for el in x:
        if isinstance(x, collections.Iterable) and not isinstance(el, str):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result


print(flatten(["junk", ["nested stuff"], [], [[]]]))

['junk', 'nested stuff']


In [3]:
def origin2tag():
    input_data = codecs.open('./origindata.txt', 'r', 'utf-8')
    output_data = codecs.open('./wordtag.txt', 'w', 'utf-8')
    for line in input_data.readlines():
        line = line.strip()
        i = 0
        while i < len(line):
            if line[i] == '{':
                i += 2
                temp = ""
                while line[i] != '}':
                    temp += line[i]
                    i += 1
                i += 2
                word = temp.split(':')
                sen = word[1]
                output_data.write(sen[0]+"/B_"+word[0]+" ")
                for j in sen[1:len(sen)-1]:
                    output_data.write(j+"/M_"+word[0]+" ")
                output_data.write(sen[-1]+"/E_"+word[0]+" ")
            else:
                output_data.write(line[i]+"/O ")
                i += 1
        output_data.write('\n')
    input_data.close()
    output_data.close()

In [4]:
def tagsplit():
    with open('./wordtag.txt', 'rb') as inp:
        texts = inp.read().decode('utf-8')
    sentences = re.split('[，。！？、‘’“”（）]/[O]', texts)
    output_data = codecs.open('./wordtagsplit.txt', 'w', 'utf-8')
    for sentence in sentences:
        if sentence != " ":
            output_data.write(sentence.strip()+'\n')
    output_data.close()

In [8]:
def data2pkl():
    datas = list() # 字列表
    labels = list() # 字所对应的标签列表
    linedata = list() # 每句话的字列表
    linelabel = list() # 每句话的字所对应的标签列表
    tags = set() # 标签词典（所有出现过的标签）

    input_data = codecs.open('./wordtagsplit.txt', 'r', 'utf-8')
    for line in input_data.readlines():
        line = line.split()
        linedata = []
        linelabel = []
        numNotO = 0
        for word in line:
            word = word.split('/')
            linedata.append(word[0])
            linelabel.append(word[1])
            tags.add(word[1])
            if word[1] != 'O':
                numNotO += 1
        if numNotO != 0:
            datas.append(linedata)
            labels.append(linelabel)

    input_data.close()
    print(len(datas), tags)
    print(len(labels))
    all_words = flatten(datas)
    sr_allwords = pd.Series(all_words)  # 每个汉字出现的排序
    sr_allwords = sr_allwords.value_counts()  # 统计每个字出现的频次
    # print(sr_allwords)
    set_words = sr_allwords.index
    set_ids = range(1, len(set_words)+1)

    tags = [i for i in tags]
    tag_ids = range(len(tags)) # 标签的id
    word2id = pd.Series(set_ids, index=set_words) # word2id词典
    id2word = pd.Series(set_words, index=set_ids)
    tag2id = pd.Series(tag_ids, index=tags)
    id2tag = pd.Series(tags, index=tag_ids)

    word2id["unknow"] = len(word2id)+1
    # print(word2id)
    max_len = 60 # 设置一句话的最大长度

    def X_padding(words): # 将一句话用词向量表示
        ids = list(word2id[words])
        if len(ids) >= max_len:
            return ids[:max_len]
        ids.extend([0]*(max_len-len(ids)))
        return ids

    def y_padding(tags):
        ids = list(tag2id[tags])
        if len(ids) >= max_len:
            return ids[:max_len]
        ids.extend([0]*(max_len-len(ids)))
        return ids
    df_data = pd.DataFrame(
        {'words': datas, 'tags': labels}, index=range(len(datas)))
    df_data['x'] = df_data['words'].apply(X_padding)
    df_data['y'] = df_data['tags'].apply(y_padding)
    x = np.asarray(list(df_data['x'].values))
    y = np.asarray(list(df_data['y'].values))

    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=43)
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_train, y_train,  test_size=0.2, random_state=43)

    import pickle
    import os
    with open('../Bosondata.pkl', 'wb') as outp:
        pickle.dump(word2id, outp)
        pickle.dump(id2word, outp)
        pickle.dump(tag2id, outp)
        pickle.dump(id2tag, outp)
        pickle.dump(x_train, outp)
        pickle.dump(y_train, outp)
        pickle.dump(x_test, outp)
        pickle.dump(y_test, outp)
        pickle.dump(x_valid, outp)
        pickle.dump(y_valid, outp)
    print('** Finished saving the data.')

In [6]:
origin2tag()

In [7]:
tagsplit()

In [22]:
data2pkl()

16753 {'', 'E_time', 'M_company_name', 'E_org_name', 'M_time', 'O', 'B_company_name', 'E_product_name', 'M_location', 'E_company_name', 'E_person_name', 'B_time', 'B_person_name', 'M_product_name', 'M_person_name', 'B_product_name', 'B_org_name', 'M_org_name', 'B_location', 'E_location'}
16753
的            1
1            2
0            3
             4
2            5
在            6
中            7
国            8
年            9
一           10
了           11
日           12
是           13
月           14
大           15
为           16
人           17
3           18
上           19
有           20
5           21
行           22
n           23
市           24
发           25
到           26
时           27
来           28
和           29
会           30
          ... 
萝         3406
視         3407
酣         3408
朵         3409
炒         3410
嘟         3411
仆         3412
榨         3413
掺         3414
幂         3415
伽         3416
濑         3417
慕         3418
睛         3419
爽         3420
蛮         3421


In [43]:
word2id["unknow"] = len(word2id)+1
#print(word2id)
max_len = 60
def X_padding(words):
    ids = list(word2id[words])
    if len(ids) >= max_len:  
        return ids[:max_len]
    ids.extend([0]*(max_len-len(ids))) 
    return ids

def y_padding(tags):
    ids = list(tag2id[tags])
    if len(ids) >= max_len: 
        return ids[:max_len]
    ids.extend([0]*(max_len-len(ids))) 
    return ids
df_data = pd.DataFrame({'words': datas, 'tags': labels}, index=range(len(datas)))
df_data['x'] = df_data['words'].apply(X_padding)
df_data['y'] = df_data['tags'].apply(y_padding)
x = np.asarray(list(df_data['x'].values))
y = np.asarray(list(df_data['y'].values))

In [53]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=43)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train,  test_size=0.2, random_state=43)

In [65]:
fr = open('./test.pkl','rb') 
data1=pickle.load(fr)
data2 = pickle.load(fr) 
data3 = pickle.load(fr) 
data4 = pickle.load(fr) 
data5 = pickle.load(fr) 
data6 = pickle.load(fr) 
data7 = pickle.load(fr) 
data8 = pickle.load(fr) 
data9 = pickle.load(fr) 
data10 = pickle.load(fr) 