In [115]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pickle
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

### 数据预处理

In [2]:
def process(path):
    with open(path, 'r') as f:
        lines = f.readlines()
        rlist = []
        for line in lines:
            linelist = line.split()
            source = []
            tag = []
            for w in linelist:
                w = w.strip()
                if len(w)==1:
                    source.append(w)
                    tag.append("S")
                elif len(w) > 1:
                    source.append(w[0])
                    tag.append("B")
                    for i in range(1, len(w)):
                        if i==len(w)-1:
                            tag.append("E")
                        else:
                            tag.append("M")
                        source.append(w[i])
            if len(source) > 0 and len(tag) > 0:
                rlist.append((source, tag))
        return rlist

In [3]:
training_data = process("/home/yinkai/Downloads/pku_training.utf8")

### 构造脚标和字符互相转换的字典

In [4]:
def buildVocab(data):
    UNKNOW = "UNK"
    PAD = "PAD"
    character2index = { UNKNOW:1, PAD:0}
    idx = len(character2index)
    for source, tag in data:
        for c in source:
            if character2index.get(c) is None:
                character2index[c] = idx
                idx += 1
    return character2index

def get_index2character(character2index):
    index2character = {}
    for c, idx in character2index.items():
        index2character[idx] = c
    return index2character

### define parameter

In [109]:
batchSize = 128
n_samples = len(training_data)
embedding_dim = 50
hidden_dim = 30
epoch = 200
learning_rate = 0.01
weight_decay = 1e-5


character2index = buildVocab(training_data)
index2character = get_index2character(character2index)

tag2indx = {"PAD":0, "B":1, "E":2, "S":3, "M":4 }
index2tag = {0:"PAD", 1:"B", 2:"E", 3:"S", 4:"M"}

n_character = len(character2index)
n_tags = len(tag2indx)
padding_idx = character2index["PAD"]

### 随机获取一个Batch的数据

In [6]:
np.random.seed(100)
def getBatch(training_data, batchSize, character2index, tag2indx):
    '''
    获取一个batch数据, 并按照长度逆序排序, rnn求解需要使用
    '''
    n_samples = len(training_data)
    indexs = np.random.choice(n_samples, batchSize)
    
    batch = [training_data[i] for i in indexs]
    lens  = list(map(lambda x:len(x[0]), batch))
    sort = np.argsort(np.array(lens)) ### 从小到大的索引
    invsort = sort[::-1]
    ### 当前batch最大串长度
    maxlen = max(lens)
    batchS = []
    batchT = []
    
    newidx = [indexs[i] for i in invsort]
    newlens = [lens[i] for i in invsort] ### 从大到小的长度
    
    
    for idx in newidx:
        s, t = training_data[idx]
        idxs = []
        idxt = []
        for i in range(len(s)):
            idxs.append(character2index[s[i]])
            idxt.append(tag2indx[t[i]])
        ### padding
        idxs += [character2index["PAD"]] * (maxlen - len(idxs))
        idxt += [tag2indx["PAD"]] * (maxlen - len(idxt))
        
        batchS.append(np.array(idxs))
        batchT.append(np.array(idxt))
    return np.array(batchS), np.array(batchT), newlens

In [7]:
batchSource, batchTag, length = getBatch(training_data, batchSize, character2index, tag2indx)

### 网络结构定义

In [93]:
class LSTMTag(nn.Module):
    def __init__(self, args):
        super(LSTMTag, self).__init__()
        
        self.embedding_dim = args["embedding_dim"]
        self.hidden_dim = args["hidden_dim"]
        self.num_embeddings = args["num_embeddings"]
        self.n_tags = args["n_tags"]
        self.padding_idx = args["padding_idx"]
        
        
        self.embedding  = nn.Embedding(embedding_dim=self.embedding_dim, \
                                       num_embeddings=self.num_embeddings, padding_idx=self.padding_idx)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim, batch_first=True)
        
        self.linear = nn.Linear(self.hidden_dim, self.n_tags)
        
        self.softmax = nn.Softmax(dim=self.n_tags)
        
    def forward(self, x, lengths):
        embedded  = self.embedding(x) ### [batch, maxlen, embedding_dim]
        
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True)
        outputs, hidden =  self.lstm(packed)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) ###[batch, maxlen, hidden_dim]
        
        
        out = self.linear(outputs)
        final_out = self.softmax(out)
        return final_out, output_lengths    

### define network and training

In [116]:
args = {}
args["embedding_dim"] = embedding_dim
args["hidden_dim"] = hidden_dim
args["num_embeddings"] = n_character
args["n_tags"] = n_tags
args["padding_idx"] = padding_idx

lstm_tag = LSTMTag(args=args)

### ignore index指定padding项的index()
lossfunc = nn.CrossEntropyLoss(ignore_index=0)

optimizer = torch.optim.Adam(params=lstm_tag.parameters(), lr=learning_rate, weight_decay=weight_decay)

for i in tqdm(range(epoch)):
    batchSource, batchTag, length = getBatch(training_data, batchSize, character2index, tag2indx)
    outputs, lens = lstm_tag(torch.LongTensor(batchSource), torch.LongTensor(length))
    out = outputs.view(-1, n_tags)
    tag = torch.from_numpy(batchTag)
    tag = tag.long()
    out = outputs.view(-1, n_tags)
    tag = tag.view(-1)
    
    optimizer.zero_grad()
    loss = lossfunc(out, tag)
    if i % 10 == 0:
        print(float(loss))
    loss.backward()
    
    optimizer.step()

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

1.6234896183013916
1.20063054561615
0.9812573194503784
0.8114786744117737
0.7057069540023804
0.6362357139587402
0.5938759446144104
0.5901710987091064
0.5532979965209961
0.5346288681030273
0.5299627184867859
0.5235915184020996
0.5386320948600769
0.5278980135917664
0.5018707513809204
0.48361441493034363
0.5063832402229309
0.5186933279037476
0.4943026006221771
0.4400085210800171



### 保存训练好的模型参数

In [146]:
### 保存模型
torch.save(lstm_tag.state_dict(), "./lstm_tagger.pkl")
### 加载模型

lstm_tagger = LSTMTag(args=args)
lstm_tagger.load_state_dict(torch.load("./lstm_tagger.pkl"))

### 对单个句子进行tagging

In [143]:
def tagging(tagger, sentence, character2index, index2tag):
    """
    parameter:
        tagger:标准器(网络)
        sentence:中文句子
    """
    x = []
    for c in sentence:
        if character2index[c] is None:
            x.append(character2index["UNK"])
        else:
            x.append(character2index[c])
    x = torch.LongTensor(x)
    outputs, _ = tagger(x.unsqueeze(0), torch.LongTensor([len(x)]) ) ### bach=1, len, n_tags
    taglist = outputs[0].max(1)[1]
    tags = []
    for tidx in taglist:
        tags.append(index2tag[int(tidx)])
    return tags

In [148]:
tagging(lstm_tagger, "北京欢迎你", character2index, index2tag)

['B', 'E', 'B', 'E', 'S']