In [2]:
# ! pip install paddlenlp

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple






In [1]:
import paddle
import numpy as np
from functools import partial

import paddle.nn as nn
import paddle.nn.functional as F
import paddlenlp as ppnlp
from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab

from utils import convert_example

from paddlenlp.datasets import load_dataset
def read(data_path):
    with open(data_path,'r',encoding='utf-8') as f:
        for line in f :
            l = line.strip('\n').split('\t')
            if len(l) != 2:
                print(len(l),line)
            # python里对字符串对象操作，一般是拷贝操作，原来的不变
            words ,label = line.strip('\n').split('\t')
            yield {'text':words,'label':label}

# lazy = True -> IterDataset ,不一次加载进内存 -> 访问不用下标，不用__getitem__ , for循环直接访问
# False -> MapDataset : 在绝大多数时候都可以满足要求。一般只有在数据集过于庞大无法一次性加载进内存的时候我们才考虑使用 IterDataset 
train_ds = load_dataset(read,data_path='train.txt',lazy=False)
dev_ds = load_dataset(read,data_path='dev.txt',lazy=False)
test_ds = load_dataset(read,data_path='test.txt',lazy=False)
print(test_ds[0])




  from .autonotebook import tqdm as notebook_tqdm


{'text': '楼面经理服务态度极差，等位和埋单都差，楼面小妹还挺好', 'label': '0'}


### MapDataset的加载方式

In [4]:
for i in range(3):
    print(train_ds[i])

{'text': '赢在心理，输在出品！杨枝太酸，三文鱼熟了，酥皮焗杏汁杂果可以换个名（九唔搭八）', 'label': '0'}
{'text': '服务一般，客人多，服务员少，但食品很不错', 'label': '1'}
{'text': '東坡肉竟然有好多毛，問佢地點解，佢地仲話係咁架\ue107\ue107\ue107\ue107\ue107\ue107\ue107冇天理，第一次食東坡肉有毛，波羅包就幾好食', 'label': '0'}


### IterDataset的加载方式

In [5]:
train_ds_iter = load_dataset(read,data_path='train.txt',lazy=True)
# 不用enumerate也可以
# for item in train_ds_iter:
for i,item in enumerate(train_ds_iter):
    if(i>3):
        break
    print(item)

{'text': '赢在心理，输在出品！杨枝太酸，三文鱼熟了，酥皮焗杏汁杂果可以换个名（九唔搭八）', 'label': '0'}
{'text': '服务一般，客人多，服务员少，但食品很不错', 'label': '1'}
{'text': '東坡肉竟然有好多毛，問佢地點解，佢地仲話係咁架\ue107\ue107\ue107\ue107\ue107\ue107\ue107冇天理，第一次食東坡肉有毛，波羅包就幾好食', 'label': '0'}
{'text': '父亲节去的，人很多，口味还可以上菜快！但是结账的时候，算错了没有打折，我也忘记拿清单了。说好打8折的，收银员没有打，人太多一时自己也没有想起。不知道收银员忘记，还是故意那钱露入自己钱包。。', 'label': '0'}


- 下载词汇表文件word_dict.txt，用于构造词-id映射关系。

In [31]:
# ! wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
# windows用这个没下载下来


'wget' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���


# Dataloader

In [2]:
# 加载词表
vocab = Vocab.load_vocabulary('./senta_word_dict.txt',unk_token = '[UNK]',pad_token='[PAD]')

# 使用词表初始化tokenizer
tokenizer = JiebaTokenizer(vocab)   

In [3]:
trans_function = partial(
    convert_example,
    tokenizer = tokenizer,
    is_test=False
)
print("="*30+'test_ds.map(trans_function)')
test_ds.map(trans_function)
print("="*30+'test_ds[0]')
test_ds[0]



(array([ 440695,  764333, 1035593,  371677, 1106339,  995733,  237834,
         891203,  258291, 1106339,  440695,  117037,  936761], dtype=int64),
 array(13, dtype=int64),
 array(0, dtype=int64))

In [5]:
input_ids=[656582, 967208, 318502, 1106339, 1, 693836, 1106328, 728300, 34934, 1106339, 677464, 1168226, 823066, 1106339, 706897, 1078813, 895713, 76982, 660347, 1, 179592, 1106335, 554600, 1, 1106336]
valid_length = np.array(len(input_ids), dtype='int64')
v = np.array(1)
input_ids = np.array(input_ids, dtype='int64')
valid_length,input_ids,v

(array(25, dtype=int64),
 array([ 656582,  967208,  318502, 1106339,       1,  693836, 1106328,
         728300,   34934, 1106339,  677464, 1168226,  823066, 1106339,
         706897, 1078813,  895713,   76982,  660347,       1,  179592,
        1106335,  554600,       1, 1106336], dtype=int64),
 array(1))

In [6]:
# 读入数据，生成mini-batches
def create_dataloader(dataset,
                      trans_function=None,
                      mode='train',
                      batch_size=1,
                      pad_token_id=0,
                      batchify_fn=None):
    # 下面在定义
    if trans_function:
        # 每个样本都应用trans_function里的函数
        dataset_map = dataset.map(trans_function)
    
    # return_list 数据是否以list形式返回
    # collate_fn  指定如何将样本列表组合为mini-batch数据。
    # 传给它参数需要是一个callable对象，需要实现对组建的batch的处理逻辑，并返回每个batch的数据。
    # 在这里传入的是`prepare_input`函数，对产生的数据进行pad操作，并返回实际长度等。
    dataloader = paddle.io.DataLoader(
        dataset_map, # 从这个参数加载数据集
        return_list = True,
        batch_size = batch_size,
        collate_fn=batchify_fn
    )

    return dataloader

#  python中的偏函数partial，把一个函数的某些参数固定住（也就是设置默认值），返回一个新的函数，调用这个新函数会更简单。
trans_function = partial(
    convert_example,
    tokenizer = tokenizer,
    is_test=False
)

# 将读入的数据batch化处理，便于模型batch化运算。
# batch中的每个句子将会padding到这个batch中的文本最大长度batch_max_seq_len。
# 当文本长度大于batch_max_seq时，将会截断到batch_max_seq_len；当文本长度小于batch_max_seq时，将会padding补齐到batch_max_seq_len.

# ：号前是参数，后面是表达式，返回表达式的返回值
# lambda a,b:a+b  
batchify_fn =  lambda samples,fn=Tuple(
    Pad(axis=0,pad_val=vocab['[PAD]']), # input_id
    Stack(dtype='int64'), # seq_len
    Stack(dtype='int64') # label
):[data for data in fn (samples)]

train_loader = create_dataloader(
    train_ds,
    trans_function=trans_function,
    batch_size=128,
    mode='train',
    batchify_fn=batchify_fn
)
dev_loader =create_dataloader(
    dev_ds,
    trans_function=trans_function,
    batch_size=128,
    mode='validation',
    batchify_fn=batchify_fn
)
test_loader =create_dataloader(
    test_ds,
    trans_function=trans_function,
    batch_size=128,
    mode='test',
    batchify_fn=batchify_fn
)

for i in train_loader:
    print(i)
    break


[Tensor(shape=[128, 369], dtype=int64, place=Place(gpu_pinned), stop_gradient=True,
       [[656582 , 967208 , 318502 , ..., 0      , 0      , 0      ],
        [724601 , 1250380, 1106339, ..., 0      , 0      , 0      ],
        [283829 , 250030 , 389886 , ..., 0      , 0      , 0      ],
        ...,
        [278377 , 364676 , 952595 , ..., 0      , 0      , 0      ],
        [137984 , 38435  , 399775 , ..., 0      , 0      , 0      ],
        [115700 , 364716 , 509081 , ..., 0      , 0      , 0      ]]), Tensor(shape=[128], dtype=int64, place=Place(gpu_pinned), stop_gradient=True,
       [25 , 12 , 33 , 57 , 21 , 9  , 65 , 22 , 10 , 9  , 9  , 75 , 9  , 13 ,
        244, 10 , 71 , 9  , 10 , 86 , 30 , 153, 15 , 15 , 21 , 30 , 23 , 98 ,
        13 , 59 , 17 , 18 , 17 , 69 , 116, 192, 16 , 13 , 28 , 204, 28 , 10 ,
        40 , 63 , 369, 12 , 10 , 58 , 15 , 11 , 18 , 32 , 130, 37 , 121, 271,
        35 , 9  , 22 , 40 , 12 , 13 , 175, 23 , 6  , 39 , 32 , 18 , 15 , 8  ,
        25 , 11 , 6

# 模型搭建
使用`LSTMencoder`搭建一个BiLSTM模型用于进行句子建模，得到句子的向量表示。

然后接一个线性变换层，完成二分类任务。

- `paddle.nn.Embedding`组建word-embedding层
- `ppnlp.seq2vec.LSTMEncoder`组建句子建模层
- `paddle.nn.Linear`构造二分类器


<p align="center">
<img src="https://ai-studio-static-online.cdn.bcebos.com/ecf309c20e5347399c55f1e067821daa088842fa46ad49be90de4933753cd3cf" width = "800" height = "450"  hspace='10'/> <br />
</p><br><center>图1：seq2vec示意图</center></br>

* 除LSTM外，`seq2vec`还提供了许多语义表征方法，详细可参考：[seq2vec介绍]

In [8]:
class LSTMModel(nn.Layer):
    def __init__(self,
                 vocab_size,
                 num_classes,
                 emb_dim=128,
                 padding_idx=0,
                 lstm_hidden_size=198,
                 direction='forward',
                 lstm_layers=1,
                 dropout_rate=0,
                 pooling_type=None,
                 fc_hidden_size=96):
        super().__init__()

        # 首先将输入word id 查表后映射成 word embedding
        self.embedder = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=padding_idx)

        # 将word embedding经过LSTMEncoder变换到文本语义表征空间中
        self.lstm_encoder = ppnlp.seq2vec.LSTMEncoder(
            emb_dim,
            lstm_hidden_size,
            num_layers=lstm_layers,
            direction=direction,
            dropout=dropout_rate,
            pooling_type=pooling_type)

        # LSTMEncoder.get_output_dim()方法可以获取经过encoder之后的文本表示hidden_size
        self.fc = nn.Linear(self.lstm_encoder.get_output_dim(), fc_hidden_size)

        # 最后的分类器
        self.output_layer = nn.Linear(fc_hidden_size, num_classes)

    def forward(self, text, seq_len):
        # text shape: (batch_size, num_tokens)
        # print('input :', text.shape)
        
        # Shape: (batch_size, num_tokens, embedding_dim)
        embedded_text = self.embedder(text)
        # print('after word-embeding:', embedded_text.shape)

        # Shape: (batch_size, num_tokens, num_directions*lstm_hidden_size)
        # num_directions = 2 if direction is 'bidirectional' else 1
        text_repr = self.lstm_encoder(embedded_text, sequence_length=seq_len)
        # print('after lstm:', text_repr.shape)


        # Shape: (batch_size, fc_hidden_size)
        fc_out = paddle.tanh(self.fc(text_repr))
        # print('after Linear classifier:', fc_out.shape)

        # Shape: (batch_size, num_classes)
        logits = self.output_layer(fc_out)
        # print('output:', logits.shape)
        
        # probs 分类概率值
        probs = F.softmax(logits, axis=-1)
        # print('output probability:', probs.shape)
        return probs

model= LSTMModel(
        len(vocab),
        2,
        direction='bidirectional',
        padding_idx=vocab['[PAD]'])
model = paddle.Model(model)

# 模型配置和训练

In [9]:
optimizer = paddle.optimizer.Adam(
        parameters=model.parameters(), learning_rate=5e-5)

loss = paddle.nn.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

model.prepare(optimizer, loss, metric)

- 设置visualdl路径

In [10]:
# 设置visualdl路径
log_dir = './visualdl'
callback = paddle.callbacks.VisualDL(log_dir=log_dir)

## 启动训练

In [11]:
model.fit(train_loader, dev_loader, epochs=10, save_dir='./checkpoints', save_freq=5, callbacks=callback)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10
step  10/125 - loss: 0.6931 - acc: 0.4648 - 356ms/step
step  20/125 - loss: 0.6931 - acc: 0.4773 - 234ms/step
step  30/125 - loss: 0.6925 - acc: 0.4974 - 193ms/step
step  40/125 - loss: 0.6916 - acc: 0.5039 - 171ms/step
step  50/125 - loss: 0.6893 - acc: 0.5091 - 160ms/step
step  60/125 - loss: 0.6939 - acc: 0.5091 - 151ms/step
step  70/125 - loss: 0.6927 - acc: 0.5104 - 145ms/step
step  80/125 - loss: 0.6890 - acc: 0.5092 - 141ms/step
step  90/125 - loss: 0.6923 - acc: 0.5092 - 138ms/step
step 100/125 - loss: 0.6874 - acc: 0.5106 - 135ms/step
step 110/125 - loss: 0.6845 - acc: 0.5105 - 133ms/step
step 120/125 - loss: 0.6869 - acc: 0.5096 - 131ms/step
step 125/125 - loss: 0.6844 - acc: 0.5120 - 128ms/step
save checkpoint at e:\Document\CodeSpace\Study\DeepL\Process_data\checkpoints\0
Eval begin...
step 10/84 - loss: 0.6854 - acc: 0.6367 - 96ms/step
step 20/84 - loss:

In [12]:
results = model.evaluate(dev_loader)
print("Finally test acc: %.5f" % results['acc'])

Eval begin...
step 10/84 - loss: 0.3611 - acc: 0.9586 - 98ms/step
step 20/84 - loss: 0.3526 - acc: 0.9629 - 85ms/step
step 30/84 - loss: 0.3572 - acc: 0.9635 - 81ms/step
step 40/84 - loss: 0.3378 - acc: 0.9652 - 78ms/step
step 50/84 - loss: 0.3616 - acc: 0.9652 - 77ms/step
step 60/84 - loss: 0.3309 - acc: 0.9660 - 76ms/step
step 70/84 - loss: 0.3587 - acc: 0.9653 - 75ms/step
step 80/84 - loss: 0.3739 - acc: 0.9645 - 73ms/step
step 84/84 - loss: 0.3276 - acc: 0.9648 - 70ms/step
Eval samples: 10644
Finally test acc: 0.96477


# 预测

In [13]:
print(type(test_ds))
label_map = {0: 'negative', 1: 'positive'}
results = model.predict(test_loader, batch_size=128)[0]
predictions = []

for batch_probs in results:
    # 映射分类label
    idx = np.argmax(batch_probs, axis=-1)
    idx = idx.tolist()
    labels = [label_map[i] for i in idx]
    predictions.extend(labels)

<class 'paddlenlp.datasets.dataset.MapDataset'>
Predict begin...
Predict samples: 5353


In [14]:
# 看看预测数据前5个样例分类结果
for i in test_ds:
    print(i)
    break
    
for idx, data in enumerate(test_ds):
    if idx < 10:
        print(type(data))
        print('Data: {} \t Label: {}'.format(data[0], predictions[idx]))

(array([ 440695,  764333, 1035593,  371677, 1106339,  995733,  237834,
        891203,  258291, 1106339,  440695,  117037,  936761], dtype=int64), array(13, dtype=int64), array(0, dtype=int64))
<class 'tuple'>
Data: [ 440695  764333 1035593  371677 1106339  995733  237834  891203  258291
 1106339  440695  117037  936761] 	 Label: negative
<class 'tuple'>
Data: [ 471791  936825 1022922  761432  891564 1057229  549892  859353 1106326
  653811  176187  877695  958129  173188  986608 1106339  781255  830165
  213378  535515   36026 1106326 1106328   25529  749968 1106339  147848
  830171  489131  958129 1106339  479899  930707  173188  399212       1
 1222901  508478  823066 1106326 1106328  651025  869365       1  681075
 1106339  453143  830172  790286 1051917  173188  681075  173401  412947
  747344 1106326 1106328] 	 Label: negative
<class 'tuple'>
Data: [ 451938  696658  748698  302748  936106  308649  157793  718272  660347
   40882   86562  510099 1106339 1050713  321211   69882  68