In [161]:
import ast
import random
from functools import partial

import numpy as np
import paddle
import paddle.io
from paddle.io import Dataset

In [162]:
# Default length (in ids) that every sample is truncated or padded to when encoded.
max_seq_len = 85

In [163]:
# Input file locations (relative paths).
data_path = 'data/news_classify_data.txt'  # sample file passed to ClassifierDataset
dic_path = 'data/dict_txt.txt'  # dictionary file whose contents become label2id

In [164]:
# Load the dictionary file. Despite its name, label2id is the vocabulary: the
# Tokenizer looks up individual characters of the text in it to get their ids.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code that happens to be embedded in the data file.
with open(dic_path, 'r', encoding='utf-8') as f:
    label2id = ast.literal_eval(f.read())
# Reverse mapping: id -> dictionary entry.
id2label = {wordid: label for label, wordid in label2id.items()}

In [189]:
# Category names; the classifier's output layer gets one unit per entry (len(names)).
# NOTE(review): presumably names[i] is the category for label id i in the data file — confirm the order.
names = [ '文化', '娱乐', '体育', '财经','房产', '汽车', '教育', '科技', '国际', '证券']

In [166]:
# Character-level tokenizer
class Tokenizer:
    """Translate text to id sequences and back, one character per id.

    ``label2id`` maps characters to integer ids; the reverse mapping is
    derived from it once, at construction time.
    """

    def __init__(self,label2id) -> None:
        self.label2id = label2id
        self.id2label = {idx: token for token, idx in label2id.items()}

    def encode(self,text,max_len):
        """Return the ids of ``text`` truncated or right-padded to ``max_len``.

        Characters missing from the dictionary are encoded as id 0; padding
        uses ``pad_id``.
        """
        ids = [self.label2id.get(ch, 0) for ch in text][:max_len]
        missing = max_len - len(ids)
        if missing > 0:
            # pad_id is looked up only when padding is actually required.
            ids.extend(self.pad_id for _ in range(missing))
        return ids

    def decode(self,encode):
        """Join the characters for the given ids; unknown ids become 'u'."""
        return ''.join(self.id2label.get(idx, 'u') for idx in encode)

    @property
    def voc_size(self):
        """Number of entries in the dictionary."""
        return len(self.label2id)

    @property
    def pad_id(self):
        """Id stored under the '<pad>' dictionary key."""
        return self.label2id['<pad>']

In [167]:
# Sample conversion
def convert_sample(data,tokenizer,is_predict = False,max_len = max_seq_len):
    """Turn one raw sample into model-ready numpy values.

    Args:
        data: mapping with a 'text' entry and, unless ``is_predict`` is true,
            a 'label' entry convertible with ``int()``.
        tokenizer: object whose ``encode(text, max_len)`` returns a list of ids.
        is_predict: when true, only the encoded text is returned.
        max_len: length passed through to ``tokenizer.encode``.

    Returns:
        ``(ids, label)`` — an int64 array and an ``np.int64`` — when
        ``is_predict`` is false; otherwise just the int64 id array.
    """
    # Cast in both modes so prediction inputs have the same dtype as training inputs.
    ids = np.array(tokenizer.encode(data['text'], max_len)).astype(np.int64)
    if is_predict:
        # Prediction samples carry no label, so 'label' must not be read here.
        return ids
    return ids, np.int64(int(data['label']))

In [168]:
# Tokenizer built from the loaded dictionary.
tokenizer = Tokenizer(label2id)
# Training-mode converter with tokenizer and length pre-bound; ClassifierDataset.load_data calls it on each sample dict.
sample_trans = partial(convert_sample,tokenizer = tokenizer,is_predict = False,max_len  = max_seq_len)

In [169]:
class ClassifierDataset(Dataset):
    """In-memory dataset of converted samples read from a '_!_'-delimited text file.

    Each non-blank line must hold at least four '_!_'-separated fields: the
    second is the label and everything from the fourth field on is the text.
    All samples are converted once, at construction time, with the
    module-level ``sample_trans``.
    """

    def load_data(self,data_path):
        """Read ``data_path`` and return the list of converted samples.

        Raises:
            ValueError: if a non-blank line has fewer than four fields.
        """
        samples = []
        with open(data_path,'r',encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline at end of file) carry no sample.
                    continue
                # maxsplit=3 keeps any '_!_' that occurs inside the text itself.
                fields = line.split('_!_', 3)
                if len(fields) != 4:
                    raise ValueError(
                        f"{data_path}:{line_no}: expected 4 '_!_'-separated fields, got {len(fields)}"
                    )
                _, label, _, text = fields
                samples.append(sample_trans({'text': text, 'label': label}))
        return samples

    def __init__(self,data_path):
        self.dataset = self.load_data(data_path)

    def __len__(self):
        """Number of loaded samples."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the converted sample stored at position ``idx``."""
        return self.dataset[idx]

In [170]:
# Reads and converts every sample in data_path at construction time.
train_set = ClassifierDataset(data_path)

In [171]:
def data_loader(dataset,batch_size = 1,mode = 'train'):
    """Wrap ``dataset`` in a ``paddle.io.DataLoader`` that yields batches.

    Args:
        dataset: dataset handed to both the batch sampler and the loader.
        batch_size: number of samples per batch.
        mode: 'train' shuffles the samples; any other value keeps their order.

    Returns:
        A ``paddle.io.DataLoader`` created with ``return_list=True``.
    """
    # The sampler is configured the same way in every mode; only shuffling differs.
    batch_sampler = paddle.io.BatchSampler(
        dataset=dataset,batch_size=batch_size,shuffle=(mode == 'train')
    )
    # Returned directly so no local name shadows the function itself.
    return paddle.io.DataLoader(dataset,batch_sampler=batch_sampler,return_list=True)

# 定义网络结构

In [172]:
import paddle.nn

In [173]:
# Network hyperparameters
voc_size = tokenizer.voc_size # number of dictionary entries; used as the embedding table's row count
embed_size = 128 # width of each embedding vector


In [174]:
class TextClassifier(paddle.nn.Layer):
    """Two-branch 1-D convolutional classifier over sequences of token ids.

    Both branches read the same embedding output; their pooled results are
    concatenated, flattened and fed to one linear output layer.

    Args:
        voc_size: number of rows in the embedding table.
        embed_size: embedding width, also the input channel count of both convolutions.
        classes: sequence of class names; only its length is used.
        seq_len: token length of every input sequence. It determines the
            input size of the linear layer (1025 for the default of 85).

    Raises:
        ValueError: if ``seq_len`` is shorter than the convolution kernel.
    """

    def __init__(self,voc_size,embed_size,classes,seq_len=max_seq_len):
        super(TextClassifier,self).__init__()

        conv_kernel, conv_stride = 5, 2
        pool_kernel, pool_stride = 2, 2
        branch_channels = (20, 30)
        if seq_len < conv_kernel:
            raise ValueError(
                "seq_len must be at least %d, got %d" % (conv_kernel, seq_len)
            )

        # Embedding table
        self.emb = paddle.nn.Embedding(num_embeddings=voc_size,embedding_dim=embed_size)

        # First convolution + pooling branch
        self.conv1 = paddle.nn.Conv1D(in_channels=embed_size,out_channels=branch_channels[0],kernel_size=conv_kernel,stride=conv_stride,data_format='NLC')
        self.maxpool1 = paddle.nn.MaxPool1D(kernel_size=pool_kernel,stride=pool_stride)

        # Second convolution + pooling branch
        self.conv2 = paddle.nn.Conv1D(in_channels=embed_size,out_channels=branch_channels[1],kernel_size=conv_kernel,stride=conv_stride,data_format='NLC')
        self.maxpool2 = paddle.nn.MaxPool1D(kernel_size=pool_kernel,stride=pool_stride)

        # NOTE(review): the convolutions emit NLC tensors and the pooling layers
        # shrink the last axis — the recorded model summary shows
        # [1, 41, 20] -> [1, 41, 10] — so pooling runs over the channel axis
        # while the length stays unchanged. Confirm this is intended; the size
        # computation below reproduces that existing behaviour.
        conv_len = (seq_len - conv_kernel) // conv_stride + 1
        pooled_width = sum((c - pool_kernel) // pool_stride + 1 for c in branch_channels)
        # For seq_len=85: 41 * (10 + 15) = 1025.
        self._flat_features = conv_len * pooled_width

        self.linear = paddle.nn.Linear(in_features=self._flat_features,out_features=len(classes))

    def forward(self,inputs):
        """Return unnormalised class scores (logits) for a batch of id sequences."""
        embedded = self.emb(inputs)

        branch1 = self.maxpool1(self.conv1(embedded))
        branch2 = self.maxpool2(self.conv2(embedded))

        # Join the two branches on the last axis, then flatten each sample.
        merged = paddle.concat([branch1,branch2],axis=2)
        return self.linear(merged.reshape([-1,self._flat_features]))

In [190]:
# Build the classifier with one output unit per entry of names.
textClf = TextClassifier(voc_size=voc_size,embed_size=embed_size,classes=names)

In [191]:
# Per-layer summary for int64 id inputs; the sequence length comes from the
# shared max_seq_len setting instead of a second hard-coded copy of its value.
paddle.summary(textClf,input_size=(-1,max_seq_len),dtypes='int64')

---------------------------------------------------------------------------
 Layer (type)       Input Shape          Output Shape         Param #    
 Embedding-17        [[1, 85]]           [1, 85, 128]         606,080    
   Conv1D-29       [[1, 85, 128]]        [1, 41, 20]          12,820     
 MaxPool1D-29      [[1, 41, 20]]         [1, 41, 10]             0       
   Conv1D-30       [[1, 85, 128]]        [1, 41, 30]          19,230     
 MaxPool1D-30      [[1, 41, 30]]         [1, 41, 15]             0       
   Linear-17        [[1, 1025]]            [1, 10]            10,260     
Total params: 648,390
Trainable params: 648,390
Non-trainable params: 0
---------------------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.11
Params size (MB): 2.47
Estimated Total Size (MB): 2.58
---------------------------------------------------------------------------



{'total_params': 648390, 'trainable_params': 648390}

# 模型训练

In [197]:
import paddle.optimizer
import paddle.nn.functional as F

In [199]:
# Training configuration
epoch_num = 5  # number of passes over train_data
batch_size = 125  # samples per batch
train_data = data_loader(train_set,batch_size = batch_size)  # mode defaults to 'train', so samples are shuffled

In [202]:

# Optimizer
optimizer = paddle.optimizer.Adagrad(learning_rate=0.01,parameters=textClf.parameters())
# Accuracy metric. It is never reset, so the printed value is the running
# accuracy over every batch seen so far, across epochs.
metric = paddle.metric.Accuracy()

global_step = 0
textClf.train()
for epoch in range(epoch_num):

    for step,(x_train,y_train) in enumerate(train_data, start=1):
        outputs = textClf(x_train)

        # cross_entropy receives the raw logits; per the Paddle API its defaults
        # apply softmax internally and reduce to the batch mean, so no separate
        # softmax or paddle.mean is needed for the loss.
        loss = F.cross_entropy(outputs,y_train)

        # Class probabilities, used only for the accuracy metric
        probs = F.softmax(outputs,axis=1)
        correct = metric.compute(probs, y_train)
        metric.update(correct)
        acc = metric.accumulate()

        # Report progress every 10 steps
        global_step += 1
        if global_step % 10 == 0 :
            print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (global_step, epoch, step, float(loss), acc))

        # Parameter update
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()


        

global step 10, epoch: 0, batch: 10, loss: 1.01823, acc: 0.59040
global step 20, epoch: 0, batch: 20, loss: 1.04618, acc: 0.62560
global step 30, epoch: 0, batch: 30, loss: 1.09615, acc: 0.63893
global step 40, epoch: 0, batch: 40, loss: 0.97268, acc: 0.65400
global step 50, epoch: 0, batch: 50, loss: 0.89807, acc: 0.66224
global step 60, epoch: 0, batch: 60, loss: 0.89844, acc: 0.66800
global step 70, epoch: 0, batch: 70, loss: 1.18668, acc: 0.67154
global step 80, epoch: 0, batch: 80, loss: 0.93560, acc: 0.67760
global step 90, epoch: 0, batch: 90, loss: 0.83602, acc: 0.68400
global step 100, epoch: 0, batch: 100, loss: 0.87337, acc: 0.68528
global step 110, epoch: 0, batch: 110, loss: 0.74424, acc: 0.68829
global step 120, epoch: 0, batch: 120, loss: 0.95913, acc: 0.68860
global step 130, epoch: 0, batch: 130, loss: 0.81763, acc: 0.69280
global step 140, epoch: 0, batch: 140, loss: 0.85846, acc: 0.69583
global step 150, epoch: 0, batch: 150, loss: 0.73787, acc: 0.70000
global step 1