# 文本分类

	本次实践将使用 Python 语言结合 PaddlePaddle 深度学习框架，构建卷积神经网络，实现新闻标题的分类模型。

    数据来源：从网站上爬取的 56821 条中文新闻摘要数据

    数据内容：包含10种类别，国际、文化、娱乐、体育、财经、汽车、教育、科技、房产、证券
![](https://ai-studio-static-online.cdn.bcebos.com/1789ed5939d24134b9ce4d45ca15e0fdf21f25aa400c4f6898cc4e02aea5c74e)

# **1、准备数据:**

    创建数据集和数据字典

    创建数据读取器train_reader 和test_reader

# **2、配置网络**

定义网络

定义损失函数

定义优化算法

# **3、训练网络**

# **4、模型评估**

# **5、模型预测**


In [3]:
# List the dataset directory currently mounted in the workspace
!ls /home/aistudio/data/
# Copy (cp, not move) the raw news file from the mounted dataset folder into data/
!cp data/data6825/news_classify_data.txt data/

data6825  news_classify_data.txt


In [4]:
# 导入必要的包
import os
from multiprocessing import cpu_count
import numpy as np
import shutil
import paddle
import paddle.fluid as fluid

![](https://ai-studio-static-online.cdn.bcebos.com/54a8321ecd084ae5b659d73b0b1e58bc1e45835d07b242e1a2fc5b46bbae77f1)

In [5]:
# Demo: build a dict from a list of [key, value] pairs, then merge another dict into it
s = [["我",1],["是",3],["谁",8]]
print(s)
d = dict(s)
print(d)
d['他'] = 5
d1 = {"你":2,"他":4}
d.update(d1) # merge d1 into d: keys already present in d are overwritten, missing keys are added
print(d)

[['我', 1], ['是', 3], ['谁', 8]]
{'我': 1, '是': 3, '谁': 8}
{'我': 1, '是': 3, '谁': 8, '他': 4, '你': 2}


In [6]:
# Demo: iterating over a string yields one character at a time, the space included
s = 'aaa bbb'
for i in  s:
    print(i)

a
a
a
 
b
b
b


In [7]:
# Create the dataset lists and the character dictionary

# Directory that holds the raw data file and receives the generated files
data_root_path='data'

# 创建验证的数据列表，测试的数据列表
def create_data_list(data_root_path):
    """Encode every news title and split the records into test/train lists.

    Reads ``dict_txt.txt`` (character -> id mapping) and
    ``news_classify_data.txt`` from ``data_root_path`` and (re)creates
    ``test_list.txt`` and ``train_list.txt`` in that same directory. Records
    whose zero-based index is a multiple of 10 go to the test list, all other
    records go to the train list. Each output line has the form
    ``id,id,...<TAB>label``.

    Args:
        data_root_path: Directory containing the two input files; the list
            files are written there and overwritten on every call.

    Raises:
        KeyError: If a title contains a character missing from the dictionary.
        IndexError: If a data line contains no ``_!_`` separator.
    """
    dict_file = os.path.join(data_root_path, 'dict_txt.txt')
    data_file = os.path.join(data_root_path, 'news_classify_data.txt')
    # Build the output paths with os.path.join as well. Plain string
    # concatenation ('data' + 'test_list.txt') pointed at a different file, so
    # the real lists were never truncated and grew on every re-run.
    test_file = os.path.join(data_root_path, 'test_list.txt')
    train_file = os.path.join(data_root_path, 'train_list.txt')

    with open(dict_file, 'r', encoding='utf-8') as f_dict:
        # NOTE(review): eval() executes the file content as Python code; this
        # is only acceptable because the file is produced by create_dict().
        dict_txt = eval(f_dict.readlines()[0])

    with open(data_file, 'r', encoding='utf-8') as f_data:
        lines = f_data.readlines()

    # Open both outputs once in write mode instead of re-opening per record.
    with open(test_file, 'w', encoding='utf-8') as f_test, \
            open(train_file, 'w', encoding='utf-8') as f_train:
        for i, line in enumerate(lines):
            fields = line.split('_!_')
            title = fields[-1].replace('\n', '')  # last field, newline removed
            label = fields[1]                     # second field is the category id
            codes = ','.join(str(dict_txt[ch]) for ch in title)
            record = codes + '\t' + label + '\n'  # ids<TAB>label
            if i % 10 == 0:
                # Every 10th record is held out for testing and echoed.
                print(title + "\n" + record)
                f_test.write(record)
            else:
                f_train.write(record)
    print("数据列表生成完成！")


# 把下载得数据生成一个字典
def create_dict(data_path, dict_path):
    """Build the character dictionary of the corpus and save it to disk.

    Collects every distinct character found in the title field (the last
    ``_!_``-separated field) of each line in ``data_path``, assigns the
    characters consecutive ids starting at 0, appends an ``"<unk>"`` entry
    with the next free id, and writes ``str(dict)`` to ``dict_path``.

    Args:
        data_path: Path of the raw ``_!_``-separated data file.
        dict_path: Path of the dictionary file to create or overwrite.
    """
    charset = set()
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            title = line.split('_!_')[-1].replace('\n', '')
            charset.update(title)  # a set keeps each character only once

    # Assign ids in sorted order. Set iteration order changes between
    # interpreter runs, so without sorting a regenerated dictionary would no
    # longer match a model trained with an earlier one.
    dict_txt = {ch: idx for idx, ch in enumerate(sorted(charset))}
    # Reserve the last id for characters unseen at dictionary-build time.
    dict_txt["<unk>"] = len(dict_txt)

    with open(dict_path, 'w', encoding='utf-8') as f:
        f.write(str(dict_txt))

    print("数据字典生成完成！")


# 获取字典的长度
def get_dict_len(dict_path):
    """Return the number of entries stored in the dictionary file.

    The first line of ``dict_path`` is evaluated as a Python literal, its
    type is printed, and the number of keys is returned.
    """
    with open(dict_path, 'r', encoding='utf-8') as dict_file:
        first_line = dict_file.readlines()[0]
    # The file holds the repr of a dict, so evaluating it rebuilds the mapping.
    vocab = eval(first_line)
    print(type(vocab))
    key_view = vocab.keys()
    return len(key_view)


if __name__ == '__main__':
    # All generated files live in the same folder as the raw data.
    data_root_path = "data"
    # Raw corpus first, dictionary file second.
    data_path, dict_path = (
        os.path.join(data_root_path, file_name)
        for file_name in ('news_classify_data.txt', "dict_txt.txt")
    )

    # Step 1: build the character dictionary from the raw corpus.
    create_dict(data_path, dict_path)

    # Step 2: encode the titles and write the train/test lists.
    create_data_list(data_root_path)

    # Step 3: report the dictionary type (the returned size is not used here).
    get_dict_len(dict_path)

<class 'dict'>

In [8]:
# Demo: the number of keys of a dict, as used by get_dict_len
d = {"a":1,"b":2,"c":3}
print(len(d.keys()))

3


创建好的字典：
![](https://ai-studio-static-online.cdn.bcebos.com/ecad7b7163334648b174929a03c9624b860b9224b8a84e08816c1e9146b06d93)

创建好的数据列表：
![](https://ai-studio-static-online.cdn.bcebos.com/843d51728e0647a79fa83fc7cedbccbf9b8e5b9adaab42db90a414d003a35dd8)




paddle.reader.xmap_readers()：通过多线程方式，使用用户自定义的映射器 mapper 处理 reader 返回的样本，并将结果放入输出队列。

In [9]:
 p = [x for x in range(10)] # list comprehension: collect every x produced by the loop into a list
 p

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [10]:
# Data readers: train_reader and test_reader
# Pre-processing of the train/test samples
# NOTE: the string literal below is a disabled mapper kept for reference only;
# it is never executed as code.
'''
def data_mapper(sample):
    data, label = sample
    data = [int(data) for data in data.split(',')]
    return data, int(label)
'''

# 创建数据读取器train_reader
def train_reader(train_list_path):
    """Create a reader over the training list that yields shuffled samples.

    Args:
        train_list_path: Path of a list file whose lines have the form
            ``id,id,...<TAB>label``.

    Returns:
        A zero-argument function. Each call returns a generator that yields
        ``(ids, label)`` tuples, where ``ids`` is a list of ints and ``label``
        an int, in a freshly shuffled order.
    """
    def reader():
        # Load all lines and close the file before the first yield, so a
        # consumer that stops early (e.g. next(reader())) leaves no open handle.
        with open(train_list_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        # Shuffle in place so every pass sees the samples in a new order.
        np.random.shuffle(lines)
        for line in lines:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            codes, label = line.split('\t')
            yield [int(code) for code in codes.split(',')], int(label)
    return reader
#  创建数据读取器test_reader
def test_reader(test_list_path):

    def reader():
        with open(test_list_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                data, label = line.split('\t')
                data = [int(data) for data in data.split(',')]
                yield data, int(label)
    return reader
    #return paddle.reader.xmap_readers(data_mapper, reader, cpu_count(), 1024)

In [11]:
# Demo: parse one list-file line into its id list (the label after the tab is ignored here)
a = "3654,3039,2426,3195,3515,2763,2554,4561,2505,2397,4381,3778,359,2798,4397,1145	0"
data,label = a.split("\t")
print(data)
# Convert the comma-separated id string into a list of ints
data = [int(data) for data in data.split(',')]
print(data)

3654,3039,2426,3195,3515,2763,2554,4561,2505,2397,4381,3778,359,2798,4397,1145
[3654, 3039, 2426, 3195, 3515, 2763, 2554, 4561, 2505, 2397, 4381, 3778, 359, 2798, 4397, 1145]


In [12]:
def test_ye(n):
    # Generator demo: yields n successive values of the sequence 2, 3, 5, 8, ...
    a,b = 1,1
    for i in range(n):
        a,b = b,a+b # the right-hand side is evaluated first, then both names are rebound together
        yield b
# Calling the function only creates a generator object; nothing runs yet
print(test_ye(4))
# Iterating the generator produces the values one at a time
for item in test_ye(10):
    print(item)

<generator object test_ye at 0x7faecb467cd0>
2
3
5
8
13
21
34
55
89
144


# 卷积神经网络（Convolutional Neural Networks, CNN）

卷积核作用于输入的词向量序列，产生一个特征图（feature map）；对特征图采用时间维度上的最大池化（max pooling over time）操作，得到该卷积核对应的整句话的特征；最后，将所有卷积核得到的特征拼接起来，即为文本的定长向量表示。对于文本分类问题，将其连接至 softmax 层即可构建出完整的模型。

在实际应用中，我们会使用多个卷积核来处理句子，窗口大小相同的卷积核堆叠起来形成一个矩阵，这样可以更高效地完成运算。

另外，我们也可以使用窗口大小不同的卷积核来处理句子。

![](https://ai-studio-static-online.cdn.bcebos.com/3766261f24b54514b6cbc0d30270c6a3f38c1d0aaf8f450c97e8303eca51f204)

In [13]:
# 定义CNN，
def convolution_net(data, input_dim, class_dim=10, emb_dim=128, hid_dim=128, hid_dim2=128):
    """Build the text-CNN classifier and return its softmax output.

    The input ids are looked up in a sparse embedding table of size
    ``[input_dim, emb_dim]``. The embedding feeds two sequence conv-pool
    branches (filter sizes 3 and 4, ``tanh`` activation, ``sqrt`` pooling),
    and both branch outputs go into one fully connected softmax layer.

    Args:
        data: Input variable holding the id sequence.
        input_dim: First dimension of the embedding table.
        class_dim: Size of the softmax output layer.
        emb_dim: Second dimension of the embedding table.
        hid_dim: Number of filters of the size-3 branch.
        hid_dim2: Number of filters of the size-4 branch.

    Returns:
        The output variable of the softmax layer.
    """
    embedded = fluid.layers.embedding(
        input=data, size=[input_dim, emb_dim], is_sparse=True)

    # (number of filters, filter size) per branch, built in this order.
    branch_specs = ((hid_dim, 3), (hid_dim2, 4))
    branches = [
        fluid.nets.sequence_conv_pool(
            input=embedded,
            num_filters=filter_count,
            filter_size=window,
            act='tanh',
            pool_type='sqrt',
        )
        for filter_count, window in branch_specs
    ]

    return fluid.layers.fc(input=branches, size=class_dim, act='softmax')

In [33]:
def inference_program(word_dic):
    """Declare the ``words`` input and build the CNN on top of it.

    Args:
        word_dic: Path of the dictionary file; its entry count is used as the
            first dimension of the embedding table.

    Returns:
        The prediction variable produced by ``convolution_net``.
    """
    # Variable-length id sequences: int64 with one level of LoD.
    words_input = fluid.layers.data(
        name='words', shape=[1], dtype='int64', lod_level=1)
    vocab_size = get_dict_len(word_dic)
    return convolution_net(words_input, vocab_size)

In [34]:
def train_program(prediction):
    """Attach the label input, the loss and the accuracy to the network.

    Args:
        prediction: Softmax output variable of the classifier.

    Returns:
        A two-element list ``[avg_cost, acc]``: the mean cross-entropy loss
        and the accuracy variable.
    """
    target = fluid.layers.data(name='label', shape=[1], dtype='int64')
    per_sample_loss = fluid.layers.cross_entropy(input=prediction, label=target)
    mean_loss = fluid.layers.mean(per_sample_loss)
    accuracy = fluid.layers.accuracy(input=prediction, label=target)
    return [mean_loss, accuracy]

In [16]:
def optimizer_func():
    """Return the Adagrad optimizer (learning rate 0.002) used for training."""
    adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
    return adagrad

In [17]:
use_cuda = False  # train on the CPU
# Pick the device matching the flag and create the executor that runs programs on it
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)

In [37]:
# Build the network from the saved dictionary, then attach the loss and accuracy
prediction = inference_program('data/dict_txt.txt')
[avg_cost,acc] = train_program(prediction)


<class 'dict'>


In [39]:
# Create the optimizer and let it minimize the average loss
optimizer = optimizer_func()
optimizer.minimize(avg_cost)

([inputs {
    parameter: "Grad"
    arguments: "embedding_13.w_0@GRAD"
  }
  inputs {
    parameter: "LearningRate"
    arguments: "learning_rate_0"
  }
  inputs {
    parameter: "Moment"
    arguments: "embedding_13.w_0_moment_0"
  }
  inputs {
    parameter: "Param"
    arguments: "embedding_13.w_0"
  }
  outputs {
    parameter: "MomentOut"
    arguments: "embedding_13.w_0_moment_0"
  }
  outputs {
    parameter: "ParamOut"
    arguments: "embedding_13.w_0"
  }
  type: "adagrad"
  attrs {
    name: "op_callstack"
    type: STRINGS
    strings: "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/framework.py\", line 1771, in append_op\n    attrs=kwargs.get(\"attrs\", None))\n"
    strings: "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/optimizer.py\", line 1230, in _append_optimize_op\n    stop_gradient=True)\n"
    strings: "  File \"/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/pa

In [40]:
# 获取训练数据读取器和测试数据读取器
train_reader = paddle.batch(reader=train_reader('/home/aistudio/data/train_list.txt'), batch_size=128)
test_reader = paddle.batch(reader=test_reader('/home/aistudio/data/test_list.txt'), batch_size=128)

In [41]:
# Pull the first batch from the training reader and show its first sample
sampledata = next(train_reader())
print(sampledata[0])

([4461, 191, 2337, 4087, 2404, 1437, 3225, 2552, 3171, 3063, 1637, 586, 154, 148, 3716, 857, 4423, 1637, 857, 4423, 971, 3399, 1010, 2618, 4338, 1959, 693, 3063, 2654], 2)


In [43]:
# 定义数据映射器
feeder = fluid.DataFeeder(place=place, feed_list=['words','label'])

In [44]:
 # Keep the first three samples of the batch for inspection
 t =[sampledata[0],sampledata[1],sampledata[2]]

In [45]:
# Print the number of ids in each selected sample (titles differ in length)
for i in t:
    print(len(i[0]))

29
29
26


In [49]:
#参数初始化
exe.run(fluid.default_startup_program())

[]

In [50]:
# Clone the main program in test mode for evaluation
# NOTE(review): this clone happens after optimizer.minimize() has run on the
# main program; the Program.clone documentation appears to recommend cloning
# before minimize — confirm the evaluation program is what is intended.
test_program = fluid.default_main_program().clone(for_test=True)

In [48]:
EPOCH_NUM=10
model_save_dir = '/home/aistudio/work/infer_model/'
# Train for EPOCH_NUM passes; after every pass evaluate on the whole test set.

for pass_id in range(EPOCH_NUM):
    # Training: run the main program once per batch
    for batch_id, data in enumerate(train_reader()):
        train_cost, train_acc = exe.run(program=fluid.default_main_program(),
                             feed=feeder.feed(data),
                             fetch_list=[avg_cost, acc])

        # Report progress every 100 batches
        if batch_id % 100 == 0:
            print('Pass:%d, Batch:%d, Cost:%0.5f, Acc:%0.5f' % (pass_id, batch_id, train_cost[0], train_acc[0]))
    # Evaluation: run the test-mode program over every test batch
    test_costs = []
    test_accs = []
    for batch_id, data in enumerate(test_reader()):
        test_cost, test_acc = exe.run(program=test_program,
                                              feed=feeder.feed(data),
                                              fetch_list=[avg_cost,acc])
        test_costs.append(test_cost[0])
        test_accs.append(test_acc[0])
    # Average the per-batch loss and accuracy over the test set
    test_cost = (sum(test_costs) / len(test_costs))
    test_acc = (sum(test_accs) / len(test_accs))
    print('Test:%d, Cost:%0.5f, ACC:%0.5f' % (pass_id, test_cost, test_acc))

# Save the inference model
os.makedirs(model_save_dir, exist_ok=True)
# The input layer is registered under the name 'words' and the classifier
# output is the global `prediction`. The names previously used here (`words`,
# `model`) are never defined in this notebook and raised NameError.
fluid.io.save_inference_model(model_save_dir,
                            feeded_var_names=['words'],
                            target_vars=[prediction],
                            executor=exe)
print('训练模型保存完成！')

Pass:0, Batch:0, Cost:2.30502, Acc:0.12500


In [None]:
# NOTE(review): `words` is not assigned anywhere in the visible notebook, so
# this call is expected to raise NameError unless it is defined elsewhere.
fluid.layers.Print(words)

In [None]:
# Run the saved model on new sentences and print the predicted category.
infer_exe = fluid.Executor(place)    # executor used for inference

inference_scope = fluid.core.Scope() # separate scope for the loaded model's variables

save_path = '/home/aistudio/work/infer_model/'


def get_data(sentence):
    """Encode a sentence as a list of character ids.

    Characters that are missing from the saved dictionary are mapped to the
    id of ``'<unk>'``.

    Args:
        sentence: The text to encode, processed one character at a time.

    Returns:
        A list of int ids, one per character of ``sentence``.
    """
    with open('/home/aistudio/data/dict_txt.txt', 'r', encoding='utf-8') as f_data:
        # NOTE(review): eval() executes the file content as Python code; only
        # acceptable because the file is generated by this notebook.
        dict_txt = eval(f_data.readlines()[0])
    return [int(dict_txt[ch if ch in dict_txt else '<unk>']) for ch in sentence]


# Inside the guard every variable is created in inference_scope instead of the global scope.
with fluid.scope_guard(inference_scope):

    # Load the inference program, the names of its inputs and its output variables.
    # The model is loaded with infer_exe, the same executor that runs it below.
    [infer_program, feeded_var_names, target_var] = fluid.io.load_inference_model(dirname=save_path, executor=infer_exe)

    # Encode the sentences to classify
    data = [
        get_data('在获得诺贝尔文学奖7年之后，莫言15日晚间在山西汾阳贾家庄如是说'),
        get_data('综合“今日美国”、《世界日报》等当地媒体报道，芝加哥河滨警察局表示，'),
    ]
    print(data)
    # Length of every sentence, needed to build the LoD tensor
    base_shape = [[len(c) for c in data]]

    # Pack the id lists into one LoD tensor
    tensor_words = fluid.create_lod_tensor(data, base_shape, place)
    print(tensor_words)
    # Run the prediction
    result = infer_exe.run(program=infer_program,
                    feed={feeded_var_names[0]: tensor_words},
                    fetch_list=target_var)

    # Category names indexed by label id
    # NOTE(review): this order must match the label ids in the data file — confirm.
    names = [ '文化', '娱乐', '体育', '财经','房产', '汽车', '教育', '科技', '国际', '证券']
    # Sort the class indices of every sentence by ascending probability (computed once)
    ranked = np.argsort(result)
    print(ranked)
    # The last index of each row is the most probable label
    for i, probs in enumerate(result[0]):
        lab = ranked[0][i][-1]
        print('预测结果标签为：%d， 名称为：%s， 概率为：%f' % (lab, names[lab], probs[lab]))