# 介绍
这里以我在 GitHub 上开源的 LSTM 文本分类项目为例：https://github.com/Jinkeycode/keras_lstm_chinese_document_classification
把 `master/data` 目录下的三个文件存放到 Google Drive 上。该示例演示的是对健康、科技、设计三个类别的标题进行分类。

# 安装依赖
TensorFlow、NumPy、sklearn 在 Colab 中已经预装，无需另行安装。

In [37]:
!pip install keras
!pip install jieba
!pip install h5py

import h5py
import jieba as jb
import numpy as np
import keras as krs
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder



# 加载数据

In [28]:
# 安装 PyDrive 操作库，该操作每个 notebook 只需要执行一次
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

def login_google_drive():
  """Authenticate the current Colab user and return a PyDrive client.

  Returns:
    A GoogleDrive object built on the application-default credentials.
  """
  # Authorise the user; only the first call actually has to authenticate.
  auth.authenticate_user()
  credential_holder = GoogleAuth()
  credential_holder.credentials = GoogleCredentials.get_application_default()
  return GoogleDrive(credential_holder)

def list_file(drive):
  """Print the title, id and MIME type of each non-trashed file in the Drive root.

  Args:
    drive: Client object exposing ListFile(query).GetList().
  """
  root_query = {'q': "'root' in parents and trashed=false"}
  row_template = 'title: %s, id: %s, mimeType: %s'
  for entry in drive.ListFile(root_query).GetList():
    print(row_template % (entry['title'], entry['id'], entry["mimeType"]))
    

# Authenticate, then list the Drive root so the file ids needed by the next cell can be read off.
drive = login_google_drive()
list_file(drive)

title: tech.txt, id: 14sDl4520Tpo1MLPydjNBoq-QjqOKk9t6, mimeType: text/plain
title: health.txt, id: 117GkBtuuBP3wVjES0X0L4wVF5rp5Cewi, mimeType: text/plain
title: design.txt, id: 1J4lndcsjUb8_VfqPcfsDeOoB21bOLea3, mimeType: text/plain
title: iris, id: 1M3o-kSs59l0PqLNPmd3XPKyMWxRJG8-vvRtUjHVNpAY, mimeType: application/vnd.google-apps.spreadsheet
title: iris.csv, id: 1SM_fLhCcYRsGxgHproAQ1RLrJAZ_Qcem, mimeType: text/csv
title: Colab Notebooks, id: 1U9363AsQAlJTP2nSeoVae9zDKSsKj5Jj, mimeType: application/vnd.google-apps.folder
title: dped.gz, id: 0BwOLOmqkYj-jeUJwQjRNUFkzOTA, mimeType: application/gzip
title: models+code.gz, id: 0BwOLOmqkYj-jcnZpaDR0dU9XMm8, mimeType: application/x-gzip


In [29]:
def cache_data():
  """Download the three category files from Google Drive to local files.

  Uses the module-level `drive` client. The ids below must be replaced with
  the ids printed by the file listing in the previous step.
  """
  # (Drive file id, local file name) for each category.
  sources = (
    ("117GkBtuuBP3wVjES0X0L4wVF5rp5Cewi", 'health.txt'),
    ("14sDl4520Tpo1MLPydjNBoq-QjqOKk9t6", 'tech.txt'),
    ("1J4lndcsjUb8_VfqPcfsDeOoB21bOLea3", 'design.txt'),
  )
  handles = [(drive.CreateFile({'id': file_id}), local_name) for file_id, local_name in sources]

  # The download only caches the files locally; it does not create extra
  # copies in your Google Drive.
  for handle, local_name in handles:
    handle.GetContentFile(local_name, "text/plain")

  print("缓存成功")
  
# Fetch health.txt, tech.txt and design.txt so the loading cell can open them by name.
cache_data()

缓存成功


In [30]:
def load_data():
    """Read every title from the three cached category files.

    The files are read in the fixed order health, tech, design, one title per
    line, with surrounding whitespace stripped. Blank lines are kept as empty
    titles so the number of titles always equals the number of lines.

    Returns:
        A list of title strings in file order.

    Raises:
        OSError: If one of the three files cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # (progress message, file name) in the order the labels expect.
    sources = (
        ("正在加载健康类别的数据...", "health.txt"),
        ("正在加载科技类别的数据...", "tech.txt"),
        ("正在加载设计类别的数据...", "design.txt"),
    )

    titles = []
    for message, path in sources:
        print(message)
        # Decode explicitly as UTF-8: relying on the locale default breaks
        # Chinese text on machines whose default encoding is not UTF-8.
        with open(path, "r", encoding="utf-8") as f:
            titles.extend(line.strip() for line in f)

    print("一共加载了 %s 个标题" % len(titles))

    return titles
  
# All titles in one list, ordered health, then tech, then design.
titles = load_data()

正在加载健康类别的数据...
正在加载科技类别的数据...
正在加载设计类别的数据...
一共加载了 31318 个标题


In [31]:
def load_label(counts=(12000, 12000, 7318)):
    """Build the one-hot label matrix for the concatenated title list.

    Titles are loaded category by category, so the labels are each class
    index (0, 1, 2, ...) repeated once per title of that category.

    Args:
        counts: Number of titles per category, in loading order. The default
            matches the data set used in this notebook; pass the real line
            counts when using other files so labels and titles stay aligned.

    Returns:
        A 2-D array with one one-hot row per title.
    """
    target = np.repeat(np.arange(len(counts)), counts)
    print("一共加载了 %s 个标签" % target.shape[0])

    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(target)
    # Use the public entry point; the `np_utils` submodule is an internal
    # path that is not available in every Keras release.
    dummy_target = krs.utils.to_categorical(encoded_target)

    return dummy_target
  
# One-hot labels; intended to line up row by row with `titles` (same category order and counts).
target = load_label()

一共加载了 31318 个标签


# 文本预处理

In [32]:
max_sequence_length = 30
embedding_size = 50

# Segment each title with jieba (full mode) and join the tokens with spaces,
# the same separator the prediction cell uses, so training and inference
# text go through identical preprocessing.
titles = [" ".join(jb.cut(t, cut_all=True)) for t in titles]

# Map every token to an integer id, giving one fixed-length id sequence
# (max_sequence_length) per title. This is a vocabulary lookup, not word2vec.
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_sequence_length, min_frequency=1)
text_processed = np.array(list(vocab_processor.fit_transform(titles)))

# Word -> id mapping, plus the same entries sorted by id.
# NOTE: the name `dict` shadows the builtin; it is kept because the
# model-building cell reads it.
dict = vocab_processor.vocabulary_._mapping
sorted_vocab = sorted(dict.items(), key=lambda x: x[1])

19321


# 构建神经网络
这里使用 Embedding 层和 LSTM 层作为前两层，最后通过 softmax 激活输出各类别的概率。

In [39]:
# 配置网络结构
def build_netword(num_vocabs):
    """Assemble and compile the Embedding -> LSTM -> Dense -> softmax classifier.

    Reads the module-level `embedding_size` and `max_sequence_length`.

    Args:
        num_vocabs: Vocabulary size, used as the Embedding input dimension.

    Returns:
        The compiled Sequential model with three softmax outputs.
    """
    layer_stack = [
        krs.layers.Embedding(num_vocabs, embedding_size, input_length=max_sequence_length),
        krs.layers.LSTM(32, dropout=0.2, recurrent_dropout=0.2),
        krs.layers.Dense(3),
        krs.layers.Activation("softmax"),
    ]
    network = krs.Sequential(layer_stack)
    network.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    return network
  
# Vocabulary size. `dict` here is the word -> id mapping built in the
# preprocessing cell (it shadows the builtin of the same name).
num_vocabs = len(dict.items())
model = build_netword(num_vocabs=num_vocabs)

import time
start = time.time()
# Train the model and report the wall-clock time it took.
model.fit(text_processed, target, batch_size=512, epochs=10, )
finish = time.time()
print("训练耗时：%f 秒" %(finish-start))
# Save the model
# model.save("health_and_tech_design.h5")  # disabled: Colab still reports h5py as missing even after installing it, for reasons unknown

# Load the pre-trained model
# model.load_weights("health_and_tech_design.h5")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
 5632/31318 [====>.........................] - ETA: 6s - loss: 0.0809 - acc: 0.9837

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
训练耗时：76.154183 秒


# 预测样本
`sen` 可以换成你自己的句子。预测结果为 [健康类文章概率, 科技类文章概率, 设计类文章概率]，概率最高的类别即为预测类别；但若最大概率不超过 0.8，则判定为无法分类的文章。

In [40]:
sen = "做好商业设计需要学习的小技巧"
# Segment the sentence with jieba (full mode), then convert it to an id
# sequence with the already-fitted vocabulary processor.
sen_prosessed = " ".join(jb.cut(sen, cut_all=True))
sen_prosessed = vocab_processor.transform([sen_prosessed])
sen_prosessed = np.array(list(sen_prosessed))
result = model.predict(sen_prosessed)

# result[0] holds the class probabilities for the single input sentence,
# ordered health, tech, design.
catalogue = int(np.argmax(result[0]))
threshold = 0.8
category_messages = (
    "这是一篇关于健康的文章",
    "这是一篇关于科技的文章",
    "这是一篇关于设计的文章",
)
# Only report a category when the model is confident enough. The fallback
# must hang off this threshold check; attached to the category chain it was
# unreachable and low-confidence inputs printed nothing.
if result[0][catalogue] > threshold:
    print(category_messages[catalogue])
else:
    print("这篇文章没有可信分类")

这是一篇关于设计的文章
