In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
tf.__version__

  from ._conv import register_converters as _register_converters


'2.1.0'

In [8]:
# preprocess data
# training data is downloaded from
# https://github.com/SophonPlus/ChineseNlpCorpus/raw/master/datasets/online_shopping_10_cats/online_shopping_10_cats.zip

data = pd.read_csv('../data/online_shopping_10_cats.csv')

# Dump the reviews, one per line, as the corpus the tokenizer-training cell
# reads. This used to be commented out, so a fresh "Restart & Run All" failed
# because the corpus file did not exist. UTF-8 is set explicitly so the
# Chinese text is written identically regardless of the platform default.
with open('../data/data_for_tokenizer.txt', 'w', encoding='utf-8') as f:
    f.writelines(str(review) + '\n' for review in data['review'])

In [9]:
data.shape

(62774, 3)

- Train our tokenizer: fit a SentencePiece model on the review text.

In [11]:
import sentencepiece as spm

# Flags for the SentencePiece trainer: read the review corpus at
# ../data/data_for_tokenizer.txt, write the model under the prefix `m`
# (a later cell loads ./m.model), and request a 100000-piece vocabulary.
train_command = """
--input=../data/data_for_tokenizer.txt --model_prefix=m --vocab_size=100000
"""

# strip() removes the newlines the triple-quoted literal puts around the flags.
spm.SentencePieceTrainer.Train(train_command.strip())

True

In [17]:
# Load the trained tokenizer and inspect how it segments one review
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load('./m.model')

sample_review = data['review'][5]
# A single print call: sep='\n' puts every item on its own line, and the empty
# string yields the blank line between the raw text and the pieces.
print('原文:', sample_review, '', '分词后:', sp.EncodeAsPieces(sample_review), sep='\n')

True

原文:
作者有一种专业的谨慎，若能有幸学习原版也许会更好，简体版的书中的印刷错误比较多，影响学者理解，全书结构简单，但内容详实，学起来如鱼得水非常轻松。这只是一项技术而已，若可以结合本专业，将会得到更高的学习快乐，家财万贯不如一技在身，一技在身不如一念在心，本书有不仅有技，而且有念。书中佳品。

分词后:
['▁作者', '有一种', '专业的', '谨慎', ',', '若', '能有幸', '学习', '原版', '也许', '会更好', ',', '简体', '版的', '书中的', '印刷错误', '比较多', ',', '影响', '学者', '理解', ',', '全书', '结构', '简单', ',', '但内容', '详实', ',', '学起来', '如鱼得水', '非常', '轻松', '。', '这', '只是', '一项', '技术', '而已', ',', '若', '可以', '结合', '本', '专业', ',', '将会', '得到', '更高的', '学习', '快乐', ',', '家', '财', '万贯', '不如', '一技在身', ',', '一技在身', '不如', '一念', '在', '心', ',', '本书', '有', '不仅有', '技', ',', '而且有', '念', '。', '书中', '佳品', '。']


In [16]:
def encode(text: str):
    """Return the SentencePiece token ids for ``text`` (uses the global ``sp``)."""
    return sp.EncodeAsIds(text)

# Cast every review to str before encoding, then pad the id lists to a fixed
# width of 200 with padding placed in front ('pre').
encoded = keras.preprocessing.sequence.pad_sequences(
    [encode(review) for review in data['review'].astype(str)],
    maxlen=200,
    padding='pre',
)
encoded.shape

(62774, 200)

# TextCNN

![structure](https://img-blog.csdnimg.cn/20190326141457137.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2FzaWFsZWVfYmlyZA==,size_16,color_FFFFFF,t_70)

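The paper linked below presents a multi-layer TextCNN architecture built from multiple Conv-BN-ReLU blocks. The model implemented in this notebook is a simpler variant: a single convolution layer per region size, without batch normalization.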

https://arxiv.org/pdf/1801.06287.pdf

In [75]:
class TextCNN(keras.models.Model):
    """Single-layer TextCNN classifier over padded token-id sequences.

    Token ids are embedded, passed through one ``Conv1D`` branch per entry of
    ``region_sizes``, max-pooled over the whole sequence, concatenated, and fed
    through a 128-unit ReLU dense layer, dropout (rate 0.3) and a 2-way softmax.

    Args:
        region_sizes: kernel widths (in tokens); one ``Conv1D`` branch is
            created per entry. Defaults to ``[2, 3, 4]``.
        filter_size: number of output filters of every ``Conv1D`` branch
            (it is passed as ``filters=``, not as a kernel size).
        emb_size: dimension of the token embeddings.
        vocab_size: number of rows in the embedding table; it must be larger
            than the largest token id fed to the model.
        **kwargs: forwarded to ``keras.models.Model``.
    """

    def __init__(self, region_sizes=None, filter_size=2, emb_size=20, vocab_size=5000, **kwargs):
        # Initialise the Keras base class first, and do not pass `self` a
        # second time: the extra positional argument was only tolerated by
        # accident and is not part of the Model constructor contract.
        super().__init__(**kwargs)
        self.region_sizes = list(region_sizes or [2, 3, 4])
        self.filter_size = filter_size
        self.emb_size = emb_size
        self.vocab_size = vocab_size

        # Every layer is created exactly once here so Keras tracks it; none of
        # them needs the input shape.
        self.emb = keras.layers.Embedding(self.vocab_size, self.emb_size)
        self.conv_layers = [keras.layers.Conv1D(filters=self.filter_size,
                                                kernel_size=k,
                                                activation='relu',
                                                padding='same') for k in self.region_sizes]
        # Global max pooling is equivalent to MaxPool1D(sent_length) followed by
        # Flatten, but does not tie the model to one fixed sequence length.
        self.maxpool_layers = [keras.layers.GlobalMaxPool1D() for _ in self.region_sizes]
        self.concat = keras.layers.Concatenate(axis=-1)
        self.fc_layers_1 = keras.layers.Dense(128, activation='relu')
        self.dropout = keras.layers.Dropout(rate=0.3)
        self.fc_output = keras.layers.Dense(2, activation='softmax')

    def call(self, input, training=None):
        """Run the forward pass.

        Args:
            input: integer token ids of shape ``[batch_size, sent_length]``.
            training: forwarded to the dropout layer so it is only active
                during training.

        Returns:
            Softmax class probabilities of shape ``[batch_size, 2]``.
        """
        embeded = self.emb(input)  # [batch_size, sent_length, emb_size]
        # one tensor per region size, each [batch_size, sent_length, filter_size]
        conv_output = [c(embeded) for c in self.conv_layers]
        # max over the sequence axis, each [batch_size, filter_size]
        maxpool_output = [p(o) for p, o in zip(self.maxpool_layers, conv_output)]

        # Concatenate needs at least two inputs; with a single region size the
        # only branch is used as is.
        if len(maxpool_output) == 1:
            output = maxpool_output[0]
        else:
            output = self.concat(maxpool_output)  # [batch_size, filter_size * len(region_sizes)]

        output = self.fc_layers_1(output)
        output = self.dropout(output, training=training)
        output = self.fc_output(output)
        return output

In [82]:
# Size the embedding table from the tokenizer itself: the tokenizer was trained
# with --vocab_size=100000, so the model's default vocab_size=5000 would leave
# most token ids outside the embedding table.
text_cnn = TextCNN(vocab_size=sp.GetPieceSize())

In [72]:
from sklearn.model_selection import train_test_split

# 70/30 split, stratified on the label, with a fixed seed.
labels = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    encoded,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=1024,
)
X_train.shape, X_test.shape

((43941, 200), (18833, 200))

In [77]:
# Pair each padded id sequence with its one-hot label.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, keras.utils.to_categorical(y_train)))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, keras.utils.to_categorical(y_test)))

train_dataset = train_dataset.batch(32, drop_remainder=True)
# Batch the test split as well; unbatched, it yields single examples without a
# batch axis, which is not what the model expects. The final partial batch is
# kept so every test example is seen.
test_dataset = test_dataset.batch(32)

In [78]:
# Smoke test: run one training batch through the model. Each dataset element
# is an (inputs, labels) pair; only the inputs are fed forward.
sample_data = next(iter(train_dataset))
sample_inputs = sample_data[0]
text_cnn(sample_inputs)

<tf.Tensor: shape=(32, 2), dtype=float32, numpy=
array([[0.49919888, 0.50080115],
       [0.4969505 , 0.50304943],
       [0.49848965, 0.5015103 ],
       [0.49731615, 0.50268376],
       [0.49868202, 0.501318  ],
       [0.500227  , 0.499773  ],
       [0.4983687 , 0.5016313 ],
       [0.4988659 , 0.50113416],
       [0.49807504, 0.50192493],
       [0.49875778, 0.5012422 ],
       [0.50001603, 0.49998394],
       [0.4986122 , 0.5013877 ],
       [0.49960265, 0.5003973 ],
       [0.4994362 , 0.50056386],
       [0.4998263 , 0.5001737 ],
       [0.49859416, 0.50140584],
       [0.49901065, 0.5009893 ],
       [0.49835193, 0.50164807],
       [0.49991155, 0.5000884 ],
       [0.4993303 , 0.50066966],
       [0.49856788, 0.5014321 ],
       [0.49830568, 0.50169426],
       [0.49938536, 0.50061464],
       [0.49940613, 0.5005939 ],
       [0.501117  , 0.498883  ],
       [0.5000554 , 0.49994466],
       [0.50042486, 0.49957517],
       [0.49873164, 0.5012683 ],
       [0.4976803 , 0.50231

In [83]:
# Track accuracy alongside the loss, and evaluate on the held-out split after
# every epoch. Without these the run reports nothing about generalisation and
# X_test / y_test are never used.
text_cnn.compile(optimizer=keras.optimizers.Adam(),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])

text_cnn.fit(X_train, keras.utils.to_categorical(y_train),
             validation_data=(X_test, keras.utils.to_categorical(y_test)),
             epochs=10, batch_size=32)

Train on 43941 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x14f7067f0>