In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
# Display the installed TensorFlow version so the environment is recorded in the notebook output.
tf.__version__

  from ._conv import register_converters as _register_converters


'2.1.0'

In [17]:
# Preprocess data.
# The training data is downloaded from
# https://github.com/SophonPlus/ChineseNlpCorpus/raw/master/datasets/online_shopping_10_cats/online_shopping_10_cats.zip

data = pd.read_csv('../data/online_shopping_10_cats.csv')

# Dump one review per line; this file is the training corpus for the tokenizer.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which can fail on (or mis-encode) Chinese text.
with open('../data/data_for_tokenizer.txt', 'w', encoding='utf-8') as f:
    f.writelines(str(review) + '\n' for review in data['review'])

- Train our SentencePiece tokenizer on the review text

In [16]:
import sentencepiece as spm

# Command-line style flags for the SentencePiece trainer: the corpus to read,
# the prefix of the produced model/vocab files, and the vocabulary size.
trainer_flags = '--input=../data/data_for_tokenizer.txt --model_prefix=m --vocab_size=5000'
spm.SentencePieceTrainer.Train(trainer_flags)

True

In [22]:
# Load the trained tokenizer model and show how it segments one review.

sp = spm.SentencePieceProcessor()
sp.load('./m.model')

# Print the raw review first, then its sub-word pieces, one per line.
sample_review = data.loc[1, 'review']
print(sample_review, sp.EncodeAsPieces(sample_review), sep='\n')

True

作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到真理的火花。整本书的结构颇有特点，从当时（本书写于八十年代）流行的计算机话题引入，再用数学、物理学、宇宙学做必要的铺垫——这些内容占据了大部分篇幅，最后回到关键问题：电脑能不能代替人脑。和现在流行的观点相反，作者认为人的某种“洞察”是不能被算法模拟的。也许作者想说，人的灵魂是无可取代的。
['▁', '作者', '真', '有', '英', '国', '人', '严', '谨', '的', '风格', ',', '提出', '观', '点', '、', '进行', '论', '述', '论', '证', ',', '尽', '管', '本人', '对', '物', '理', '学', '了解', '不', '深', ',', '但是', '仍', '然', '能', '感受', '到', '真', '理', '的', '火', '花', '。', '整', '本书', '的', '结', '构', '颇', '有', '特', '点', ',', '从', '当时', '(', '本书', '写', '于', '八', '十', '年', '代', ')', '流', '行', '的', '计', '算', '机', '话', '题', '引', '入', ',', '再', '用', '数', '学', '、', '物', '理', '学', '、', '宇', '宙', '学', '做', '必', '要', '的', '铺', '垫', '——', '这些', '内容', '占', '据', '了', '大', '部分', '篇', '幅', ',', '最后', '回', '到', '关', '键', '问题', ':', '电脑', '能', '不能', '代', '替', '人', '脑', '。', '和', '现在', '流', '行', '的', '观', '点', '相', '反', ',', '作者', '认为', '人的', '某', '种', '“', '洞', '察', '”', '是', '不能', '被', '算', '法', '模', '拟', '的', '。', '也许', '作者', '想', '说', ',', '人的', '灵', '

In [28]:
def encode(text: str):
    """Return the SentencePiece ids for ``text``.

    Uses the notebook-level ``sp`` processor, which must already be loaded.
    """
    token_ids = sp.EncodeAsIds(text)
    return token_ids

# encode(sample_review)

# TextCNN

![structure](https://img-blog.csdnimg.cn/20190326141457137.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2FzaWFsZWVfYmlyZA==,size_16,color_FFFFFF,t_70)

In [53]:
class TextCNN(keras.models.Model):
    """TextCNN classifier.

    Pipeline: token embedding -> one Conv1D branch per region size ->
    max-over-time pooling of every branch -> concatenation -> dense layer ->
    dropout -> softmax output.

    Args:
        region_sizes: Kernel widths of the parallel convolution branches.
            Defaults to ``[2, 3, 4]`` when omitted or empty.
        filter_size: Number of output filters of each convolution branch
            (it is passed as ``filters=``; it is not a kernel width).
        emb_size: Dimension of the token embeddings.
        vocab_size: Size of the embedding table.
        num_classes: Number of units of the softmax output layer.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        **kwargs: Forwarded to ``keras.models.Model``.
    """

    def __init__(self, region_sizes=None, filter_size=2, emb_size=20, vocab_size=5000,
                 num_classes=2, dropout_rate=0.3, **kwargs):
        # Initialise the Keras base class before assigning any attribute, and
        # do not pass ``self`` again as a positional argument.
        super().__init__(**kwargs)
        self.region_sizes = region_sizes or [2, 3, 4]
        self.filter_size = filter_size
        self.emb_size = emb_size
        self.vocab_size = vocab_size

        # All sub-layers are created once here so that ``call`` only applies
        # them instead of instantiating new layers on every forward pass.
        self.emb = keras.layers.Embedding(self.vocab_size, self.emb_size)
        self.conv_layers = [keras.layers.Conv1D(filters=self.filter_size,
                                                kernel_size=k,
                                                activation='relu',
                                                padding='same') for k in self.region_sizes]
        # Global pooling takes the max over the whole time axis, so it does not
        # depend on a sentence length fixed at build time.
        self.maxpool_layers = [keras.layers.GlobalMaxPool1D() for _ in self.region_sizes]
        self.fc_layers_1 = keras.layers.Dense(128, activation='relu')
        self.dropout = keras.layers.Dropout(rate=dropout_rate)
        self.fc_output = keras.layers.Dense(num_classes, activation='softmax')

    def call(self, input, training=None):
        """Run the forward pass.

        Args:
            input: Integer token ids of shape ``[batch_size, sent_length]``.
            training: Forwarded to the dropout layer.

        Returns:
            Softmax probabilities of shape ``[batch_size, num_classes]``.
        """
        embeded = self.emb(input)  # [batch_size, sent_length, emb_size]
        conv_output = [c(embeded) for c in self.conv_layers]  # each [batch_size, sent_length, filter_size]
        maxpool_output = [p(o) for p, o in zip(self.maxpool_layers, conv_output)]  # each [batch_size, filter_size]

        # Join the branches along the feature axis. The pooled tensors are
        # already 2-D, so no squeeze is needed and a batch of size 1 keeps its
        # batch dimension. tf.concat also accepts a single branch.
        output = tf.concat(maxpool_output, axis=-1)  # [batch_size, filter_size * len(region_sizes)]

        output = self.fc_layers_1(output)
        output = self.dropout(output, training=training)
        output = self.fc_output(output)
        return output

In [55]:
# Smoke test: build a model with default settings and run it on a random batch.
text_cnn = TextCNN()

# 20 rows of 50 values drawn from a standard normal, limited to [0, 10];
# they are truncated to integers only when fed to the model as token ids.
sample_data = np.clip(np.random.randn(20, 50), 0, 10)
text_cnn(sample_data.astype(int))

<tf.Tensor: shape=(20, 2), dtype=float32, numpy=
array([[0.500201  , 0.49979898],
       [0.499978  , 0.500022  ],
       [0.500201  , 0.49979898],
       [0.5020413 , 0.4979587 ],
       [0.500201  , 0.49979898],
       [0.500201  , 0.49979898],
       [0.5010025 , 0.4989975 ],
       [0.5016661 , 0.49833384],
       [0.50132185, 0.49867812],
       [0.5015479 , 0.49845216],
       [0.50132185, 0.49867812],
       [0.50132185, 0.49867812],
       [0.500201  , 0.49979898],
       [0.50132185, 0.49867812],
       [0.5001629 , 0.4998371 ],
       [0.49998584, 0.5000141 ],
       [0.5010025 , 0.4989975 ],
       [0.50132185, 0.49867812],
       [0.500201  , 0.49979898],
       [0.50132185, 0.49867812]], dtype=float32)>