## TextCNN 评论分类

![textcnn](./img/textcnn.png)

- **Embedding**：第一层是图中最左边的7乘5的句子矩阵，每行是词向量，维度=5，这个可以类比为图像中的原始像素点。
- **Convolution**：然后经过 kernel_sizes=(2,3,4) 的一维卷积层，每个 kernel_size 有两个输出 channel。这里的 2、3、4 分别表示卷积核在纵向（句子长度方向）上的大小，即每次卷积覆盖的词数；卷积核的宽度始终等于词向量维度。
- **MaxPooling**：第三层是一个 1-max pooling 层，这样不同长度句子经过 pooling 层之后都能变成定长的表示。
- **FullConnection and Softmax**：最后接一层全连接的 softmax 层，输出每个类别的概率。

In [6]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

简单测试例子

In [109]:
from keras.models import Sequential

# Toy corpus: ten short review phrases used to sanity-check an Embedding layer.
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']

# Class labels aligned with `docs` (1 for the praising phrases, 0 for the
# critical ones). Kept as a NumPy array rather than a plain Python list so
# that `fit`/`evaluate` receive an array-like target on every Keras version.
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

# Size of the hashing space for one_hot and the padded sequence length.
vocab_size = 50
max_length = 4

# Integer-encode the documents; one_hot hashes words into the vocab_size
# space, so different words may share an index.
encoded_docs = [text.one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

# Pad every document with trailing zeros up to max_length tokens.
padded_docs = sequence.pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

# Define the model: 8-dimensional embeddings, flattened into one sigmoid unit.
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# Compile the model.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Summarize the model. summary() prints the table itself and returns None,
# which is why a trailing "None" appears in the output.
print(model.summary())

# Fit the model on the toy data.
model.fit(padded_docs, labels, epochs=50, verbose=0)

# Evaluate on the same data (this is a smoke test, not a real validation).
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

[[33, 12], [21, 2], [2, 15], [20, 2], [2], [34], [15, 15], [46, 21], [15, 2], [27, 15, 12, 23]]
[[33 12  0  0]
 [21  2  0  0]
 [ 2 15  0  0]
 [20  2  0  0]
 [ 2  0  0  0]
 [34  0  0  0]
 [15 15  0  0]
 [46 21  0  0]
 [15  2  0  0]
 [27 15 12 23]]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_4 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 80.000001


In [7]:
# Path to the pretrained word-vector file (read later as a text file with one
# space-separated token + vector per line).
EMBEDDING_FILE = 'E:/MYGIT/model/crawl-300d-2M.vec'

# Toxic-comment challenge CSVs: training set, test set and the sample
# submission file.
train = pd.read_csv('E:/MYGIT/DataSources/jigsaw-toxic-comment-classification-challenge/train.csv')
test = pd.read_csv('E:/MYGIT/DataSources/jigsaw-toxic-comment-classification-challenge/test.csv')
submission = pd.read_csv('E:/MYGIT/DataSources/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')

In [8]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
# Raw comment strings; missing comments are replaced by the literal "fillna".
X_train = train["comment_text"].fillna("fillna").values
# Six binary target columns -> multi-label target array.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values
# Keep a reference to the raw training text; X_train is later overwritten
# with token-id sequences.
X_train_ = X_train

In [10]:
# Limit the amount of data: this machine cannot handle the full dataset.
X_train = X_train[:10000]
X_test = X_test[:10000]
y_train = y_train[:10000]

In [11]:
max_features = 10000  # vocabulary cap, passed to the Tokenizer as num_words
maxlen = 200          # length every sequence is padded / truncated to
embed_size = 300      # width of one word vector
# Turn the corpus into integer sequences, keeping only the `max_features`
# most frequent words as the vocabulary. Per the original note, the Tokenizer
# also splits words and filters punctuation, so it only suits English text.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# NOTE: from here on X_train / X_test (upper-case X) hold lists of token ids.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad so that all sequences have the same length: shorter ones are filled
# with leading zeros, longer ones keep only their last `maxlen` (200) tokens.
# The padded arrays are stored under lower-case x_train / x_test.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [6]:
print(X_train[0][:10])
print(X_train_[0])
print(tokenizer.word_index['explanation'])
print('------------')
print(len(X_train[10]))

[733, 78, 1, 140, 131, 182, 30, 712, 4438, 10284]
Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
733
------------
503


In [7]:
print(x_train[10], len(x_train[10]))
print('------------')
print(X_train[10][-200:])
print(X_train[10][-200:] == x_train[10])

[   1  272 2267    8  271   31    1 4708 2803   84   92  272   56   66
   16 3532   17  101   17  310    1  120   50  152    4  823  272 3657
  277   22    1  509  181   19   47  237   22    6  333  651    1  486
 3067   25  826   84    1  277   39   16  133    2  960   11  182    1
 2340   22    6  220    1  390 1359    1  434   33   28  302   82   82
    4  277  111   17   25   47    3    1   61  770  440   33   28  123
  272  770  302   82   63   28  123  272  770   12    1  510  161    3
  272  770    9    6   39   82   22    6   19  744   61 1466  411 1464
    9    6   19 2146   92  120    5  917   88  155    6   39  136    4
  161    3 1466    6   19  744   32  451   13  169 1259    5 4872  344
   87   16  146   47  766  151   51   19   57  917   17  957   15  434
   12  293  119   22    1  123    8 1500  182    4  281  200 1157  497
   28  302   82   84    1  123   45   16  146 2109  867  151   22    6
   19   55  254   50  219   88   33    1  390  272  254   29  134    6
   46 

In [9]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` for one parsed embedding line.

    ``word`` is the token; the remaining positional arguments are its
    coefficients (numbers or numeric strings), converted to a float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Read the fastText word vectors into a {token: float32 vector} dict.
# Each line is split on single spaces: first field is the token, the rest are
# its coefficients. The context manager closes the file once parsing is done
# (the bare open() used before left the handle open).
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in embedding_file)

In [11]:
# Build the embedding matrix for the training vocabulary: row i holds the
# fastText vector of the word whose tokenizer index is i.
word_index = tokenizer.word_index
# Tokenizer indices start at 1 (index 0 is reserved), so a vocabulary of N
# words needs N + 1 rows. Without the "+ 1" the last word would index past
# the end of the matrix whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Words ranked beyond the vocabulary cap have no row in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words missing from the pretrained vectors keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [22]:
type(embedding_matrix)

numpy.ndarray

In [24]:
np.save('embedding_matrix', embedding_matrix)

In [14]:
embedding_matrix = np.load('./np/embedding_matrix.npy')

In [12]:
del embeddings_index

In [13]:
#gc回收
import gc
unreachable_count = gc.collect()

In [15]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair. It is unpacked in
            ``__init__``, so the empty-tuple default must be overridden.
        interval: the score is computed on epochs where
            ``epoch % interval == 0`` (``epoch`` is the 0-based index passed
            to ``on_epoch_end``).
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself. The previous
        # `super(Callback, self).__init__()` started the lookup *after*
        # Callback and therefore skipped Callback.__init__.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation data and print its ROC-AUC score."""
        # `logs` is not used here; None replaces the shared mutable `{}`
        # default argument.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Reported epoch number is 1-based.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [88]:
Embedding??

In [18]:
embedding_matrix  = embedding_matrix[:10000,:]

In [19]:
# max_features = 100000
# maxlen = 200
# embed_size = 300

filter_sizes = [1,2,3,5]
num_filters = 32

def get_model():
    """Build and compile the TextCNN classifier.

    Reads the module-level settings ``maxlen``, ``max_features``,
    ``embed_size``, ``embedding_matrix``, ``filter_sizes`` and
    ``num_filters``. One convolution + max-pooling branch is created for
    every entry of ``filter_sizes``, so the list may be changed without
    editing this function.

    Returns:
        A compiled Keras ``Model`` mapping a ``(maxlen,)`` sequence of token
        ids to 6 sigmoid outputs (binary cross-entropy loss, Adam optimizer).
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    # Add a trailing channel axis so the tensor has the Conv2D input format.
    x = Reshape((maxlen, embed_size, 1))(x)

    # Every kernel spans the full embedding width, so it only slides along
    # the token axis; per the original note this makes Conv2D equivalent to
    # a Conv1D over the sequence.
    convs = [
        Conv2D(num_filters, kernel_size=(size, embed_size),
               kernel_initializer='normal', activation='elu')(x)
        for size in filter_sizes
    ]

    # The pooling window covers all (maxlen - size + 1) positions of its
    # branch, leaving a single maximum per filter.
    pools = [
        MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv)
        for size, conv in zip(filter_sizes, convs)
    ]

    z = Concatenate(axis=1)(pools)
    z = Flatten()(z)
    z = Dropout(0.1)(z)

    outp = Dense(6, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [20]:
# Build a fresh TextCNN model.
model = get_model()

batch_size = 256
epochs = 3

# Hold out 5% of the padded training data for validation (fixed seed).
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)
# Print the validation ROC-AUC after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc])

#y_pred = model.predict(x_test, batch_size=1024)
#submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
#submission.to_csv('submission.csv', index=False)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 9500 samples, validate on 500 samples
Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.742541 

Epoch 2/3

 ROC-AUC - epoch: 2 - score: 0.807495 

Epoch 3/3

 ROC-AUC - epoch: 3 - score: 0.860175 



In [10]:
from keras.utils.vis_utils import plot_model

In [14]:
plot_model(model, to_file="./img/text_cnn_model.png",show_shapes=True)

![](./img/text_cnn_model.png)