## RNN 文本分类

In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.layers.recurrent import SimpleRNN

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Pre-trained fastText word vectors, read further down to fill the embedding matrix.
# NOTE(review): all paths in this cell are absolute and machine-specific.
EMBEDDING_FILE = 'E:/MYGIT/model/crawl-300d-2M.vec'

# Load the three competition CSV files in one pass.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'E:/MYGIT/DataSources/jigsaw-toxic-comment-classification-challenge/train.csv',
        'E:/MYGIT/DataSources/jigsaw-toxic-comment-classification-challenge/test.csv',
        'E:/MYGIT/DataSources/jigsaw-toxic-comment-classification-challenge/sample_submission.csv',
    )
)

In [3]:
# Load the label file that accompanies the test set.
# NOTE(review): presumably some rows carry placeholder labels for unscored
# comments — confirm before computing metrics against y_test.
y_test = pd.read_csv('E:/MYGIT/DataSources/jigsaw-toxic-comment-classification-challenge/test_labels.csv')

In [4]:
# Keep only the six label columns (same names and order as y_train) as an array.
# NOTE: this rebinds y_test from a DataFrame to its values, so the cell cannot
# simply be re-run without reloading the CSV first.
y_test = y_test[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [37]:
# Keep the first 10000 label rows, the same cap that is applied to X_test below.
y_test = y_test[:10000]

In [5]:
# Raw comment strings for both splits; missing comments become the literal "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values
    for frame in (train, test)
)
# Six binary label columns of the training set.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
# Second reference to the raw training texts, taken before X_train is rebound later on.
X_train_ = X_train

In [7]:
# Number of raw comments in the training and test sets.
len(X_train), len(X_test)

(159571, 153164)

In [7]:
# Cap the amount of data: this machine is not powerful enough for the full set.
X_train, X_test, y_train = X_train[:10000], X_test[:10000], y_train[:10000]

In [8]:
# Settings shared by the embedding matrix and every model below.
max_features = 40000  # vocabulary cap handed to the tokenizer (num_words)
maxlen = 200          # token length every comment is padded / truncated to
embed_size = 300      # width of one word vector

# Turn the corpus into index sequences, keeping the `max_features` most frequent
# words as the dictionary. The tokenizer splits words and filters punctuation,
# which only suits English-like text. The vocabulary is fitted on train + test.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train, *X_test])

# NOTE: X_train / X_test are rebound from raw texts to index lists here, so this
# cell is not safe to re-run without re-creating the raw texts first.
X_train, X_test = (
    tokenizer.texts_to_sequences(docs) for docs in (X_train, X_test)
)

# Bring all sequences to the same length: short ones are zero-padded at the
# front, long ones keep their last `maxlen` tokens.
x_train, x_test = (
    sequence.pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test)
)

In [9]:
# Sanity check: padded inputs and labels must have the same number of rows.
# NOTE(review): the recorded output (159571) does not match the 10000-row cap
# applied above — the stored output looks stale.
len(x_train), len(y_train)

(159571, 159571)

In [15]:
import pickle
import os

In [16]:
# Best-effort snapshot of the prepared arrays so later sessions can skip the
# preprocessing. Any failure is reported and the notebook keeps running.
# NOTE: `embedding_matrix` is created in a later cell of this notebook; on a
# top-to-bottom run this cell reports a failure instead of saving.
data_root = './np'
pickle_file = os.path.join(data_root, 'toxic_classification.pickle')
try:
    # Build the payload BEFORE opening the file: if any of these names is
    # missing we must not leave an empty / truncated pickle on disk.
    save = {
        'x_train': x_train,
        'y_train': y_train,
        'x_test': x_test,
        'y_test': y_test,
        'embedding_matrix': embedding_matrix,
    }
    # open() does not create missing directories.
    os.makedirs(data_root, exist_ok=True)
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)

In [10]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    Args:
        word: The token; returned unchanged.
        *arr: The vector components (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Read the fastText word vectors into a {token: float32 vector} dictionary.
# The `with` block closes the file once the dictionary is built.
with open(EMBEDDING_FILE, encoding='utf-8') as vec_file:
    embeddings_index = dict(
        get_coefs(*fields)
        for fields in (line.rstrip().rsplit(' ') for line in vec_file)
        # A .vec file starts with a "<word count> <dimension>" header. Without
        # this filter it would be stored as a bogus one-element vector.
        if len(fields) > 2
    )

In [11]:
# Collect the fastText vector of every token in the tokenizer vocabulary.
# Row i holds the vector of the word whose tokenizer index is i; words without
# a pre-trained vector (and the padding row 0) stay all-zero.
word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is reserved), so N words occupy rows 0..N.
# The previous `len(word_index)` bound was one short and raised IndexError
# whenever the vocabulary was smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# The models build `Embedding(max_features, embed_size, weights=[embedding_matrix])`,
# so the matrix always needs exactly max_features rows.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [18]:
# Freeing the large embeddings dictionary is left disabled:
#del embeddings_index
# Run a garbage-collection pass and keep the value returned by gc.collect().
import gc
unreachable_count = gc.collect()

In [13]:
# Persist the embedding matrix (np.save appends the ".npy" extension).
# np.save does not create missing directories, so make sure ./np exists first.
os.makedirs('./np', exist_ok=True)
np.save('./np/embedding_matrix_rnn', embedding_matrix)

In [58]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair used for scoring. It is
            unpacked into two values, so the default empty tuple raises
            ``ValueError``; callers must pass a 2-tuple.
        interval: Score every ``interval``-th epoch, counted from the
            zero-based epoch index (so the first epoch is always scored).
    """

    def __init__(self, validation_data=(), interval=1):
        # Was `super(Callback, self).__init__()`, which starts the MRO lookup
        # *after* Callback and therefore never ran Callback.__init__.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Metrics dict supplied by Keras; not used here. Defaults to
                None instead of a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [50]:
# max_features, maxlen and embed_size are taken from the tokenizer cell above
# (an alternative setting of 100000 / 200 / 300 is not active).

filter_sizes = [1,2,3,5]  # not referenced by get_model below
HIDDNE_SIZE_1, HIDDNE_SIZE_2 = 256, 128


def get_model():
    """Build and compile a two-layer SimpleRNN classifier.

    Reads maxlen, max_features, embed_size, embedding_matrix and the
    HIDDNE_SIZE_* constants from notebook scope.

    Returns:
        A compiled Keras ``Model`` mapping a ``(maxlen,)`` index sequence to
        six sigmoid outputs, trained with binary cross-entropy and Adam.
    """
    # Embedding front end, handled the same way as in the TextCNN model.
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    embedded = SpatialDropout1D(0.2)(embedded)

    # The first recurrent layer returns the whole sequence so that the second
    # one can consume it; the second returns only its final state.
    seq_states = SimpleRNN(HIDDNE_SIZE_1, return_sequences=True, recurrent_dropout=0.2,
                           input_shape=(maxlen, embed_size))(embedded)
    final_state = SimpleRNN(HIDDNE_SIZE_1, return_sequences=False, recurrent_dropout=0.2,
                            input_shape=(maxlen, HIDDNE_SIZE_1))(seq_states)

    # Classification head: dropout -> dense ReLU -> one sigmoid per label.
    dropped = Dropout(0.2)(final_state)
    hidden = Dense(HIDDNE_SIZE_2, activation='relu')(dropped)
    probs = Dense(6, activation='sigmoid')(hidden)

    classifier = Model(inputs=tokens, outputs=probs)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [51]:
# Instantiate the SimpleRNN classifier defined in the previous cell.
model = get_model()

In [52]:
# Write a diagram of the model, including layer shapes, to a PNG file.
# NOTE(review): presumably the ./img directory must already exist — confirm.
from keras.utils.vis_utils import plot_model
plot_model(model, to_file="./img/text_rnn_model1.png",show_shapes=True)

![](./img/text_rnn_model1.png)

In [53]:
batch_size, epochs = 256, 3

# Hold out 10 % of the training rows for validation (fixed seed for repeatability).
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.9, random_state=233
)
# Prints the validation ROC-AUC after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    x=X_tra,
    y=y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
)

Train on 9000 samples, validate on 1000 samples
Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.751257 

Epoch 2/3

 ROC-AUC - epoch: 2 - score: 0.920605 

Epoch 3/3

 ROC-AUC - epoch: 3 - score: 0.925773 



## RNN+LSTM 文本分类

In [54]:
from keras.layers import LSTM

In [72]:
# max_features, maxlen and embed_size are taken from the tokenizer cell above
# (an alternative setting of 100000 / 200 / 300 is not active).

HIDDNE_SIZE_1, HIDDNE_SIZE_2 = 256, 128


def get_model_1():
    """Build and compile a two-layer LSTM classifier.

    Reads maxlen, max_features, embed_size, embedding_matrix and the
    HIDDNE_SIZE_* constants from notebook scope.

    Returns:
        A compiled Keras ``Model`` mapping a ``(maxlen,)`` index sequence to
        six sigmoid outputs, trained with binary cross-entropy and Adam.
    """
    # Embedding front end, handled the same way as in the TextCNN model.
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    embedded = SpatialDropout1D(0.2)(embedded)

    # Stacked LSTMs: the first emits every time step, the second only its
    # final state.
    seq_states = LSTM(HIDDNE_SIZE_1, return_sequences=True, recurrent_dropout=0.2,
                      input_shape=(maxlen, embed_size))(embedded)
    final_state = LSTM(HIDDNE_SIZE_1, return_sequences=False, recurrent_dropout=0.2,
                       input_shape=(maxlen, HIDDNE_SIZE_1))(seq_states)

    # Classification head: dropout -> dense ReLU -> one sigmoid per label.
    dropped = Dropout(0.2)(final_state)
    hidden = Dense(HIDDNE_SIZE_2, activation='relu')(dropped)
    probs = Dense(6, activation='sigmoid')(hidden)

    classifier = Model(inputs=tokens, outputs=probs)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [73]:
# Fresh LSTM classifier, trained with the same settings as the SimpleRNN run.
model = get_model_1()
batch_size, epochs = 256, 3

# Hold out 10 % of the training rows for validation (fixed seed for repeatability).
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.9, random_state=233
)
# Prints the validation ROC-AUC after every epoch.
RocAuc_1 = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    x=X_tra,
    y=y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc_1],
)

Train on 9000 samples, validate on 1000 samples
Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.924230 

Epoch 2/3

 ROC-AUC - epoch: 2 - score: 0.939407 

Epoch 3/3

 ROC-AUC - epoch: 3 - score: 0.962307 



In [74]:
# Write a diagram of the LSTM model, including layer shapes, to a PNG file.
plot_model(model, to_file="./img/text_rnn_lstm_model.png",show_shapes=True)

![](./img/text_rnn_lstm_model.png)

## RNN+GRU 文本分类

In [78]:
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D,concatenate

In [79]:
def get_model():
    """Build and compile a bidirectional-GRU classifier with pooled features.

    NOTE: this redefines ``get_model`` and replaces the SimpleRNN builder
    defined earlier in the notebook.

    Reads maxlen, max_features, embed_size and embedding_matrix from
    notebook scope.

    Returns:
        A compiled Keras ``Model`` mapping a ``(maxlen,)`` index sequence to
        six sigmoid outputs, trained with binary cross-entropy and Adam.
    """
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    embedded = SpatialDropout1D(0.2)(embedded)

    # Bidirectional GRU emitting every time step.
    states = Bidirectional(GRU(80, return_sequences=True))(embedded)

    # Summarise the sequence by average and max pooling over time, then
    # concatenate both summaries.
    pooled_avg = GlobalAveragePooling1D()(states)
    pooled_max = GlobalMaxPooling1D()(states)
    features = concatenate([pooled_avg, pooled_max])
    probs = Dense(6, activation="sigmoid")(features)

    classifier = Model(inputs=tokens, outputs=probs)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [80]:
# Fresh bidirectional-GRU classifier, trained with the same settings as before.
model = get_model()
batch_size, epochs = 256, 3

# Hold out 10 % of the training rows for validation (fixed seed for repeatability).
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.9, random_state=233
)
# Prints the validation ROC-AUC after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    x=X_tra,
    y=y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 9000 samples, validate on 1000 samples
Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.752879 

Epoch 2/3

 ROC-AUC - epoch: 2 - score: 0.849458 

Epoch 3/3

 ROC-AUC - epoch: 3 - score: 0.942275 



In [81]:
# Write a diagram of the bidirectional-GRU model, including layer shapes, to a PNG file.
plot_model(model, to_file="./img/text_rnn_gru1_model.png",show_shapes=True)

![](./img/text_rnn_gru1_model.png)

In [85]:
HIDDNE_SIZE_1, HIDDNE_SIZE_2 = 256, 128


def get_model():
    """Build and compile a two-layer (stacked) GRU classifier.

    NOTE: this redefines ``get_model`` again and replaces the
    bidirectional-GRU builder defined earlier in the notebook.

    Reads maxlen, max_features, embed_size, embedding_matrix and the
    HIDDNE_SIZE_* constants from notebook scope.

    Returns:
        A compiled Keras ``Model`` mapping a ``(maxlen,)`` index sequence to
        six sigmoid outputs, trained with binary cross-entropy and Adam.
    """
    # Embedding front end, handled the same way as in the TextCNN model.
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    embedded = SpatialDropout1D(0.2)(embedded)

    # Stacked GRUs: the first emits every time step, the second only its
    # final state.
    seq_states = GRU(HIDDNE_SIZE_1, return_sequences=True, recurrent_dropout=0.2,
                     input_shape=(maxlen, embed_size))(embedded)
    final_state = GRU(HIDDNE_SIZE_1, return_sequences=False, recurrent_dropout=0.2,
                      input_shape=(maxlen, HIDDNE_SIZE_1))(seq_states)

    # Classification head: dropout -> dense ReLU -> one sigmoid per label.
    dropped = Dropout(0.2)(final_state)
    hidden = Dense(HIDDNE_SIZE_2, activation='relu')(dropped)
    probs = Dense(6, activation='sigmoid')(hidden)

    classifier = Model(inputs=tokens, outputs=probs)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [86]:
# Fresh stacked-GRU classifier, trained with the same settings as before.
model = get_model()
batch_size, epochs = 256, 3

# Hold out 10 % of the training rows for validation (fixed seed for repeatability).
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.9, random_state=233
)
# Prints the validation ROC-AUC after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    x=X_tra,
    y=y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
)

Train on 9000 samples, validate on 1000 samples
Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.933086 

Epoch 2/3

 ROC-AUC - epoch: 2 - score: 0.967686 

Epoch 3/3

 ROC-AUC - epoch: 3 - score: 0.975563 



In [87]:
# Write a diagram of the stacked-GRU model, including layer shapes, to a PNG file.
plot_model(model, to_file="./img/text_rnn_gru2_model.png",show_shapes=True)

![](./img/text_rnn_gru2_model.png)