In [1]:
MAX_SEQUENCE_LENGTH = 200 # 每条新闻最大长度
EMBEDDING_DIM = 300 # 词向量空间维度
VALIDATION_SPLIT = 0.2 # 验证集占训练集比例
#TEST_SPLIT = 0.3 # 测试集占全数据集比例

In [2]:
#import os
import pandas as pd
import numpy as np
#import inspect
from tqdm import trange
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
from sklearn.preprocessing import LabelEncoder as LE
from keras.models import load_model
from sklearn.model_selection import train_test_split as tts
from time import time

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
output=pd.read_csv('./output.csv')#已经在BOW_SVM中预处理好的文本数据

X=output['doc'].tolist()
y=output['label'].tolist()

le = LE()
y=le.fit_transform(y)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(y))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

x_train, x_test, y_train, y_test = tts(data, labels, test_size=0.3, random_state=42)
print ('train docs: '+str(len(x_train)))
print ('test docs: '+str(len(x_test)))

Found 106298 unique tokens.
Shape of data tensor: (18847, 200)
Shape of label tensor: (18847, 20)
train docs: 13192
test docs: 5655


In [22]:
len(word_index)

106298

# 载入预训练词向量

In [4]:
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, BatchNormalization, Activation, Concatenate, Add, GlobalMaxPooling1D
from keras.models import Sequential, Model
from keras.utils import plot_model
import gensim

#w2v_model = gensim.models.Word2Vec.load('./en_1000_no_stem/en.model')
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)



# 开始训练

In [7]:
start=time()
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items(): 
    if word in w2v_model:
        embedding_matrix[i] = np.asarray(w2v_model[word],
                                         dtype='float32')
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)#trainable的意义是什么？

model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.2))
model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
#model.add(GlobalMaxPooling1D())
model.add(MaxPooling1D(3))
model.add(Flatten())
model.add(Dense(EMBEDDING_DIM, activation='relu'))
model.add(Dense(labels.shape[1], activation='softmax'))
model.summary()
#plot_model(model, to_file='model.png',show_shapes=True)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.fit(x_train, y_train, validation_split=VALIDATION_SPLIT, epochs=5, batch_size=128)
loss,accuracy = model.evaluate(x_test,y_test, verbose=0)
end=time()
print('Test score:', loss)
print('Test accuracy:', accuracy)
print("training and test time:{:.2f}s".format(end-start))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          31889700  
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 300)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 198, 250)          225250    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 66, 250)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 16500)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               4950300   
_________________________________________________________________
dense_2 (Dense)              (None, 20)                6020      
Total para

Test score: 0.6508339991198505
Test accuracy: 0.8272325375879054
training and test time:985.90s


# 使用merge方法

In [8]:
from keras.layers import GlobalMaxPooling1D,GlobalMaxPooling2D

In [9]:
start=time()
convs = []
 
filter_sizes = [2,3,4,5]

input_img = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedded_sequences = embedding_layer(input_img)
for x in filter_sizes:
    
    l_conv = Conv1D(nb_filter=250,filter_length=x,activation='relu')(embedded_sequences)
    #l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)
 
    l_pool = MaxPooling1D(5)(l_conv)
 
    convs.append(l_pool)
    
l_merge = Concatenate()(convs) 

l_flat = Flatten()(l_merge)

l_dense = Dense(EMBEDDING_DIM, activation='relu')(l_flat)
 
preds = Dense(labels.shape[1], activation='softmax')(l_dense)

merge_model = Model(input_img, preds)

merge_model.summary() 

merge_model.compile(loss='categorical_crossentropy',
 
              optimizer='adam',
 
              metrics=['acc'])

merge_model.fit(x_train, y_train, validation_split=VALIDATION_SPLIT, epochs=5, batch_size=128)

loss,accuracy = merge_model.evaluate(x_test,y_test, verbose=0)

end=time()

print('Test score:', loss)
print('Test accuracy:', accuracy)
print("training and test time:{:.2f}s".format(end-start))

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     31889700    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 199, 250)     150250      embedding_1[1][0]                
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 198, 250)     225250      embedding_1[1][0]                
__________________________________________________________________________________________________
conv1d_4 (

Epoch 5/5
Test score: 0.6086685462613531
Test accuracy: 0.8608311228263072
training and test time:2282.86s


# 敏感性分析

In [5]:
def sen_test(test_slt,MAX_SEQUENCE_LENGTH):
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    x_train, x_test, y_train, y_test = tts(data, labels, test_size=test_slt, random_state=42)
    print ('train docs: '+str(len(x_train)))
    print ('test docs: '+str(len(x_test)))
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items(): 
        if word in w2v_model:
            embedding_matrix[i] = np.asarray(w2v_model[word],
                                             dtype='float32')
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    model = Sequential()
    model.add(embedding_layer)
    model.add(Dropout(0.2))
    model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
    #model.add(GlobalMaxPooling1D())
    model.add(MaxPooling1D(3))
    model.add(Flatten())
    model.add(Dense(EMBEDDING_DIM, activation='relu'))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()
    #plot_model(model, to_file='model.png',show_shapes=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.fit(x_train, y_train, validation_split=VALIDATION_SPLIT, epochs=5, batch_size=128)
    loss,accuracy = model.evaluate(x_test,y_test, verbose=0)
    #print('Test score:', loss)
    print('test size is:',test_slt)
    print('max seq length is:',MAX_SEQUENCE_LENGTH)
    print('Test accuracy:', accuracy)
    print("\n"*2)

In [None]:
test_split_list=[0.1,0.2,0.3]
max_len_list=[100,150,200,250,300]
start=time()
for ts in test_split_list:
    for msl in max_len_list:
        sen_test(ts,msl)
end=time()
print("sensitive test time:{:.2f}s".format(end-start))

train docs: 16962
test docs: 1885
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 300)          31889700  
_________________________________________________________________
dropout_4 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 98, 250)           225250    
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 32, 250)           0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 300)               2400300   
_________________________________________________________________
dense_10 (Dense)             (None, 20)   

Epoch 4/5
Epoch 5/5


test size is: 0.1
max seq length is: 100
Test accuracy: 0.8435013260386034



train docs: 16962
test docs: 1885
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 150, 300)          31889700  
_________________________________________________________________
dropout_5 (Dropout)          (None, 150, 300)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 148, 250)          225250    
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 49, 250)           0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 12250)             0         
_________________________________________________________________
dense_11 (Dense)             (None, 300)               3675300   
______________________________

Epoch 3/5
Epoch 4/5


Epoch 5/5
test size is: 0.1
max seq length is: 150
Test accuracy: 0.8535809022678305



train docs: 16962
test docs: 1885
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 200, 300)          31889700  
_________________________________________________________________
dropout_6 (Dropout)          (None, 200, 300)          0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 198, 250)          225250    
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 66, 250)           0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 16500)             0         
_________________________________________________________________
dense_13 (Dense)             (None, 300)               4950300   
____________________

Epoch 2/5
Epoch 3/5


Epoch 4/5
Epoch 5/5


test size is: 0.1
max seq length is: 200
Test accuracy: 0.85517241420417



train docs: 16962
test docs: 1885
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 250, 300)          31889700  
_________________________________________________________________
dropout_7 (Dropout)          (None, 250, 300)          0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 248, 250)          225250    
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 82, 250)           0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 20500)             0         
_________________________________________________________________
dense_15 (Dense)             (None, 300)               6150300   
________________________________

Epoch 3/5
Epoch 4/5


Epoch 5/5
test size is: 0.1
max seq length is: 250
Test accuracy: 0.8546419095929801



train docs: 16962
test docs: 1885
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 300, 300)          31889700  
_________________________________________________________________
dropout_8 (Dropout)          (None, 300, 300)          0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 298, 250)          225250    
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 99, 250)           0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 24750)             0         
_________________________________________________________________
dense_17 (Dense)             (None, 300)               7425300   
____________________

Epoch 2/5
Epoch 3/5


Epoch 4/5
Epoch 5/5


test size is: 0.1
max seq length is: 300
Test accuracy: 0.8567639255080994



train docs: 15077
test docs: 3770
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 300)          31889700  
_________________________________________________________________
dropout_9 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 98, 250)           225250    
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 32, 250)           0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 8000)              0         
_________________________________________________________________
dense_19 (Dense)             (None, 300)               2400300   
______________________________

Epoch 4/5
Epoch 5/5
test size is: 0.2
max seq length is: 100
Test accuracy: 0.8233421751611745



train docs: 15077
test docs: 3770
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 150, 300)          31889700  
_________________________________________________________________
dropout_10 (Dropout)         (None, 150, 300)          0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 148, 250)          225250    
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 49, 250)           0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 12250)             0         
_________________________________________________________________
dense_21 (Dense)             (None, 300)               3675300   
__________

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


test size is: 0.2
max seq length is: 150
Test accuracy: 0.840053050271396



train docs: 15077
test docs: 3770
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 200, 300)          31889700  
_________________________________________________________________
dropout_11 (Dropout)         (None, 200, 300)          0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 198, 250)          225250    
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, 66, 250)           0         
_________________________________________________________________
flatten_12 (Flatten)         (None, 16500)             0         
_________________________________________________________________
dense_23 (Dense)             (None, 300)               4950300   
_______________________________

Epoch 4/5
Epoch 5/5
test size is: 0.2
max seq length is: 200
Test accuracy: 0.8448275860804145



train docs: 15077
test docs: 3770
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 250, 300)          31889700  
_________________________________________________________________
dropout_12 (Dropout)         (None, 250, 300)          0         
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 248, 250)          225250    
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 82, 250)           0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 20500)             0         
_________________________________________________________________
dense_25 (Dense)             (None, 300)               6150300   
__________

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


test size is: 0.2
max seq length is: 250
Test accuracy: 0.8416445622077355



train docs: 15077
test docs: 3770
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 300, 300)          31889700  
_________________________________________________________________
dropout_13 (Dropout)         (None, 300, 300)          0         
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 298, 250)          225250    
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, 99, 250)           0         
_________________________________________________________________
flatten_14 (Flatten)         (None, 24750)             0         
_________________________________________________________________
dense_27 (Dense)             (None, 300)               7425300   
______________________________

Epoch 4/5
Epoch 5/5
test size is: 0.2
max seq length is: 300
Test accuracy: 0.8557029176770218



train docs: 13192
test docs: 5655
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 100, 300)          31889700  
_________________________________________________________________
dropout_14 (Dropout)         (None, 100, 300)          0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 98, 250)           225250    
_________________________________________________________________
max_pooling1d_18 (MaxPooling (None, 32, 250)           0         
_________________________________________________________________
flatten_15 (Flatten)         (None, 8000)              0         
_________________________________________________________________
dense_29 (Dense)             (None, 300)               2400300   
__________

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




因为上一次运行到中间出现内存不足(已运行约3小时)，我们从断点(0.3,100)继续：

In [6]:
start=time()
max_len_list=[100,150,200,250,300]
for msl in max_len_list:
    sen_test(0.3,msl)
    
end=time()
print("sensitive test time:{:.2f}s".format(end-start))

train docs: 13192
test docs: 5655
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          31889700  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 250)           225250    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 32, 250)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               2400300   
_________________________________________________________________
dense_2 (Dense)              (None, 20)   

Epoch 4/5
Epoch 5/5


test size is: 0.3
max seq length is: 100
Test accuracy: 0.8152077807355623



train docs: 13192
test docs: 5655
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 300)          31889700  
_________________________________________________________________
dropout_2 (Dropout)          (None, 150, 300)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 148, 250)          225250    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 49, 250)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 12250)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 300)               3675300   
______________________________

Epoch 3/5
Epoch 4/5


Epoch 5/5
test size is: 0.3
max seq length is: 150
Test accuracy: 0.8344827585890692



train docs: 13192
test docs: 5655
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 300)          31889700  
_________________________________________________________________
dropout_3 (Dropout)          (None, 200, 300)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 198, 250)          225250    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 66, 250)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 16500)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 300)               4950300   
____________________

Epoch 2/5
Epoch 3/5


Epoch 4/5
Epoch 5/5


test size is: 0.3
max seq length is: 200
Test accuracy: 0.830415561376263



train docs: 13192
test docs: 5655
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 250, 300)          31889700  
_________________________________________________________________
dropout_4 (Dropout)          (None, 250, 300)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 248, 250)          225250    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 82, 250)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 20500)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 300)               6150300   
_______________________________

Epoch 3/5
Epoch 4/5


Epoch 5/5
test size is: 0.3
max seq length is: 250
Test accuracy: 0.8397877982925462



train docs: 13192
test docs: 5655
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 300, 300)          31889700  
_________________________________________________________________
dropout_5 (Dropout)          (None, 300, 300)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 298, 250)          225250    
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 99, 250)           0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 24750)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 300)               7425300   
____________________

Epoch 2/5
Epoch 3/5


Epoch 4/5
Epoch 5/5


test size is: 0.3
max seq length is: 300
Test accuracy: 0.8311229000146362



sensitive test time:4843.24s
