In [1]:
import os

#### 1 加载数据

In [2]:
def _read_lines(path):
    """Read the UTF-8 text file at `path` and return its content split on newlines.

    The file is opened inside a `with` block so the handle is closed as soon
    as the content has been read, instead of staying open until garbage
    collection.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read().split('\n')


# One entry per line: texts and labels are parallel lists.
train_texts  = _read_lines('train_contents.txt')
train_labels = _read_lines('train_labels.txt')
test_texts   = _read_lines("test_contents.txt")
test_labels  = _read_lines("test_labels.txt")

# The lists are concatenated below, so a count mismatch in either pair would
# silently shift every following label onto the wrong text. Fail early instead.
if len(train_texts) != len(train_labels):
    raise ValueError("train set mismatch: %d texts vs %d labels"
                     % (len(train_texts), len(train_labels)))
if len(test_texts) != len(test_labels):
    raise ValueError("test set mismatch: %d texts vs %d labels"
                     % (len(test_texts), len(test_labels)))

# Train and test files are merged into one corpus; the notebook re-splits it later.
all_texts  = train_texts + test_texts
all_labels = train_labels + test_labels

In [3]:
# Fractions of the corpus held out for testing and validation; whatever is
# left (1 - VALIDATION_SPLIT - TEST_SPLIT) becomes the training set.
TEST_SPLIT = 0.2
VALIDATION_SPLIT = 0.16

# Model geometry: number of token ids kept per text, and the width of each
# word-vector row in the embedding matrix.
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 200

#### 2 进行序列转换

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
# Fit the tokenizer on the combined train+test corpus, then encode every text
# with the resulting vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)

# word_index maps each token to its integer id; it is reused later to build
# the embedding matrix.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(all_texts)

print('Found %s unique tokens.' % len(word_index))

Found 65604 unique tokens.


In [6]:
# Turn the label list into a one-hot matrix (one column per class).
label_ids = np.asarray(all_labels)
labels = to_categorical(label_ids)

# Bring every encoded text to exactly MAX_SEQUENCE_LENGTH token ids.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print ("shape of data is :", data.shape)
print ("shape of label is :", labels.shape)

shape of data is : (21924, 100)
shape of label is : (21924, 12)


#### 3 随机打乱并切分数据

In [7]:
# Shuffle texts and labels with one shared permutation so every row keeps its
# own label. A dedicated, seeded generator makes the train/val/test membership
# identical after every "Restart Kernel -> Run All"; the original unseeded
# shuffle produced a different split (and different metrics) on each run.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# Both arrays are indexed with the same permutation, so they must have the
# same number of rows.
if len(data) != len(labels):
    raise ValueError("data/labels row mismatch: %d vs %d" % (len(data), len(labels)))

indices = split_rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

# Cut points on the shuffled arrays:
#   [0, p1)   -> training
#   [p1, p2)  -> validation
#   [p2, end) -> test
p1 = int(len(data)*(1-VALIDATION_SPLIT-TEST_SPLIT))
p2 = int(len(data)*(1-TEST_SPLIT))

x_train, y_train = data[:p1], labels[:p1]
x_val, y_val = data[p1:p2], labels[p1:p2]
x_test, y_test = data[p2:], labels[p2:]

#### 4 获取Word2Vec

In [14]:
import gensim

# Load the pretrained vectors stored in binary word2vec format.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('vectors.bin', binary=True)

# One row per token id, plus one extra row (ids from word_index are used
# directly as row numbers). Rows of tokens missing from the pretrained model
# stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

# Coverage counters: how many vocabulary tokens were / were not found.
in_model = 0
not_in_model = 0
for word, i in word_index.items():
    key = str(word)
    if key not in w2v_model:
        not_in_model += 1
        continue
    in_model += 1
    embedding_matrix[i] = np.asarray(w2v_model[key], dtype='float32')

In [15]:
from keras.layers import Embedding

# Embedding layer initialised from the pretrained matrix and frozen
# (trainable=False), so the word vectors are not updated during training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

#### 5 搭建CNN网络

In [16]:
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalMaxPooling1D
from keras.models import Sequential

In [17]:
# Text CNN: frozen embeddings -> dropout -> one 1-D convolution -> max pooling
# -> flatten -> dense hidden layer -> softmax over the label classes.
model = Sequential([
    embedding_layer,
    Dropout(0.2),
    Conv1D(250, 3, padding='valid', activation='relu', strides=1),
    MaxPooling1D(3),
    Flatten(),
    # The hidden layer reuses EMBEDDING_DIM as its unit count.
    Dense(EMBEDDING_DIM, activation='relu'),
    # One output unit per column of the one-hot label matrix.
    Dense(labels.shape[1], activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 200)          13121000  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 200)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 250)           150250    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 32, 250)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               1600200   
_________________________________________________________________
dense_2 (Dense)              (None, 12)                2412      
Total para

In [18]:
# Multi-class setup: categorical cross-entropy on the one-hot labels, RMSprop
# optimiser, accuracy reported alongside the loss.
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

# Train for two epochs, checking the validation split after each one, then
# persist the trained model to disk.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=2,
    validation_data=(x_val, y_val),
)
model.save('cnn.h5')

Train on 14031 samples, validate on 3508 samples
Epoch 1/2
Epoch 2/2


In [19]:
# Score the held-out test split; with the compile settings above the result
# is [loss, accuracy].
test_metrics = model.evaluate(x_test, y_test)
print (test_metrics)

[0.5491907521827463, 0.8456100342075257]
