In [1]:
import pickle
import keras
from keras.models import Sequential, Model
from keras.layers.core import Dense, Dropout, Activation, Flatten, Lambda
from keras.layers.embeddings import Embedding
from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D, Input, Dense, Reshape, LSTM
from keras.layers.merge import concatenate, add
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras import backend as K
import tensorflow as tf

Using TensorFlow backend.


# Loading the preprocessed data

In [2]:
# Load the preprocessed arrays and lookup tables from the pickle_data/ folder.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by a trusted preprocessing run.
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in binary mode inside a `with` block so the handle is
    closed as soon as loading finishes, even if unpickling raises.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Padded train / test corpora (their shapes are printed in the next cell).
train_copus_padded = _load_pickle('pickle_data/train_copus_pad.pickle')
test_copus_padded = _load_pickle('pickle_data/test_copus_pad.pickle')

# Training vocabulary; its length is used as the vocabulary size later on.
vocab_to_int_train = _load_pickle('pickle_data/vocab_train.pickle')

# Pre-trained embedding weights (this file name has no .pickle suffix).
embedding_matrix = _load_pickle('pickle_data/embedding_matrix')

# Labels for the train and test corpora.
train_label = _load_pickle('pickle_data/train_label.pickle')
test_label = _load_pickle('pickle_data/test_label.pickle')

In [3]:
# Derive the sizes the model cell needs from the loaded data.
# vocab_size: number of entries in the training vocabulary.
vocab_size = len(vocab_to_int_train)
# Axis 1 and axis 2 of the padded training corpus; the model cell uses them as
# the per-document sentence count and the per-sentence word count.
maxlen_sent, maxlen_word = train_copus_padded.shape[1], train_copus_padded.shape[2]

# Report everything in one place.
print('train test data shape:', train_copus_padded.shape, test_copus_padded.shape)
print('embedding_matrix shape:', embedding_matrix.shape)
print('vocabulary size:', vocab_size)
print('max sent length:', maxlen_sent, '\nmax word length:', maxlen_word)

train test data shape: (25000, 36, 224) (25000, 36, 224)
embedding_matrix shape: (100327, 300)
vocabulary size: 100327
max sent length: 36 
max word length: 224


In [None]:
# #*******start embedding the train_data

# #one by one sentence of each review as input
# #each_review is 36*224
# train_copus_padded = train_copus_padded[:5]
# embedding_layer = Embedding(vocab_size,dimension,weights = [embedding_matrix],input_length = maxlen,trainable = False)
# for review in train_copus_padded:
#     for sent in review:
#         sent = tf.convert_to_tensor(sent)
#         sent = tf.reshape(sent,[1,224])
#         embeded_sequence = embedding_layer(sent)
#         # merge each vector
#         embeded_sequences = tf.concat(embeded_sequence,0)

# print(embeded_sequences)

# Building the Keras model

In [5]:
# Document-level input: each review is a (maxlen_sent, maxlen_word) grid of
# integer word ids.
doc_input_padded = Input(shape=(maxlen_sent, maxlen_word), dtype='int32', name='doc_input_padded')

# Frozen embedding lookup initialised from embedding_matrix;
# embedding_matrix.shape[1] is the dimension of each word vector.
embedding_layer = Embedding(vocab_size, embedding_matrix.shape[1], input_length=maxlen_word,
                            weights=[embedding_matrix], trainable=False)
max_pooling_layer = GlobalMaxPooling1D()


def slice(x, index):
    """Return position `index` along axis 1 of `x`, i.e. x[:, index, :].

    Used through a Lambda layer to pick one sentence out of every document.
    NOTE(review): this name shadows Python's builtin `slice` for the rest of
    the notebook; it is kept unchanged so existing references still resolve.
    """
    return x[:, index, :]


# A single convolution layer instance is created once and reused for every
# sentence, so all sentences share the same filters (the alternative would be
# a custom layer, or building a new layer per sentence).
# Keras 2 argument names are used here, matching the rest of this cell.
cnn_layer = Convolution1D(filters=100,
                          kernel_size=3,
                          padding='same',
                          activation='tanh',
                          strides=1)
# Stacks the list of per-sentence feature vectors along a new axis 1.
stack_layer = Lambda(lambda x: K.stack(x, axis=1))

# Iterate through the sentences of a document.
# NOTE(review): this adds one Lambda node per sentence to the graph; wrapping a
# sentence-encoder sub-model in TimeDistributed might express the same thing
# more compactly -- to be confirmed before changing.
cnn_out = []
for i in range(maxlen_sent):
    # Take sentence i of every review as the input.
    sent = Lambda(slice, arguments={'index': i,})(doc_input_padded)  # output shape: (None, maxlen_word)
    sent_embedding = embedding_layer(sent)  # output shape: (None, maxlen_word, embedding dim)
    sent_cnn = cnn_layer(sent_embedding)  # output shape: (None, maxlen_word, filters)

    # Standard max-over-time pooling.
    sent_cnn = max_pooling_layer(sent_cnn)  # output shape: (None, filters)
    cnn_out.append(sent_cnn)
cnn_out = stack_layer(cnn_out)  # output shape: (None, maxlen_sent, filters)

# The LSTM reads the sequence of sentence vectors; a single sigmoid unit
# produces the binary prediction.
lstm = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(cnn_out)
dense = Dense(1, activation='sigmoid')(lstm)

model = Model(inputs=[doc_input_padded], outputs=[dense])
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.summary()



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
doc_input_padded (InputLayer)   (None, 36, 224)      0                                            
__________________________________________________________________________________________________
lambda_4 (Lambda)               (None, 224)          0           doc_input_padded[0][0]           
__________________________________________________________________________________________________
lambda_5 (Lambda)               (None, 224)          0           doc_input_padded[0][0]           
__________________________________________________________________________________________________
lambda_6 (Lambda)               (None, 224)          0           doc_input_padded[0][0]           
__________________________________________________________________________________________________
lambda_7 (

In [6]:
# Training hyper-parameters.
batch_size, epoch_num = 100, 2

print('Training...')
# Fit on the training corpus; the test corpus is passed as validation data.
model.fit(
    x=train_copus_padded,
    y=train_label,
    batch_size=batch_size,
    epochs=epoch_num,
    validation_data=(test_copus_padded, test_label),
)

# With metrics=['accuracy'] set at compile time, evaluate yields two values,
# unpacked here as (score, acc).
score, acc = model.evaluate(
    x=test_copus_padded,
    y=test_label,
    batch_size=batch_size,
)
print('Test accuracy:', acc)

Training...
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2
 3900/25000 [===>..........................] - ETA: 52:56 - loss: 0.2865 - acc: 0.8813

KeyboardInterrupt: 