In [14]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"         # index of the GPU to use; can be changed to any of 0-3

import pickle
import keras
from keras.models import Sequential, Model
from keras.layers.core import Dense, Dropout, Activation, Flatten, Lambda
from keras.layers.embeddings import Embedding
from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D, Input, Dense, Reshape, LSTM
from keras.layers.merge import concatenate, add
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras import backend as K
import tensorflow as tf
import numpy as np
from util.util_functions import getWordIdx
from sklearn.metrics import classification_report, roc_auc_score

# loading processed data

In [2]:
# Load the pre-processed artefacts (padded corpora, vocabulary, embedding matrix, labels)
# from their pickle files.
# NOTE(security): pickle.load can execute arbitrary code; only load files you created or trust.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened inside a context manager so the handle is always closed,
    including when unpickling raises.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Padded train/test corpora (arrays with a .shape; indexed as [review, sentence, word] later on).
train_copus_padded = _load_pickle('pickle_data/train_copus_pad.pickle')
test_copus_padded = _load_pickle('pickle_data/test_copus_pad.pickle')

# Vocabulary of the training set; its len() is used as the vocabulary size.
vocab_to_int_train = _load_pickle('pickle_data/vocab_train.pickle')

# Pre-trained embedding matrix, one row per vocabulary entry.
embedding_matrix = _load_pickle('pickle_data/embedding_matrix')

# Labels for the train and test corpora.
train_label = _load_pickle('pickle_data/train_label.pickle')
test_label = _load_pickle('pickle_data/test_label.pickle')

In [3]:
# Report the dimensions of the loaded data and derive the sizes the model needs.
train_shape = train_copus_padded.shape
test_shape = test_copus_padded.shape
print('train test data shape:', train_shape, test_shape)
print('embedding_matrix shape:', embedding_matrix.shape)

# Number of entries in the training vocabulary.
vocab_size = len(vocab_to_int_train)
print('vocabulary size:', vocab_size)

# Axis 1 of the padded corpus is the sentence axis (padded sentences per review),
# axis 2 is the word axis (padded words per sentence).
maxlen_sent = train_shape[1]
maxlen_word = train_shape[2]
print('max sent length:', maxlen_sent, '\nmax word length:', maxlen_word)

train test data shape: (25000, 36, 224) (25000, 36, 224)
embedding_matrix shape: (97162, 300)
vocabulary size: 97162
max sent length: 36 
max word length: 224


### sentiment word filter construction

In [4]:
# Load the sentiment lexicon extracted from SentiWordNet.
# The context manager closes the file even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('pickle_data/senti_lexicon.pickle', 'rb') as lexicon_file:
    senti_lexicon = pickle.load(lexicon_file)

In [5]:
# Translate every sentiment word of the lexicon into its integer id, using the
# training vocabulary (vocab_to_int_train) as the lookup table.
senti2int = list(map(lambda senti_word: getWordIdx(senti_word, vocab_to_int_train),
                     senti_lexicon))

#get the filter weights based on the sentiment words&vocab2int&embedding_matrix
def Find_Filter_Weight(senti2int, embeddings=None):
    """Build width-1 convolution weights whose filters are sentiment-word embeddings.

    Each sentiment word contributes one filter: its embedding vector. The result
    has the ``[kernel, bias]`` layout that is passed as ``weights=`` to a 1-D
    convolution layer with filter length 1.

    Parameters
    ----------
    senti2int : sequence of int
        Row indices into the embedding table, one per sentiment word.
    embeddings : array-like of shape (vocab, dim), optional
        Embedding table to read the vectors from. When omitted, the
        notebook-level ``embedding_matrix`` is used (the original behaviour).

    Returns
    -------
    list
        ``[kernel, bias]`` where ``kernel`` has shape ``(1, dim, len(senti2int))``
        with ``kernel[0, :, k]`` equal to the embedding of ``senti2int[k]``, and
        ``bias`` is a zero vector of length ``len(senti2int)``. For an empty
        ``senti2int`` the kernel has shape ``(1, dim, 0)``.
    """
    if embeddings is None:
        embeddings = embedding_matrix
    table = np.asarray(embeddings)

    # Explicit integer dtype so an empty index list is still a valid index array.
    word_ids = np.asarray(senti2int, dtype=np.intp)

    # A single gather replaces the per-word np.concatenate loop, which re-copied
    # the whole accumulated array on every iteration.
    selected = table[word_ids]                       # shape: (n_words, dim)
    word_filter_weights = np.ascontiguousarray(
        selected.T[np.newaxis, :, :])                # shape: (1, dim, n_words)

    bias_weights = np.zeros(len(word_ids))
    cnn_wordfilter_weights = [word_filter_weights, bias_weights]

    return cnn_wordfilter_weights

In [6]:
# Build the [kernel, bias] pair for the sentiment-word filters and display the
# kernel's shape as the cell output.
CNN_weights = Find_Filter_Weight(senti2int)
kernel_weights = CNN_weights[0]
kernel_weights.shape

(1, 300, 533)

# keras building model

In [7]:
# Model input: the dimensions of one review, i.e. maxlen_sent sentences of
# maxlen_word word ids each.
doc_input_padded = Input(name='doc_input_padded',
                         dtype='int32',
                         shape=(maxlen_sent, maxlen_word))
# Frozen embedding lookup initialised from the pre-trained matrix.
# embedding_matrix.shape[1] is the dimension of each word vector (300 here).
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=maxlen_word,
                            trainable=False)
# Pooling layer instance reused for every sentence.
max_pooling_layer = GlobalMaxPooling1D()

def slice(x, index):
    """Select entry `index` along axis 1 of a rank-3 tensor.

    Used as the function of a Lambda layer to pull a single sentence out of a
    padded document.

    NOTE: this name shadows Python's builtin `slice` for the rest of the notebook.

    Args:
        x: tensor/array indexable as x[:, index, :].
        index: integer position along axis 1.

    Returns:
        x[:, index, :], i.e. the input with axis 1 removed.
    """
    selected = x[:, index, :]
    return selected

# A single shared layer instance should be used here; otherwise a new layer
# would be built on every loop iteration.
# Keras 2 argument names (filters / kernel_size / padding / strides) replace the
# deprecated Keras 1 aliases (nb_filter / filter_length / border_mode /
# subsample_length); the configuration itself is unchanged.
cnn_layer1 = Convolution1D(filters=100,
                           kernel_size=3,
                           padding='same',
                           activation='tanh',
                           strides=1)

# Disabled experiment: a frozen width-1 convolution whose filters are the
# sentiment-word embeddings held in CNN_weights.
# cnn_layer2 = Convolution1D(filters=CNN_weights[0].shape[2],
#                            kernel_size=1,
#                            padding='same',
#                            activation='tanh',
#                            weights=CNN_weights,
#                            trainable=False,
#                            strides=1)
#************
# Stacks a list of per-sentence tensors along a new axis 1.
stack_layer = Lambda(lambda x: K.stack(x, axis=1))

# Iterate through the sentences of a document and encode each one with the
# shared embedding -> convolution -> max-pooling stack.
sentence_features = []
for i in range(maxlen_sent):
    # Take sentence i of every review as the input of this branch.
    sent = Lambda(slice, arguments={'index': i})(doc_input_padded)
    # Map the word ids of the sentence to their embedding vectors.
    sent_embedding = embedding_layer(sent)

    # Convolve over the words, then apply standard max-over-time pooling:
    # (None, maxlen_word, nb_filter) -> (None, nb_filter)
    sent_cnn1 = max_pooling_layer(cnn_layer1(sent_embedding))

    # Disabled experiment: second branch using the sentiment-word filters.
#     sent_cnn2 = cnn_layer2(sent_embedding) # output shape: (None, maxlen_word, nb_filter)
#     # we use standard max over time pooling
#     sent_cnn2 = max_pooling_layer(sent_cnn2)  # output shape: (None, nb_filter)

#     sent_cnn = concatenate([sent_cnn1, sent_cnn2])

    sentence_features.append(sent_cnn1)

# Combine the per-sentence vectors: out shape (None, maxlen_sent, nb_filter)
cnn_out = stack_layer(sentence_features)


# Document-level encoder: an LSTM reads the sequence of sentence vectors and a
# single sigmoid unit produces the binary prediction.
lstm = LSTM(units=128, recurrent_dropout=0.2, dropout=0.2)(cnn_out)
dense = Dense(units=1, activation='sigmoid')(lstm)

# Assemble, compile and print the end-to-end model.
model = Model(outputs=[dense], inputs=[doc_input_padded])
model.compile(optimizer='Adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
doc_input_padded (InputLayer)   (None, 36, 224)      0                                            
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 224)          0           doc_input_padded[0][0]           
__________________________________________________________________________________________________
lambda_3 (Lambda)               (None, 224)          0           doc_input_padded[0][0]           
__________________________________________________________________________________________________
lambda_4 (Lambda)               (None, 224)          0           doc_input_padded[0][0]           
__________________________________________________________________________________________________
lambda_5 (

In [8]:
# Training hyper-parameters; batch_size is reused by the prediction cell.
# NOTE(review): the output saved with this cell shows 10 epochs, so it was
# produced with a different epoch_num; re-run the cell to refresh it.
batch_size, epoch_num = 100, 5

print('Training...')
# The test split is passed as validation data, so validation metrics are
# reported after every epoch.
model.fit(x=train_copus_padded,
          y=train_label,
          validation_data=(test_copus_padded, test_label),
          epochs=epoch_num,
          batch_size=batch_size)

# score, acc = model.evaluate(test_copus_padded, test_label,
#                             batch_size=batch_size)
# print('Test accuracy:', acc)

Training...
Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.8983199987411499


In [15]:
# Predicted probabilities for the test corpus.
pred_test_prob = model.predict(test_copus_padded, batch_size=batch_size, verbose=True)

# Turn probabilities into hard class labels.
single_output = not (pred_test_prob.shape[-1] > 1)
if single_output:
    # One sigmoid output: threshold at 0.5 and flatten to one label per sample.
    pred_test = (pred_test_prob > 0.5).astype('int32')
    pred_test = pred_test.reshape(pred_test.shape[0])
else:
    # Several outputs: pick the most probable class.
    pred_test = pred_test_prob.argmax(axis=-1)

# Fraction of test samples whose predicted label equals the true label.
n_correct = np.sum(pred_test == test_label)
acc = n_correct / float(len(test_label))

print("Accuracy: %.4f" % (acc))

# Per-class precision / recall / F1 for the two classes.
print(classification_report(test_label, pred_test, labels=[0, 1]))

Accuracy: 0.8983
             precision    recall  f1-score   support

          0       0.91      0.88      0.90     12500
          1       0.89      0.91      0.90     12500

avg / total       0.90      0.90      0.90     25000

