## loading and preprocessing data

In [1]:
import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
from smart_open import smart_open
import re
import nltk
import numpy as np
import pickle
import itertools
from collections import Counter, OrderedDict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.preprocessing.sequence import pad_sequences
import re
from os import listdir
import string
from nltk.corpus import stopwords
from util.util_functions import getWordIdx, load_embedding_matrix_gensim
import gensim
import pickle
import keras
import tensorflow as tf
from keras.layers import Conv1D,Embedding,MaxPooling1D,Input
from keras.models import Model
import pandas as pd
from bs4 import BeautifulSoup
import string
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [3]:
#define all of the functions
punctuation_list = list(string.punctuation)

# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
#     norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    norm_text = re.sub(r"([\[\].\",()!?;:/])", " \\1 ", norm_text)
    return norm_text

def sent_tokenize(doc):
    sent_text = nltk.sent_tokenize(doc) # this gives you a list of sentences
    return sent_text

def word_tokenize(sent):
    tokenized_text = nltk.word_tokenize(sent)  # this gives you a list of words
    tokenized_text = [token.lower() for token in tokenized_text if token not in punctuation_list]  
    # optional: convert all words to lower case
    return tokenized_text

def readfile(filename):
    with open(filename,'r',encoding='UTF-8') as f:
        content = f.readlines()
    content = [x.strip() for x in content] 
    #strip()读出有效文件，形成一个list
    #split()读成有效文件，根据一行来形成一个list
    return content

#padding the sentence
#sentences是一个影评，就是一个train_data_word[0]
#max_words是影评中句子的最大含词量
#max_sents是影评中最大的句子个数
#保证每个影评的句子个数和句子长度都一样
def pad_sent(sentences, max_words, max_sents):
    """
    Pads sequences to the same length.
    Input: sentences - List of lists, where each element is a sequence.
    - max_words: Int, maximum length of all sequences.
    """
    # pad sentences in a doc
    sents_padded = pad_sequences(sentences, maxlen=max_words, padding='post') 
    # pad a doc to have equal number of sentences
    if len(sents_padded) < max_sents:
        doc_padding = np.zeros((max_sents-len(sents_padded),max_words), dtype = int)
        sents_padded = np.append(doc_padding, sents_padded, axis=0)
    else:
        sents_padded = sents_padded[:max_sents]
    return sents_padded

#build from word to integer as the input of ''
def build_vocab(corpus):
    """
    Builds a vocabulary mapping from word to index based on the corpus.
    Input: list of all samples in the training data
    Return: OrderedDict - vocabulary mapping from word to integer.
    """
    # Build vocabulary
    corpus_2d = []  # convert 3d corpus to 2d list
    for doc in corpus:
        for sent in doc:
            corpus_2d.append(sent)
    word_counts = Counter(itertools.chain(*corpus_2d))
    # Mapping from index to word (type: list)
    vocabulary = ['<PAD/>', '<UKN/>']   # 0 for padding, 1 for unknown words
    vocabulary = vocabulary + [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    #如何避免呢
    vocab2int = OrderedDict({x: i for i, x in enumerate(vocabulary)})
    return vocab2int

#****这个corpus是几维呢
def build_input_data(corpus, vocab2int, max_words, max_sents):
    """
    Maps words in the corpus to integers based on a vocabulary.
    Also pad the sentences and documents into fixed shape
    Input: corpus - list of samples, each sample is a list of sentences, each sentence is a list of words
    """
    corpus_int = [[[getWordIdx(word, vocab2int) for word in sentence]for sentence in sample] for sample in corpus]
    corpus_padded = []
    for doc in corpus_int:
        corpus_padded.append(pad_sent(doc, max_words, max_sents))
    corpus_padded = np.array(corpus_padded)    
    return corpus_padded

def load_embedding_matrix_gensim(embed_path, vocab2int, EMBEDDING_DIM):
    """
    load Word2Vec using gensim: 300x1 word vecs from Google (Mikolov) word2vec: GoogleNews-vectors-negative300.bin
    return embedding_matrix 
    embedding_matrix[i] is the embedding for 'vocab2int' integer index i
    """
    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(embed_path, binary=True)
    embeddings = {}
    embeddings['<PAD/>'] = np.zeros(EMBEDDING_DIM) # Zero vector for '<PAD/>' word
    embedding_UKN = np.random.uniform(-0.10, 0.10, EMBEDDING_DIM)  # Vector of small random numbers for unknown words
    # embedding_UKN = vector / np.linalg.norm(embedding_UKN)   # Normalize to unit vector
    embeddings['<UKN/>'] = embedding_UKN

    for word in word2vec_model.vocab:
        embeddings[word] = word2vec_model[word]

    embedding_matrix = np.zeros((len(vocab2int) , EMBEDDING_DIM))
    for word, i in vocab2int.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:   # word is unknown
            embedding_vector = np.random.uniform(-0.10, 0.10, EMBEDDING_DIM)  # Vector of small random numbers for unknown words
            # embedding_vector = vector / np.linalg.norm(embedding_vector)   # Normalize to unit vector
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [4]:
#**********loading data
#get the movie review "list of string"
path_pos = 'data/unprocessed/sorted_data/apparel/positive.review'
path_neg = 'data/unprocessed/sorted_data/apparel/negative.review'

#read the file
file_pos = open(path_pos,'r',encoding='windows-1252')
file_pos = file_pos.read()

file_neg = open(path_neg,'r',encoding='windows-1252')
file_neg = file_neg.read()

#extact the file
positive = BeautifulSoup(file_pos)
positive = positive.find_all('review_text')#get all of the positive reviews
for i in range(len(positive)):#convet the elements in postive to the string type
    positive[i] = str(positive[i])

negative = BeautifulSoup(file_neg)
negative = negative.find_all('review_text')#get all the positive reviews
for i in range(len(negative)):#convet the elements in negative to the string type
    negative[i] = str(negative[i])
    
#eliminate the <review_text></review_text>tag in the reviews and normalize the text
positive_doc = []
for review in positive:
    review = normalize_text(review[14:-15])
    review = review.lower()
    positive_doc.append(review)

negative_doc = []
for review in negative:
    review = normalize_text(review[14:-15])
    review = review.lower()
    negative_doc.append(review)
    
#merge the data
data = negative_doc + positive_doc

#get the train label
pos_label = [1 for i in range(len(positive_doc))]
neg_label = [0 for i in range(len(negative_doc))]
train_label = pos_label + neg_label

In [None]:
# #split the data to train and test dataset
# train_copus_padded, test_copus_padded, train_label, test_label = train_test_split(
#     copus_padded, train_label, test_size=0.2, random_state=42)

# #get the label
# train_num = int(len(positive_doc)*0.8)
# test_num = int(len(positive_doc)*0.2)
# train_pos_label = [1 for i in range (train_num)]
# train_neg_label = [0 for i in range(train_num)]
# test_pos_label = [1 for i in range(test_num)]
# test_neg_label = [0 for i in range(test_num)]

# #get the data
# train_pos_doc = positive_doc[0:800]
# train_neg_doc = negative_doc[0:800]
# test_pos_doc = positive_doc[800:]
# test_neg_doc = negative_doc[800:]

# #merge the data
# train_data = train_pos_doc + train_neg_doc
# train_label = train_pos_label + train_neg_label
# test_data = test_pos_doc + test_neg_doc
# test_label = test_pos_label + test_neg_label

# #shuffle the data
# from sklearn.utils import shuffle 
# train_data , train_label = shuffle(train_data , train_label , random_state = 0) 
# test_data, test_label = shuffle(test_data, test_label, random_state = 0)

## process data for hierarchical model 

In [5]:
#tokenize the doc list to the list of sentences
data_sent = [sent_tokenize(data[i]) for i in range(len(data))]

#tokenize the two lists of list of sentences into two lists of list of list of word
data_word = [[]for i in range(len(data_sent))]
for i in range(len(data_sent)):
    for j in range(len(data_sent[i])):
        word_tokens = word_tokenize(data_sent[i][j])
        if word_tokens != []:
            data_word[i].append(word_tokens)  

In [6]:
#building the vacabulary
vocab_to_int = build_vocab(data_word)

#get the list which is the maxim quantity of sentence'
#get the padding element
maxlen_word = 0
maxlen_sent = 0

list_maxlen_sent = []
list_maxlen_word = []
for i in range(len(data_sent)):
    list_maxlen_sent.append((len(data_sent[i])))

#get the list which is the maxim quantity of word
for i in range(len(data_sent)):
    for j in range(len(data_sent[i])):
        list_maxlen_word.append(len(data_sent[i][j]))

#get the max sentence
list_maxlen_sent = sorted(list_maxlen_sent)
maxlen_sent = list_maxlen_sent[int(len(list_maxlen_sent)*0.95)]

#get the max words
list_maxlen_word1 = sorted(list_maxlen_word)
maxlen_word = list_maxlen_word[int(len(list_maxlen_word1)*0.95)]


#start to pad
copus_padded = build_input_data(corpus=data_word,max_sents=maxlen_sent,max_words=maxlen_word,vocab2int=vocab_to_int)
# test_copus_padded = build_input_data(corpus=test_data_word,max_sents=maxlen_sent,max_words=maxlen_word,vocab2int=vocab_to_int)

In [7]:
file = open('pickle_New_Data/vocab_to_int.pickle','wb')     
pickle.dump(vocab_to_int,file)
file.close()

In [8]:
#split the data to train and test dataset
train_copus_padded, test_copus_padded, train_label, test_label = train_test_split(
copus_padded, train_label, test_size=0.2, random_state=42)

In [21]:
# # get the embedding matrix
# # the number of the vocabulary is 100327
# # 把每个词映射到一个300维度的vector
# # 这个matrix是二维的
# # 用vocab2int中每个词对应的整数来去matrix来找对应的vector
# dimension = 300
# path = 'E:/code_stock/SA/data/GoogleNews-vectors-negative300.bin'
# embedding_matrix = load_embedding_matrix_gensim(embed_path = path,vocab2int=vocab_to_int,EMBEDDING_DIM=dimension)

# #use pickle to store the data
# file = open('pickle_New_Data/embedding_matrix.pickle','wb')     
# pickle.dump(embedding_matrix,file)
# file.close()

In [9]:
file = open('pickle_New_Data/CNN_Weights.pickle','rb')
CNN_Weights = pickle.load(file)
file.close()

file = open('pickle_New_Data/embedding_matrix.pickle','rb')
embedding_matrix = pickle.load(file)
file.close()

In [10]:
#the shape of the data
print('train data shape:',train_copus_padded.shape)
print('test data shape:',test_copus_padded.shape)
print('embedding_matrix shape:', embedding_matrix.shape)
#the size of vocabulary
vocab_size = len(vocab_to_int)
print('vocabulary size:', vocab_size)
# the maximal length of every sentence
maxlen_sent = train_copus_padded.shape[1]
maxlen_word = train_copus_padded.shape[2]
print('max sent length:', maxlen_sent, '\nmax word length:', maxlen_word)

train data shape: (1600, 13, 23)
test data shape: (400, 13, 23)
embedding_matrix shape: (8023, 300)
vocabulary size: 8023
max sent length: 13 
max word length: 23


## building hierarchical models 

In [11]:
import pickle
import keras
from keras.models import Sequential, Model
from keras.layers.core import Dense, Dropout, Activation, Flatten, Lambda
from keras.layers.embeddings import Embedding
from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D, Input, Dense, Reshape, LSTM, GRU, Bidirectional, TimeDistributed
from keras.layers.merge import concatenate, add
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras import initializers
from keras.engine.topology import Layer
from keras import backend as K
import tensorflow as tf
import numpy as np
from util.util_functions import getWordIdx
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

In [12]:
class AttLayer(Layer):
    def __init__(self, attention_dim):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(AttLayer, self).__init__()

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim, )))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        # size of x :[batch_size, sel_len, attention_dim]
        # size of u :[batch_size, attention_dim]
        # uit = tanh(xW+b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)

        ait = K.exp(ait)

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [13]:
# hyper-parameters
gru_dim = 128
dropout_rate = 0.3
atten_dim = 50

batch_size = 50
epoch_num = 15

categorical_label = True

if categorical_label:
    train_label_cat = np_utils.to_categorical(train_label)
    test_label_cat = np_utils.to_categorical(test_label)

In [35]:
# # hyper-parameters in IMDB
# gru_dim = 128
# dropout_rate = 0.2
# atten_dim = 100

# batch_size = 100
# epoch_num = 15

# categorical_label = True

# if categorical_label:
#     train_label_cat = np_utils.to_categorical(train_label)
#     test_label_cat = np_utils.to_categorical(test_label)

## CNN+biGRU 

In [48]:
# define some Keras layers
embedding_layer = Embedding(vocab_size, embedding_matrix.shape[1], input_length= maxlen_word, 
                            weights=[embedding_matrix], trainable=False)

cnn_layer = Convolution1D(nb_filter=100,
                            filter_length=3,
                            border_mode='same',
                            activation='tanh',
                            subsample_length=1)

rnn_layer = Bidirectional(GRU(gru_dim, dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True))
#embedding matrix shape[1]是300，每个vector的维度
max_pooling_layer = GlobalMaxPooling1D()


# build sentence encoder model
sentence_input = Input(shape=(maxlen_word,), dtype='int32')

sent_embedding = embedding_layer(sentence_input)  #input shape:(MAX_SENT_LENGTH),output shape:(MAX_SENT_LENGTH,embed dimension)

sent_cnn = cnn_layer(sent_embedding) # output shape: (None, maxlen_word, nb_filter)
# we use standard max over time pooling
sent_cnn = max_pooling_layer(sent_cnn)  # output shape: (None, nb_filter)


sentEncoder = Model(sentence_input, sent_cnn)
sentEncoder.summary()

# build document encoder model
review_input = Input(shape=(maxlen_sent, maxlen_word), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)   # out shape: (None, MAX_SENTS, nb_filter)

rnn_out = rnn_layer(review_encoder) # (batch_size, timesteps, gru_dimx2)

att_out = AttLayer(atten_dim)(rnn_out)

if categorical_label:
    preds = Dense(2, activation='softmax')(att_out) # categorical output
    model = Model(review_input, preds)
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])
else:
    preds = Dense(1, activation='sigmoid')(att_out)
    model = Model(review_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    
model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_21 (InputLayer)        (None, 23)                0         
_________________________________________________________________
embedding_12 (Embedding)     (None, 23, 300)           2406900   
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 23, 100)           90100     
_________________________________________________________________
global_max_pooling1d_12 (Glo (None, 100)               0         
Total params: 2,497,000
Trainable params: 90,100
Non-trainable params: 2,406,900
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_22 (InputLayer)        (None, 13, 23)            0         
_____________________________________________________________

In [49]:
print('Training...')
for i in range(epoch_num):
    print('Training for epoch {}/{}'.format(i+1,epoch_num))
    if categorical_label:
        model.fit(train_copus_padded, train_label_cat, batch_size=batch_size,epochs=1)
    else:
        model.fit(train_copus_padded, train_label, batch_size=batch_size,epochs=1)
        
    print('Evaluating...')
    pred_test_prob = model.predict(test_copus_padded, batch_size=batch_size, verbose=True)
    # predict the class label
    if pred_test_prob.shape[-1]>1:
        pred_test = pred_test_prob.argmax(axis=-1)
    else:
        pred_test = (pred_test_prob>0.5).astype('int32')
        pred_test = pred_test.reshape(pred_test.shape[0])

    acc = np.sum(pred_test == test_label) / float(len(test_label))

    print("Accuracy: %.4f" % (acc))   
    print(classification_report(test_label, pred_test, digits=4, labels=[0, 1]))

Training...
Training for epoch 1/15
Epoch 1/1
Evaluating...
Accuracy: 0.7625
              precision    recall  f1-score   support

           0     0.7008    0.9204    0.7957       201
           1     0.8824    0.6030    0.7164       199

   micro avg     0.7625    0.7625    0.7625       400
   macro avg     0.7916    0.7617    0.7561       400
weighted avg     0.7911    0.7625    0.7563       400

Training for epoch 2/15
Epoch 1/1
Evaluating...
Accuracy: 0.8300
              precision    recall  f1-score   support

           0     0.7930    0.8955    0.8411       201
           1     0.8786    0.7638    0.8172       199

   micro avg     0.8300    0.8300    0.8300       400
   macro avg     0.8358    0.8297    0.8292       400
weighted avg     0.8356    0.8300    0.8292       400

Training for epoch 3/15
Epoch 1/1
Evaluating...
Accuracy: 0.8700
              precision    recall  f1-score   support

           0     0.8371    0.9204    0.8768       201
           1     0.9106    0.8

## K-CNN+biGRU

In [30]:
# define some Keras layers
embedding_layer = Embedding(vocab_size, embedding_matrix.shape[1], input_length=maxlen_word, 
                            weights=[embedding_matrix], trainable=False)

cnn_layer1 = Convolution1D(nb_filter=50,
                            filter_length=3,
                            border_mode='same',
                            activation='tanh',
                            subsample_length=1)

cnn_layer2 = Convolution1D(nb_filter=CNN_Weights[0].shape[2],
                            filter_length=1,
                            border_mode='same',
                            activation='tanh',
                           weights = CNN_Weights,
                           trainable = False,
                            subsample_length=1)

rnn_layer = Bidirectional(GRU(gru_dim, dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True))
# rnn_layer = GRU(gru_dim, dropout=dropout_rate, recurrent_dropout=dropout_rate, return_sequences=True)

max_pooling_layer = GlobalMaxPooling1D()



In [31]:
# build sentence encoder model
sentence_input = Input(shape=(maxlen_word,), dtype='int32')

sent_embedding = embedding_layer(sentence_input)  #input shape:(MAX_SENT_LENGTH),output shape:(MAX_SENT_LENGTH,embed dimension)

sent_cnn1 = cnn_layer1(sent_embedding) # output shape: (None, maxlen_word, nb_filter)
# we use standard max over time pooling
sent_cnn1 = max_pooling_layer(sent_cnn1)  # output shape: (None, nb_filter)

sent_cnn2 = cnn_layer2(sent_embedding) # output shape: (None, maxlen_word, nb_filter)
# we use standard max over time pooling
sent_cnn2 = max_pooling_layer(sent_cnn2)  # output shape: (None, nb_filter)

sent_cnn = concatenate([sent_cnn1, sent_cnn2])

sentEncoder = Model(sentence_input, sent_cnn)
sentEncoder.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 23)           0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 23, 300)      2406900     input_9[0][0]                    
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 23, 50)       45050       embedding_5[0][0]                
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 23, 533)      160433      embedding_5[0][0]                
__________________________________________________________________________________________________
global_max

In [32]:
# build document encoder model
review_input = Input(shape=(maxlen_sent, maxlen_word), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)   # out shape: (None, MAX_SENTS, nb_filter)

rnn_out = rnn_layer(review_encoder) # (batch_size, timesteps, gru_dimx2)

att_out = AttLayer(atten_dim)(rnn_out)


if categorical_label:
    preds = Dense(2, activation='softmax')(att_out) # categorical output
    model = Model(review_input, preds)
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])
else:
    preds = Dense(1, activation='sigmoid')(att_out)
    model = Model(review_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 13, 23)            0         
_________________________________________________________________
time_distributed_5 (TimeDist (None, 13, 583)           2612383   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 13, 256)           546816    
_________________________________________________________________
att_layer_5 (AttLayer)       (None, 256)               12900     
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 514       
Total params: 3,172,613
Trainable params: 605,280
Non-trainable params: 2,567,333
_________________________________________________________________


In [33]:
print('Training...')
for i in range(epoch_num):
    print('Training for epoch {}/{}'.format(i+1,epoch_num))
    if categorical_label:
        model.fit(train_copus_padded, train_label_cat, batch_size=batch_size,epochs=1)
    else:
        model.fit(train_copus_padded, train_label, batch_size=batch_size,epochs=1)
        
    print('Evaluating...')
    pred_test_prob = model.predict(test_copus_padded, batch_size=batch_size, verbose=True)
    # predict the class label
    if pred_test_prob.shape[-1]>1: 
        
        
        pred_test = pred_test_prob.argmax(axis=-1)
    else:
        pred_test = (pred_test_prob>0.5).astype('int32')
        pred_test = pred_test.reshape(pred_test.shape[0])
    acc = np.sum(pred_test == test_label) / float(len(test_label))

    print("Accuracy: %.4f" % (acc))   
    print(classification_report(test_label, pred_test, digits=4, labels=[0, 1]))

Training...
Training for epoch 1/15
Epoch 1/1
Evaluating...
Accuracy: 0.6200
              precision    recall  f1-score   support

           0     0.7634    0.3532    0.4830       201
           1     0.5765    0.8894    0.6996       199

   micro avg     0.6200    0.6200    0.6200       400
   macro avg     0.6700    0.6213    0.5913       400
weighted avg     0.6705    0.6200    0.5908       400

Training for epoch 2/15
Epoch 1/1
Evaluating...
Accuracy: 0.6525
              precision    recall  f1-score   support

           0     0.6115    0.8458    0.7098       201
           1     0.7459    0.4573    0.5670       199

   micro avg     0.6525    0.6525    0.6525       400
   macro avg     0.6787    0.6515    0.6384       400
weighted avg     0.6784    0.6525    0.6388       400

Training for epoch 3/15
Epoch 1/1
Evaluating...
Accuracy: 0.6625
              precision    recall  f1-score   support

           0     0.9583    0.3433    0.5055       201
           1     0.5976    0.9

## biGRU+CNN

In [50]:
# define some Keras layers
embedding_layer = Embedding(vocab_size, embedding_matrix.shape[1], input_length= maxlen_word, 
                            weights=[embedding_matrix], trainable=False)

cnn_layer = Convolution1D(nb_filter=100,
                            filter_length=3,
                            border_mode='same',
                            activation='tanh',
                            subsample_length=1)

rnn_layer = Bidirectional(GRU(gru_dim, dropout=dropout_rate, recurrent_dropout=dropout_rate))
#embedding matrix shape[1]是300，每个vector的维度
max_pooling_layer = GlobalMaxPooling1D()


# build sentence encoder model
sentence_input = Input(shape=(maxlen_word,), dtype='int32')

sent_embedding = embedding_layer(sentence_input)  #input shape:(MAX_SENT_LENGTH),output shape:(MAX_SENT_LENGTH,embed dimension)

sent_biGRU = rnn_layer(sent_embedding)

sentEncoder = Model(sentence_input, sent_biGRU)
sentEncoder.summary()

# build document encoder model
review_input = Input(shape=(maxlen_sent, maxlen_word), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)   # out shape: (None, MAX_SENTS, nb_filter)

cnn_out = cnn_layer(review_encoder) # (batch_size, timesteps, gru_dimx2)

cnn_out = max_pooling_layer(cnn_out)

if categorical_label:
    preds = Dense(2, activation='softmax')(cnn_out) # categorical output
    model = Model(review_input, preds)
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])
else:
    preds = Dense(1, activation='sigmoid')(cnn_out)
    model = Model(review_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    
model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_23 (InputLayer)        (None, 23)                0         
_________________________________________________________________
embedding_13 (Embedding)     (None, 23, 300)           2406900   
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 256)               329472    
Total params: 2,736,372
Trainable params: 329,472
Non-trainable params: 2,406,900
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_24 (InputLayer)        (None, 13, 23)            0         
_________________________________________________________________
time_distributed_12 (TimeDis (None, 13, 256)           2736372   
____________________________________________________________

In [51]:
print('Training...')
for i in range(epoch_num):
    print('Training for epoch {}/{}'.format(i+1,epoch_num))
    if categorical_label:
        model.fit(train_copus_padded, train_label_cat, batch_size=batch_size,epochs=1)
    else:
        model.fit(train_copus_padded, train_label, batch_size=batch_size,epochs=1)
        
    print('Evaluating...')
    pred_test_prob = model.predict(test_copus_padded, batch_size=batch_size, verbose=True)
    # predict the class label
    if pred_test_prob.shape[-1]>1:
        pred_test = pred_test_prob.argmax(axis=-1)
    else:
        pred_test = (pred_test_prob>0.5).astype('int32')
        pred_test = pred_test.reshape(pred_test.shape[0])

    acc = np.sum(pred_test == test_label) / float(len(test_label))

    print("Accuracy: %.4f" % (acc))   
    print(classification_report(test_label, pred_test, digits=4, labels=[0, 1]))

Training...
Training for epoch 1/15
Epoch 1/1
Evaluating...
Accuracy: 0.7225
              precision    recall  f1-score   support

           0     0.7885    0.6119    0.6891       201
           1     0.6803    0.8342    0.7494       199

   micro avg     0.7225    0.7225    0.7225       400
   macro avg     0.7344    0.7231    0.7193       400
weighted avg     0.7347    0.7225    0.7191       400

Training for epoch 2/15
Epoch 1/1
Evaluating...
Accuracy: 0.7850
              precision    recall  f1-score   support

           0     0.8177    0.7363    0.7749       201
           1     0.7580    0.8342    0.7943       199

   micro avg     0.7850    0.7850    0.7850       400
   macro avg     0.7878    0.7852    0.7846       400
weighted avg     0.7880    0.7850    0.7845       400

Training for epoch 3/15
Epoch 1/1
Evaluating...
Accuracy: 0.8175
              precision    recall  f1-score   support

           0     0.8596    0.7612    0.8074       201
           1     0.7838    0.8