In [5]:
import numpy as np 
import pandas as pd 
import nltk
import os
import gc

In [6]:
# Read the tab-separated training and test files into DataFrames
# (first path -> train_data, second path -> test_data).
train_data, test_data = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("data/train.tsv", "data/test.tsv")
)

In [7]:
# Preview the first five training rows (head() defaults to n=5).
train_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [8]:
# Preview the first five test rows (head() defaults to n=5).
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [9]:
# Report, for each split, the number of phrases (rows) and the number of
# distinct SentenceId values.
train_sentence_ids = train_data.SentenceId.unique()
test_sentence_ids = test_data.SentenceId.unique()
print('训练集中phrases数量: {}. 训练集中sentences 数量: {}.'.format(len(train_data), len(train_sentence_ids)))
print('测试集中phrases数量: {}. 测试集中sentences 数量: {}.'.format(len(test_data), len(test_sentence_ids)))

训练集中phrases数量: 156060. 训练集中sentences 数量: 8529.
测试集中phrases数量: 66292. 测试集中sentences 数量: 3310.


In [10]:
# Group the rows by SentenceId, count the phrases in each group, and print
# the mean count per sentence for both splits.
train_phrases_per_sentence = train_data.groupby('SentenceId')['Phrase'].count()
test_phrases_per_sentence = test_data.groupby('SentenceId')['Phrase'].count()
print('训练集train_data中平均每个sentences的phrases数量：{0:.0f}.'.format(train_phrases_per_sentence.mean()))
print('测试集test_data中平均每个sentences的phrases数量：{0:.0f}.'.format(test_phrases_per_sentence.mean()))

训练集train_data中平均每个sentences的phrases数量：18.
测试集test_data中平均每个sentences的phrases数量：20.


In [11]:
# Concatenate every phrase that belongs to SentenceId 4 into a single
# space-separated string and print it.
sentence_4_mask = train_data.SentenceId == 4
text = ' '.join(train_data.loc[sentence_4_mask, 'Phrase'])
print(text)

A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder A positively thrilling combination positively thrilling combination positively thrilling combination thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder ethnography and all the intrigue , betrayal , deceit and murder ethnography and ethnography all the intrigue , betrayal , deceit and murder all the intrigue , betrayal , deceit and murder intrigue , betrayal , deceit and murder intrigue , betrayal , deceit and murder betrayal , deceit and murder betrayal , deceit and murder deceit and murder deceit and deceit murder of a Shakespearean tragedy or a jui

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam

In [13]:
# Collect every phrase from both splits into one list (training phrases
# first, then test phrases) and show the first 50 entries.
full_text = train_data['Phrase'].tolist() + test_data['Phrase'].tolist()
print(full_text[:50])

['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .', 'A series of escapades demonstrating the adage that what is good for the goose', 'A series', 'A', 'series', 'of escapades demonstrating the adage that what is good for the goose', 'of', 'escapades demonstrating the adage that what is good for the goose', 'escapades', 'demonstrating the adage that what is good for the goose', 'demonstrating the adage', 'demonstrating', 'the adage', 'the', 'adage', 'that what is good for the goose', 'that', 'what is good for the goose', 'what', 'is good for the goose', 'is', 'good for the goose', 'good', 'for the goose', 'for', 'the goose', 'goose', 'is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .', 'is also good for the gander , some of which occasionally amuses but none of which amounts to much of a s

In [14]:
# Vectorise the text: fit a tokenizer on all phrases (train + test), then
# turn each phrase into a sequence of integer word indices.
# lower=True lowercases the text; filters='' passes an empty filter string.
tk = Tokenizer(filters='', lower=True)
tk.fit_on_texts(full_text)
train_tokenized, test_tokenized = (
    tk.texts_to_sequences(split['Phrase']) for split in (train_data, test_data)
)

In [15]:
# Bring every token sequence to a fixed length of max_len, then show the
# first five padded training rows.
max_len = 50
X_train = pad_sequences(train_tokenized, maxlen=max_len)
X_test = pad_sequences(test_tokenized, maxlen=max_len)
print(X_train[:5])

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     3   308     4 18264  7830     1  8332    11    55    10    51
     15     1  4669    10   185    51    15     1 14845     2    64     4
     91   560 13389    21   610     4    91  2702     6    54     4     3
     44     7]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      3   308     4 18264  7830     1  8332    11    55    10    51    15
      1  4669]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      3   308]
 [    0     0     0     0     0     0     0     0     0     0     0

根据预训练的词向量文件初始化词向量矩阵（与之前写过的ChineseNER_keras中的keras_lstm_test中的做法类似）：先根据预训练的词向量文件构建字典embedding_index（键为word，值为对应的vector）；然后结合word_index（词到索引的映射字典）与embedding_index构建矩阵embedding_matrix，其第i行即为索引为i的词的预训练向量，在预训练文件中找不到的词对应的行保持全零。

In [16]:
# Location of the pre-trained word-vector file and the length of each vector.
embedding_path = "data/glove.6B.100d.txt"
embed_size = 100
# Size bound for embedding_matrix, i.e. how many vocabulary words are
# initialised from the pre-trained vectors.
max_features = 20000


def get_vector(word, *arr):
    """Return ``(word, vector)`` with ``arr`` converted to a float32 array.

    Args:
        word: The token the vector belongs to; returned unchanged.
        *arr: The vector components (numbers or numeric strings).

    Returns:
        A 2-tuple of ``word`` and a 1-D ``float32`` NumPy array built from
        ``arr``.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector
# Parse the embedding file into a {word: float32 vector} dictionary. Each
# line is split on single spaces: the first field is the word, the remaining
# fields are the vector components.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes instead of being left open for the life of the kernel.
with open(embedding_path, encoding="utf-8") as embedding_file:
    embedding_index = dict(
        get_vector(*word_vector.strip().split(" ")) for word_vector in embedding_file
    )
print("great:", embedding_index["great"])

great: [-0.013786   0.38216    0.53236    0.15261   -0.29694   -0.20558
 -0.41846   -0.58437   -0.77355   -0.87866   -0.37858   -0.18516
 -0.128     -0.20584   -0.22925   -0.42599    0.3725     0.26077
 -1.0702     0.62916   -0.091469   0.70348   -0.4973    -0.77691
  0.66045    0.09465   -0.44893    0.018917   0.33146   -0.35022
 -0.35789    0.030313   0.22253   -0.23236   -0.19719   -0.0053125
 -0.25848    0.58081   -0.10705   -0.17845   -0.16206    0.087086
  0.63029   -0.76649    0.51619    0.14073    1.019     -0.43136
  0.46138   -0.43585   -0.47568    0.19226    0.36065    0.78987
  0.088945  -2.7814    -0.15366    0.01015    1.1798     0.15168
 -0.050112   1.2626    -0.77527    0.36031    0.95761   -0.11385
  0.28035   -0.02591    0.31246   -0.15424    0.3778    -0.13599
  0.2946    -0.31579    0.42943    0.086969   0.019169  -0.27242
 -0.31696    0.37327    0.61997    0.13889    0.17188    0.30363
 -1.2776     0.044423  -0.52736   -0.88536   -0.19428   -0.61947
 -0.10146   -0.

In [17]:
# Build the embedding matrix: row i receives the pre-trained vector of the
# word whose tokenizer index is i; rows without a pre-trained vector stay zero.
word_index = tk.word_index
print("great:", word_index["great"])
num_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((num_words + 1, embed_size))
for word, i in word_index.items():
    # Word indices follow frequency order, so only indices below the preset
    # max_features are initialised.
    if i < max_features:
        # dict.get returns None for a missing key instead of raising KeyError.
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
print(embedding_matrix[word_index["great"]])

great: 134
[-0.013786    0.38216001  0.53236002  0.15261    -0.29694    -0.20558
 -0.41846001 -0.58437002 -0.77354997 -0.87866002 -0.37858    -0.18516
 -0.12800001 -0.20584001 -0.22925    -0.42598999  0.3725      0.26076999
 -1.07019997  0.62915999 -0.091469    0.70348001 -0.4973     -0.77691001
  0.66044998  0.09465    -0.44893     0.018917    0.33146    -0.35021999
 -0.35789001  0.030313    0.22253001 -0.23236001 -0.19719    -0.0053125
 -0.25848001  0.58081001 -0.10705    -0.17845    -0.16205999  0.087086
  0.63028997 -0.76648998  0.51618999  0.14072999  1.01900005 -0.43136001
  0.46138    -0.43584999 -0.47567999  0.19226     0.36065     0.78987002
  0.088945   -2.78139997 -0.15366     0.01015     1.17980003  0.15167999
 -0.050112    1.26259995 -0.77526999  0.36030999  0.95761001 -0.11385
  0.28035    -0.02591     0.31246001 -0.15424     0.37779999 -0.13598999
  0.29460001 -0.31579     0.42943001  0.086969    0.019169   -0.27241999
 -0.31696001  0.37327     0.61997002  0.13889     0.

In [18]:
# One-hot encode the sentiment labels.
y = train_data["Sentiment"]
from sklearn.preprocessing import OneHotEncoder
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output`; adjust if the installed version rejects this keyword.
onehot = OneHotEncoder(sparse=False)
# `y` is a pandas Series, which does not provide reshape() in current pandas;
# reshape the underlying NumPy array into the (n_samples, 1) column that the
# encoder expects.
y_onehot = onehot.fit_transform(y.values.reshape(-1, 1))

In [21]:
from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

# File that receives the checkpointed model.
file_path = "best_model.hdf5"

# Checkpoint callback: monitors val_loss (lower is better) and, with
# save_best_only=True, only keeps the best model seen so far.
check_point = ModelCheckpoint(
    file_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Early-stopping callback on the same metric, with a patience of 3 epochs.
early_stop = EarlyStopping(monitor="val_loss", patience=3, mode="min")

def build_model(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0):
    """Build, train and return the recurrent + convolutional sentiment model.

    The network embeds the padded token sequences with the frozen pre-trained
    ``embedding_matrix``, runs a bidirectional GRU and a bidirectional LSTM,
    applies two Conv1D layers (kernel sizes 3 and 2) to each recurrent output,
    pools each convolution with global average and global max pooling, and
    feeds the concatenated pooled features through two dense layers to a
    5-way softmax.

    The model is trained on the module-level ``X_train`` / ``y_onehot`` with
    the module-level ``check_point`` and ``early_stop`` callbacks, after which
    the checkpoint stored at ``file_path`` is loaded and returned.

    Args:
        lr: Learning rate passed to Adam.
        lr_d: Learning-rate decay passed to Adam.
        units: Number of units in each GRU / LSTM direction.
        dr: Rate of the spatial dropout applied to the embeddings.

    Returns:
        The model loaded from ``file_path`` after training.
    """
    inp = Input(shape = (max_len,))
    # Derive the vocabulary size from the matrix itself instead of hard-coding
    # 19479, so the layer always matches the weights it is initialised with.
    vocab_size = embedding_matrix.shape[0]
    embedded = Embedding(vocab_size, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    embedded = SpatialDropout1D(dr)(embedded)

    # --- GRU branch -------------------------------------------------------
    x_gru = Bidirectional(GRU(units, return_sequences = True))(embedded)
    gru_conv_k3 = Conv1D(32, kernel_size=3, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool1_gru = GlobalAveragePooling1D()(gru_conv_k3)
    max_pool1_gru = GlobalMaxPooling1D()(gru_conv_k3)

    gru_conv_k2 = Conv1D(32, kernel_size=2, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool3_gru = GlobalAveragePooling1D()(gru_conv_k2)
    max_pool3_gru = GlobalMaxPooling1D()(gru_conv_k2)

    # --- LSTM branch ------------------------------------------------------
    # NOTE(review): the LSTM is fed the kernel-size-3 convolution of the GRU
    # output, not the embedded input. The original code reached this by
    # reusing the name `x1`; the topology is preserved here. Confirm whether
    # two parallel branches on the embeddings were intended instead.
    x_lstm = Bidirectional(LSTM(units, return_sequences = True))(gru_conv_k3)
    lstm_conv_k3 = Conv1D(32, kernel_size=3, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool1_lstm = GlobalAveragePooling1D()(lstm_conv_k3)
    max_pool1_lstm = GlobalMaxPooling1D()(lstm_conv_k3)

    lstm_conv_k2 = Conv1D(32, kernel_size=2, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool3_lstm = GlobalAveragePooling1D()(lstm_conv_k2)
    max_pool3_lstm = GlobalMaxPooling1D()(lstm_conv_k2)

    # Concatenate all pooled convolution outputs into one feature vector.
    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool3_gru, max_pool3_gru,
                    avg_pool1_lstm, max_pool1_lstm, avg_pool3_lstm, max_pool3_lstm])
    x = BatchNormalization()(x)
    x = Dropout(0.2)(Dense(128,activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(0.2)(Dense(100,activation='relu') (x))
    # The targets are one-hot vectors over five mutually exclusive classes, so
    # use softmax with categorical cross-entropy. With sigmoid and
    # binary_crossentropy the "accuracy" metric is computed per output unit
    # and overstates the real classification accuracy.
    x = Dense(5, activation = "softmax")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    print("compile end")
    model.fit(X_train, y_onehot, batch_size = 128, epochs = 15, validation_split=0.1,
              verbose = 1, callbacks = [check_point, early_stop])
    print("train end")
    # Return the best checkpoint written by check_point, not the last epoch.
    model = load_model(file_path)
    return model

In [None]:
# Train the model with a 1e-4 learning rate, no decay, 128 recurrent units
# and a spatial dropout rate of 0.5.
model = build_model(
    lr=1e-4,
    lr_d=0,
    units=128,
    dr=0.5,
)

compile end
Train on 140454 samples, validate on 15606 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15

In [1]:
# Predict on the padded test sequences in batches of 1024.
# Needs `model` and `X_test` from the earlier cells: run in a fresh kernel
# without them, this cell raises NameError.
pred = model.predict(
    X_test,
    batch_size=1024,
)

NameError: name 'model' is not defined