In [6]:
import numpy as np 
import pandas as pd 
import nltk
import os
import gc

In [9]:
# Read the tab-separated train / test splits into DataFrames.
train_path, test_path = "data/train.tsv", "data/test.tsv"
train_data = pd.read_csv(train_path, sep="\t")  # training set
test_data = pd.read_csv(test_path, sep="\t")  # test set

In [18]:
# Preview the first rows of the training frame (head() returns 5 rows by default).
train_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [17]:
# Preview the first rows of the test frame (head() returns 5 rows by default).
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [27]:
# Report, for each split, the number of phrases (one per row) and the
# number of distinct SentenceId values.
train_phrase_count = train_data.shape[0]
train_sentence_count = len(train_data['SentenceId'].unique())
test_phrase_count = test_data.shape[0]
test_sentence_count = len(test_data['SentenceId'].unique())

print('训练集中phrases数量: {}. 训练集中sentences 数量: {}.'.format(train_phrase_count, train_sentence_count))
print('测试集中phrases数量: {}. 测试集中sentences 数量: {}.'.format(test_phrase_count, test_sentence_count))

训练集中phrases数量: 156060. 训练集中sentences 数量: 8529.
测试集中phrases数量: 66292. 测试集中sentences 数量: 3310.


In [28]:
# Group the rows by SentenceId, count the (non-null) phrases in each group,
# and print the mean of those per-sentence counts, rounded to 0 decimals.
train_phrases_per_sentence = train_data.groupby('SentenceId')['Phrase'].count()
test_phrases_per_sentence = test_data.groupby('SentenceId')['Phrase'].count()

print('训练集train_data中平均每个sentences的phrases数量：{0:.0f}.'.format(train_phrases_per_sentence.mean()))
print('测试集test_data中平均每个sentences的phrases数量：{0:.0f}.'.format(test_phrases_per_sentence.mean()))

训练集train_data中平均每个sentences的phrases数量：18.
测试集test_data中平均每个sentences的phrases数量：20.


In [41]:
# Concatenate every phrase whose SentenceId is 4 into one space-separated
# string and print it.
# NOTE(review): the original comment described this cell as inspecting trigram
# frequencies in positive phrases, but no n-gram counting happens here —
# presumably that step lives in a later cell; confirm.
sentence_mask = train_data['SentenceId'] == 4
sentence_phrases = train_data.loc[sentence_mask, 'Phrase']
text = ' '.join(sentence_phrases)
print(text)

A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder A positively thrilling combination positively thrilling combination positively thrilling combination thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder ethnography and all the intrigue , betrayal , deceit and murder ethnography and ethnography all the intrigue , betrayal , deceit and murder all the intrigue , betrayal , deceit and murder intrigue , betrayal , deceit and murder intrigue , betrayal , deceit and murder betrayal , deceit and murder betrayal , deceit and murder deceit and murder deceit and deceit murder of a Shakespearean tragedy or a jui

In [44]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam

In [53]:
# Collect every phrase into one flat list: training phrases first, followed
# by the test phrases. Print the first 50 entries as a sanity check.
full_text = [*train_data['Phrase'], *test_data['Phrase']]
print(full_text[:50])

['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .', 'A series of escapades demonstrating the adage that what is good for the goose', 'A series', 'A', 'series', 'of escapades demonstrating the adage that what is good for the goose', 'of', 'escapades demonstrating the adage that what is good for the goose', 'escapades', 'demonstrating the adage that what is good for the goose', 'demonstrating the adage', 'demonstrating', 'the adage', 'the', 'adage', 'that what is good for the goose', 'that', 'what is good for the goose', 'what', 'is good for the goose', 'is', 'good for the goose', 'good', 'for the goose', 'for', 'the goose', 'goose', 'is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .', 'is also good for the gander , some of which occasionally amuses but none of which amounts to much of a s

In [54]:
# Vectorize the text: turn each phrase into a sequence of integer word indices.
# The tokenizer lowercases its input; filters='' means no characters are
# stripped, so punctuation tokens are kept.
# NOTE(review): the vocabulary is fit on full_text, i.e. on train AND test
# phrases together.
tk = Tokenizer(filters='', lower=True)
tk.fit_on_texts(full_text)

train_phrases = train_data['Phrase']
test_phrases = test_data['Phrase']
train_tokenized = tk.texts_to_sequences(train_phrases)  # training split -> index sequences
test_tokenized = tk.texts_to_sequences(test_phrases)  # test split -> index sequences

In [57]:
# Bring every token sequence to a fixed length of max_len entries.
# With pad_sequences' default arguments, shorter sequences are left-padded
# with zeros (visible in the printed rows); per the Keras defaults, longer
# sequences are truncated from the front as well.
max_len = 50
X_train = pad_sequences(train_tokenized, maxlen=max_len)
X_test = pad_sequences(test_tokenized, maxlen=max_len)
print(X_train[:5])  # show the first five padded training rows

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     3   308     4 18636  7764     1  8498    11    55    10    51
     15     1  4750    10   185    51    15     1 14279     2    63     4
     90   558 13319    21   610     4    90  2721     6    54     4     3
     44     7]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      3   308     4 18636  7764     1  8498    11    55    10    51    15
      1  4750]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      3   308]
 [    0     0     0     0     0     0     0     0     0     0     0

In [None]:
# 根据预训练的词向量文件初始化词向量矩阵