In [62]:
import pandas as pd 
import numpy as np 
import multiprocessing
import warnings;warnings.filterwarnings('ignore')

from nltk.tokenize import WordPunctTokenizer
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence
from keras import utils as np_utils
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout,Activation, Flatten


import sys
# Raise the interpreter's recursion limit to 1,000,000.
# NOTE(review): nothing in the visible cells recurses deeply; a limit this high
# lets runaway recursion crash the kernel instead of raising RecursionError.
# Confirm it is still needed.
sys.setrecursionlimit(1000000)


In [63]:
# Hyperparameters and runtime settings for the whole notebook.
# Leave one core free for Word2Vec workers, but never drop to 0 on a single-core host.
cpu_count = max(1, multiprocessing.cpu_count() - 1)
vocab_dim = 100                            # size of each word vector / embedding output
n_iterations = 1                           # number of Word2Vec training passes (SGD iterations)
n_exposures = 2                            # minimum word frequency kept in the vocabulary
window_size = 5                            # maximum context distance used by Word2Vec
n_epoch = 4                                # training epochs for the LSTM classifier
maxlen = 110                               # length every index sequence is padded/truncated to
input_length = maxlen                      # Embedding input length; must equal the padded length
batch_size = 32


Note: with n_exposures = 7 (minimum word frequency), check the resulting dictionary size.

In [64]:
# Load the train and test CSV files, using '\n' as the line terminator.
train_data = pd.read_csv('data/train.csv', lineterminator='\n')
test_data = pd.read_csv('data/test.csv', lineterminator='\n')

# Lower-case the review text and expose it as NumPy arrays.
# `.values` replaces `Series.as_matrix()`, which is deprecated and was removed in pandas 1.0.
new_test_data = test_data['review'].str.lower()
testdata = new_test_data.values

new_train_data = train_data['review'].str.lower()
traindata = new_train_data.values

# Binary target: 1 when the label is "Positive", 0 for any other value.
train_data['num_label1'] = train_data.label.apply(lambda x: 1 if x == "Positive" else 0)
y = train_data['num_label1'].values

In [65]:
# Tokenize every review into word / punctuation tokens after stripping newlines.
# One tokenizer instance is shared instead of constructing a new one per document.
tokenizer = WordPunctTokenizer()
combined = [tokenizer.tokenize(document.replace('\n', '')) for document in traindata]
test = [tokenizer.tokenize(document.replace('\n', '')) for document in testdata]

### test[:2]
[['yaqoob',
  'memon',
  'ki',
  'phansi',
  'zalimana',
  'ghair',
  'insani',
  'hai',
  '20',
  'saal',
  'qaid',
  'kaat',
  'chukay',
  'thay',
  'amnesty',
  'international',
  'ki',
  'bharti',
  'iqdam',
  'ki',
  'muzammat'],
 ['sabit', 'qadam', 'rehna']]

In [66]:
def create_dictionaries(model):
    """Build word->index and word->vector lookups from a trained Word2Vec model.

    Args:
        model: a trained gensim Word2Vec model exposing ``model.wv``.

    Returns:
        (w2indx, w2vec): ``w2indx`` maps each vocabulary word to its gensim
        dictionary id plus one, so index 0 is never assigned to a real word
        (``parse_dataset`` uses 0 for words missing from this mapping);
        ``w2vec`` maps each of those words to its vector.
    """
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)

    # Invert id -> word into word -> id, shifted by one to keep 0 free.
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}
    # Look vectors up through `model.wv`; indexing the model directly is deprecated.
    w2vec = {word: model.wv[word] for word in w2indx.keys()}

    # Report the size of both lookups (the original printed len(w2indx) twice).
    print(len(w2indx))
    print(len(w2vec))
    return w2indx, w2vec


In [67]:
def parse_dataset(combined, index_dict):
    """Convert tokenized sentences into lists of integer word indices.

    Args:
        combined: iterable of sentences, each an iterable of hashable tokens.
        index_dict: mapping from word to integer index.

    Returns:
        A list with one list of indices per sentence. Words that are not in
        ``index_dict`` are mapped to 0.
    """
    # dict.get replaces the former bare `except:`, which hid every kind of
    # error (not only missing words) behind the 0 fallback.
    return [[index_dict.get(word, 0) for word in sentence] for sentence in combined]

In [68]:
def word2vec_train(combined):
    """Train a Word2Vec model on the tokenized corpus and return its lookups.

    Args:
        combined: list of tokenized sentences (each a list of tokens).

    Returns:
        The ``(index_dict, word_vectors)`` pair produced by
        ``create_dictionaries`` for the trained model.
    """
    w2v = Word2Vec(
        size=vocab_dim,
        min_count=n_exposures,
        window=window_size,
        workers=cpu_count,
        iter=n_iterations,
    )
    # The vocabulary has to be built before training can start.
    w2v.build_vocab(combined)
    w2v.train(combined, total_examples=w2v.corpus_count, epochs=w2v.iter)
    return create_dictionaries(model=w2v)


# Train Word2Vec on the tokenized training reviews and keep both lookups.
index_dict, word_vectors =word2vec_train(combined)


7354
7354


In [69]:
# Replace every token with its integer index (0 for words missing from index_dict).
# NOTE: this overwrites `combined` and `test`, so the cell is not safe to re-run
# without re-running the tokenizer cell first.
combined = parse_dataset(combined, index_dict)
test = parse_dataset(test, index_dict)

# n=1
[[0,
  0,
  2460,
  3413,
  0,
  1532,
  1951,
  1643,
  94,
  3773,
  3540,
  0,
  0,
  4306,
  0,
  1966,
  2460,
  782,
  0,
  2460,
  0],
 [3785, 3536, 3688]]

# n=2
[[0,
  0,
  2460,
  3413,
  0,
  1532,
  1951,
  1643,
  94,
  3773,
  3540,
  0,
  0,
  4306,
  0,
  1966,
  2460,
  782,
  0,
  2460,
  0],
 [3785, 3536, 3688]]

In [70]:
# Pad/truncate every index sequence to `maxlen` entries.
combined = sequence.pad_sequences(combined, 
                                  maxlen=maxlen)  # entries are word indices; words below the minimum frequency (n_exposures) already have index 0
test = sequence.pad_sequences(test, maxlen=maxlen)
test[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 4462, 3760, 5262,    0, 2355, 2991, 2542,  122, 5815,
        5467,    0,    0, 6680,    0, 3014, 3760, 1172,    0, 3760,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,

In [71]:
def get_data(index_dict, word_vectors, combined, y, test_size=0, random_state=None):
    """Build the embedding matrix and the (optionally split) training data.

    Args:
        index_dict: mapping word -> integer index (indices start at 1).
        word_vectors: mapping word -> vector of length ``vocab_dim``.
        combined: padded index sequences, one row per sample.
        y: integer class labels (0 or 1), one per sample.
        test_size: forwarded to ``train_test_split`` when non-zero. The
            default 0 keeps the original behaviour: all samples are shuffled
            into the training set and the test set is empty.
        random_state: optional seed for the shuffle / split.

    Returns:
        ``(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)``
        where the labels are one-hot encoded with two classes.
    """
    # One extra row because index 0 is not assigned to any word.
    n_symbols = len(index_dict) + 1
    # Row 0 stays all zeros; every other row receives the word's vector.
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    for word, index in index_dict.items():
        embedding_weights[index, :] = word_vectors[word]

    if test_size:
        x_train, x_test, y_train, y_test = train_test_split(
            combined, y, test_size=test_size, random_state=random_state)
    else:
        # Recent scikit-learn releases reject test_size=0, so reproduce the
        # "shuffle everything into train, leave test empty" result by hand.
        features = np.asarray(combined)
        labels = np.asarray(y)
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        order = rng.permutation(len(features))
        x_train, y_train = features[order], labels[order]
        x_test, y_test = features[:0], labels[:0]

    y_train = np_utils.to_categorical(y_train, num_classes=2)
    y_test = np_utils.to_categorical(y_test, num_classes=2)
    return n_symbols, embedding_weights, x_train, y_train, x_test, y_test


# Build the embedding matrix and the train/test arrays from the indexed training reviews.
n_symbols,embedding_weights,x_train,y_train,x_test,y_test=get_data(index_dict, 
                                                                   word_vectors,combined,y)

In [72]:
# Classifier: Embedding -> LSTM -> Dropout -> 2-way softmax.
model = Sequential()

# The embedding layer is initialised from `embedding_weights`.
# NOTE(review): index 0 is used both for padding and for out-of-vocabulary
# words, so mask_zero=True presumably masks unknown words as well. Confirm this is intended.
model.add(Embedding(output_dim=vocab_dim, input_dim=n_symbols, mask_zero=True,
                    weights=[embedding_weights], input_length=input_length))

model.add(LSTM(50, activation='tanh'))
model.add(Dropout(0.3))                    # originally 0.5
model.add(Dense(2, activation='softmax'))  # fully connected output layer with 2 units
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 110, 100)          735500    
_________________________________________________________________
lstm_5 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dropout_5 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 102       
Total params: 765,802
Trainable params: 765,802
Non-trainable params: 0
_________________________________________________________________
None


In [73]:
# Train the model for n_epoch epochs.
model.fit(x_train, y_train, batch_size=batch_size, epochs = n_epoch, 
          verbose=1)  # verbose: 0 = no log output, 1 = progress bar, 2 = one line per epoch

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1a381141b38>

In [74]:
# Evaluate on the test split returned by get_data.
# NOTE(review): get_data is called with its split size at 0, so x_test is
# presumably empty, which would explain the recorded `[]` output. Confirm.
score = model.evaluate(x_test, y_test,batch_size=batch_size)
score

[]

Earlier runs ([loss, accuracy] from model.evaluate):
LSTM with relu activation, 4 epochs: [0.5150559049741164, 0.7519747231620559]
LSTM with relu activation, 5 epochs: [0.8224221409591265, 0.7519747240095154]

In [75]:
# Predict the probability of the positive class (column 1 of the softmax
# output, since "Positive" was encoded as label 1) for every test review.
# `predict` replaces `predict_proba`, which later Keras releases removed; for
# this softmax model both return the class probabilities.
y_lstm = model.predict(test, batch_size=batch_size)[:, 1]
lstm_output = pd.DataFrame(data={"ID": test_data["ID"], "Pred": y_lstm})
# quoting=2 is csv.QUOTE_NONNUMERIC: every non-numeric field is quoted.
lstm_output.to_csv('lstm_new.csv', index=False, quoting=2)

In [35]:
# Interactive help lookup removed from the final notebook; run `pd.Series.to_csv?` in a scratch cell if needed.