In [13]:
import sys
import json
import multiprocessing

import yaml
import pandas as pd
import numpy as np
import jieba
import keras
from keras.preprocessing import sequence
# MODIFY: model_from_yaml => model_from_json yaml格式因为有重大安全漏洞而被弃用
from keras.models import Sequential, model_from_json
# MODIFY: 从layers的子模块导入，修改为直接从layers导入
from keras.layers import Embedding, LSTM, Dense, Dropout, Activation
from gensim.corpora.dictionary import Dictionary
from gensim.models.word2vec import Word2Vec
# MODIFY: sklearn.cross_validation => sklearn.model_selection
from sklearn.model_selection  import train_test_split

In [2]:

neg=pd.read_csv('../data/neg.csv',header=None,index_col=None)
pos=pd.read_csv('../data/pos.csv',header=None,index_col=None, on_bad_lines='skip')
neu=pd.read_csv('../data/neutral.csv', header=None, index_col=None)

In [3]:
neu[0]

0        The thumbnails in the interface are too small ...
1                                                  So good
2        Please Take Out that Watch temporary thing it'...
3        Netflix Apni sabhi Video ko sabhi Bhashao me D...
4        Good collection of shows and films with some n...
                               ...                        
11878    Please update Netflix in which there is no scr...
11879    Please provide update information. The slightl...
11880    There is bug when i try share to instagram sto...
11881    Yeah i think it's the best app sofar it's easi...
11882    How can I delete one of the users? Please. Hel...
Name: 0, Length: 11883, dtype: object

In [4]:
# ADD: 添加填零操作，避免出现na被解析成float，导致后面的str操作失败
pos.fillna('', inplace=True)
neu.fillna('', inplace=True)
neg.fillna('', inplace=True)
combined = np.concatenate((pos[0], neu[0], neg[0]))
combined.shape, combined[0]

((112271,), 'I love this app')

In [5]:
# pos -> 1; neu -> 0; neg -> -1
y = np.concatenate(
    (np.ones(len(pos), dtype=int), 
    np.zeros(len(neu), dtype=int), 
    -1*np.ones(len(neg),dtype=int))
    )
y.shape

(112271,)

In [6]:
#对句子经行分词，并去掉换行符
def tokenizer(text):
    ''' Simple Parser converting each document to lower-case, then
        removing the breaks for new lines and finally splitting on the
        whitespace
    '''
    text = [jieba.lcut(document.replace('\n', '')) for document in text]
    return text

# # 检测是否有不为str类型的数据
# count = 0
# for document in combined:
#     if not isinstance(document, str):
#         print(document, type(document), count)
#     count += 1

combined = tokenizer(combined)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.858 seconds.
Prefix dict has been built successfully.


In [7]:
cpu_count = multiprocessing.cpu_count() # 4
vocab_dim = 100
n_iterations = 10  # ideally more..
n_exposures = 10 # 所有频数超过10的词语
window_size = 7
n_epoch = 4
input_length = 100
maxlen = 100

def create_dictionaries(model: Word2Vec=None,
                        combined=None):
    ''' 
    Function does are number of Jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        # MODIFY: model.vocab => model.wv.key_to_index
        gensim_dict.doc2bow(model.wv.key_to_index.keys(),
                            allow_update=True)
        #  freqxiao10->0 所以k+1
        w2indx = {v: k+1 for k, v in gensim_dict.items()} # 所有频数超过10的词语的索引,(k->v)=>(v->k)
        # MODIFY: model[word] => model.wv[word]
        w2vec = {word: model.wv[word] for word in w2indx.keys()} # 所有频数超过10的词语的词向量, (word->model(word))

        def parse_dataset(combined): # 闭包-->临时使用
            ''' Words become integers
            '''
            data=[]
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0) # freqxiao10->0
                data.append(new_txt)
            return data # word=>index
        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)#每个句子所含词语对应的索引，所以句子中含有频数小于10的词语，索引为0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')


#创建词语字典，并返回每个词语的索引，词向量，以及每个句子所对应的词语索引
def word2vec_train(combined):
    # MODIFY: 新版本中size变成了vector_size，iter变成了epochs
    model = Word2Vec(vector_size=vocab_dim,
                     min_count=n_exposures,
                     window=window_size,
                     workers=cpu_count,
                     epochs=n_iterations)
    model.build_vocab(combined) # input: list
    # MODIFY: 新版本需要添加参数total_examples和epochs
    # model.train(combined)
    model.train(combined, total_examples=model.corpus_count, epochs=50)
    model.save('../model/Word2vec_model.pkl')
    index_dict, word_vectors, combined=create_dictionaries(model=model,combined=combined)
    return index_dict, word_vectors,combined

print('Training a Word2vec model...')
index_dict, word_vectors, combined=word2vec_train(combined)

Training a Word2vec model...


In [18]:
np.random.seed(1337)  # For Reproducibility
sys.setrecursionlimit(1000000)
batch_size = 32


def get_data(index_dict, word_vectors, combined, y):

    n_symbols = len(index_dict) + 1  # 所有单词的索引数，频数小于10的词语索引为0，所以加1
    embedding_weights = np.zeros((n_symbols, vocab_dim)) # 初始化 索引为0的词语，词向量全为0
    for word, index in index_dict.items(): # 从索引为1的词语开始，对每个词语对应其词向量
        embedding_weights[index, :] = word_vectors[word]
    x_train, x_test, y_train, y_test = train_test_split(combined, y, test_size=0.2)
    y_train = keras.utils.to_categorical(y_train, num_classes=3) 
    y_test = keras.utils.to_categorical(y_test, num_classes=3)
    # print x_train.shape,y_train.shape
    return n_symbols, embedding_weights, x_train, y_train, x_test, y_test


##定义网络结构
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test):
    print('Defining a Simple Keras Model...')
    model = Sequential()  # or Graph or whatever
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))  # Adding Input Length
    # MODIFY: ouotput_dim => units
    model.add(LSTM(units=50, activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax')) # Dense=>全连接层,输出维度=1
    model.add(Activation('softmax'))

    print('Compiling the Model...')
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',metrics=['accuracy'])

    print("Train...") # batch_size=32
    model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epoch,verbose=1)

    print("Evaluate...")
    score = model.evaluate(x_test, y_test,
                                batch_size=batch_size)

    # MODIFY: yaml格式改为json格式
    # MODIFY: yaml_string = model.to_yaml() => json_string = model.to_json()
    json_string = model.to_json()
    with open('../model/lstm.json', 'w') as outfile:
        # MODIFY: yaml.dump(yaml_string) => json.dump(json_string)
        outfile.write( json.dumps(json_string) )
    # MODIFY: 修改后缀： .h5 => .weights.h5
    model.save_weights('../model/lstm.weights.h5')
    print('Test score:', score)

print('Setting up Arrays for Keras Embedding Layer...')
n_symbols, embedding_weights, x_train, y_train, x_test, y_test = get_data(index_dict, word_vectors, combined, y)
print("x_train.shape and y_train.shape:")
print(x_train.shape, y_train.shape)
train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)

Setting up Arrays for Keras Embedding Layer...
x_train.shape and y_train.shape:
(89816, 100) (89816, 3)
Defining a Simple Keras Model...
Compiling the Model...
Train...
Epoch 1/4
[1m2807/2807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 42ms/step - accuracy: 0.7017 - loss: 0.8455
Epoch 2/4
[1m2807/2807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 42ms/step - accuracy: 0.7801 - loss: 0.7692
Epoch 3/4
[1m2807/2807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 42ms/step - accuracy: 0.7935 - loss: 0.7562
Epoch 4/4
[1m2807/2807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 42ms/step - accuracy: 0.7978 - loss: 0.7523
Evaluate...
[1m702/702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.7897 - loss: 0.7592
Test score: [0.7569557428359985, 0.7917612791061401]


In [23]:
"""
预测
"""

np.random.seed(1337)  # For Reproducibility
sys.setrecursionlimit(1000000)
# define parameters
maxlen = 100

def create_dictionaries(model=None,
                        combined=None):
    ''' Function does are number of Jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.key_to_index.keys(),
                            allow_update=True)
        #  freqxiao10->0 所以k+1
        w2indx = {v: k+1 for k, v in gensim_dict.items()}#所有频数超过10的词语的索引,(k->v)=>(v->k)
        w2vec = {word: model.wv[word] for word in w2indx.keys()}#所有频数超过10的词语的词向量, (word->model(word))

        def parse_dataset(combined): # 闭包-->临时使用
            ''' Words become integers
            '''
            data=[]
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0) # freqxiao10->0
                data.append(new_txt)
            return data # word=>index
        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)#每个句子所含词语对应的索引，所以句子中含有频数小于10的词语，索引为0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')


def input_transform(string):
    words=jieba.lcut(string)
    words=np.array(words).reshape(1,-1)
    model=Word2Vec.load('../model/Word2vec_model.pkl')
    _, _, combined=create_dictionaries(model,words)
    return combined


def lstm_predict(string):
    print('loading model......')
    # MODIFY: yaml读取修改为json读取
    # MODIFY: .yaml => .json
    with open('../model/lstm.json', 'r') as f:
        # MODIFY: yaml_string = yaml.load(f) => json_string = json.load(f)
        json_string = json.load(f)
    # MODIFY: model_from_yaml(yaml_string) => model_from_json(json_string)
    model = model_from_json(json_string)

    print('loading weights......')
    model.load_weights('../model/lstm.weights.h5')
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',metrics=['accuracy'])
    data=input_transform(string)
    data.reshape(1,-1)
    # print data
    # MODIFY: predict_classes => predict
    result = model.predict(data)
    # ADD: 添加这一行，使得 result 和之前的 predict_classes 的结果保持一致
    result = np.argmax(result, axis=1)
    print(result) # [[1]]
    if result[0]==1:
        print(string,' positive')
    elif result[0]==0:
        print(string,' neural')
    else:
        print(string,' negative')

In [27]:
# string='酒店的环境非常好，价格也便宜，值得推荐'
# string='手机质量太差了，傻逼店家，赚黑心钱，以后再也不会买了'
# string = "这是我看过文字写得很糟糕的书，因为买了，还是耐着性子看完了，但是总体来说不好，文字、内容、结构都不好"
# string = "虽说是职场指导书，但是写的有点干涩，我读一半就看不下去了！"
string = "书的质量还好，但是内容实在没意思。本以为会侧重心理方面的分析，但实际上是婚外恋内容。"
# string = "不是太好"
# string = "不错不错"
# string = "非常好非常好！！"
# string = "真的一般，没什么可以学习的"

lstm_predict(string)

loading model......
loading weights......
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 326ms/step
[1]
书的质量还好，但是内容实在没意思。本以为会侧重心理方面的分析，但实际上是婚外恋内容。  positive
