# LSTM Sentiment Analysis DEMO

In [40]:
import warnings

import numpy as np
import jieba

from typing import List

from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from gensim.corpora.dictionary import Dictionary

from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation
from keras.utils.np_utils import to_categorical

warnings.filterwarnings('ignore')

In [4]:
def build_word_vec(text_file, save_file, user_dicts=None, drop=None, size=128, **kwargs):
    """
    Train a word2vec model on a text corpus and save its vectors to disk.

    text_file: path of the corpus file (read line by line), or an
        already-loaded iterable of lines.
    save_file: destination of the trained vectors (binary word2vec format).
    user_dicts: optional iterable of jieba user-dictionary paths, loaded
        before segmentation.
    drop: tokens discarded after segmentation; a falsy value means [" "].
    size: forwarded to Word2Vec as its ``size`` argument.
    kwargs: forwarded unchanged to Word2Vec.

    Returns the trained Word2Vec model.
    """
    stop_tokens = drop if drop else [" "]

    for dict_path in user_dicts or ():
        jieba.load_userdict(dict_path)

    if isinstance(text_file, str):
        # The file is opened in binary mode, so jieba receives bytes lines.
        with open(text_file, "rb") as handle:
            raw_lines = handle.readlines()
    else:
        raw_lines = text_file

    # One token list per input line, minus the dropped tokens.
    corpus = [
        [token for token in jieba.lcut(raw.strip()) if token not in stop_tokens]
        for raw in raw_lines
    ]

    w2v_model = Word2Vec(corpus, size=size, **kwargs)
    w2v_model.wv.save_word2vec_format(save_file, binary=True)
    return w2v_model

def load_w2v(w2v_file):
    """
    Load word vectors stored in binary word2vec format.

    Returns the object produced by KeyedVectors.load_word2vec_format for
    ``w2v_file`` (keyed vectors, not a full Word2Vec model).
    """
    keyed_vectors = KeyedVectors.load_word2vec_format(w2v_file, binary=True)
    return keyed_vectors


In [5]:
def gen_w2ix(w2v_model):
    """
    Build a word -> index lookup from the vocabulary of a word2vec model.

    Every vocabulary word is registered in a gensim Dictionary; the index
    stored for a word is its Dictionary id plus one.
    """
    vocab_words = w2v_model.wv.vocab.keys()

    id_table = Dictionary()
    # doc2bow is called only for its side effect of registering the words.
    id_table.doc2bow(vocab_words, allow_update=True)

    w2ix = {}
    for token_id, word in id_table.items():
        w2ix[word] = token_id + 1
    return w2ix

In [6]:
def get_ix_vec(sents: List[List[str]], w2ix):
    """
    Convert sentences into arrays of vocabulary indices.

    sents: iterable of sentences. A sentence is normally a list of tokens;
        a raw ``str`` is also accepted and is segmented with ``jieba.lcut``
        (after ``strip()``), so lookups happen per word instead of per
        character.
    w2ix: dict mapping a word to its index.

    Words that are not in ``w2ix`` are encoded as 0.

    Returns a numpy array with one entry per sentence: a regular 2-D array
    when all sentences have the same length, otherwise a 1-D object array
    whose elements are 1-D index arrays.
    """
    encoded = []
    for sen in sents:
        if isinstance(sen, str):
            # Iterating a raw string would look up single characters.
            sen = jieba.lcut(sen.strip())
        # Only a missing word falls back to 0; other errors are not hidden.
        encoded.append(np.array([w2ix.get(word, 0) for word in sen]))

    if len({len(row) for row in encoded}) <= 1:
        return np.array(encoded)

    # Rows of different lengths: np.array() cannot build this implicitly on
    # current NumPy, so fill an object array element by element.
    ragged = np.empty(len(encoded), dtype=object)
    for pos, row in enumerate(encoded):
        ragged[pos] = row
    return ragged

In [7]:
def gen_w2ix_weight(index_dic, w2v_model):
    """
    Build the embedding matrix that lines up with a word -> index mapping.

    index_dic: mapping from word to row index.
    w2v_model: object exposing ``vector_size`` and ``w2v_model[word]``.

    Returns an array of shape (len(index_dic) + 1, vector_size); rows that
    no word points to stay all-zero.
    """
    n_rows = len(index_dic) + 1
    embedding = np.zeros((n_rows, w2v_model.vector_size))

    for word in index_dic:
        embedding[index_dic[word]] = w2v_model[word]

    return embedding

In [8]:
# Build, train and evaluate the LSTM classifier
def train_lstm(embedding_weights, x_train, y_train, x_test, y_test, **kwargs):
    """
    Build, train and evaluate an Embedding -> LSTM -> Dropout -> Dense model.

    embedding_weights: 2-D array used to initialise the Embedding layer;
        its shape supplies the vocabulary size and the embedding width.
    x_train, y_train: training inputs and targets passed to ``model.fit``.
    x_test, y_test: used as validation data while fitting and again for
        the final ``model.evaluate`` call.
    kwargs: optional overrides
        lstm_out_dim   units of the LSTM layer        (default 64)
        lstm_actv      LSTM activation                (default 'tanh')
        lstm_drop_out  LSTM ``dropout``               (default .2)
        drop_out       rate of the Dropout layer      (default .3)
        dens_actv      activation of the Dense layer  (default 'softmax')

    Reads the module-level globals INPUT_LEN, N_CLASS, BATCH_SIZE and EPOCH.

    Returns the tuple ``(model, dens_actv)``.
    """
    dens_actv = kwargs.get('dens_actv', 'softmax')
    # Take both embedding dimensions from the matrix itself so the layer
    # can never disagree with the weights it is initialised from.
    vocab_size, embed_dim = np.shape(embedding_weights)

    print ('Creating a model...')
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embed_dim,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=INPUT_LEN))
    model.add(LSTM(units=kwargs.get('lstm_out_dim', 64),
                   activation=kwargs.get('lstm_actv', 'tanh'),
                   dropout=kwargs.get('lstm_drop_out', .2)))
    model.add(Dropout(kwargs.get('drop_out', .3)))
    model.add(Dense(units=N_CLASS, activation=dens_actv))
    # NOTE(review): tanh is applied on top of the Dense activation while the
    # loss is categorical_crossentropy; kept as in the original model, but
    # this looks unintended -- confirm before relying on the probabilities.
    model.add(Activation('tanh'))

    print ('Compiling...')
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print ("Training...")
    model.fit(x_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=EPOCH,
              validation_data=(x_test, y_test))

    print ("Evaluating...")
    score, acc = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
    print ('Test score: %.3f' % score)
    print ('Test accuracy: %.3f' % acc)
    return model, dens_actv

---

## Demo 1: Sentiment analysis of online shopping reviews (binary)

In [9]:
# Load the positive and negative comment files and build matching labels
import copy

with open('./data/comments/pos.txt', 'r', encoding='utf-8') as f:
    pos = f.readlines()
label_pos = [1] * len(pos)

with open('./data/comments/neg.txt', 'r', encoding='utf-8') as f:
    neg = f.readlines()
label_neg = [0] * len(neg)

# Positives first, then negatives; the labels follow the same order.
# New lists are built so pos / neg and their label lists stay untouched.
all_txt = [*pos, *neg]
all_lable = [*label_pos, *label_neg]

In [10]:
# Word2vec for the comments corpus.
# Training is left commented out; the vectors are loaded from the saved file instead.
# w2v = build_word_vec(all_txt, "data/comments/comments_w2v.bin", min_count=5, window=5)
w2v = load_w2v("data/comments/comments_w2v.bin")
# Word -> index lookup derived from the loaded vocabulary
w2ix = gen_w2ix(w2v)

In [11]:
# Define global variables used by the data-preparation and training cells
N_CLASS = 2                 # two labels in this demo: 1 (pos) and 0 (neg)
W2IX_DIM = len(w2ix) + 1    # one slot per w2ix entry plus index 0
EPOCH = 5                   # number of training epochs
BATCH_SIZE = 32             # batch size for fitting and evaluation

In [12]:
# Prepare data: index sequences for the comments and the embedding matrix
comments = get_ix_vec(all_txt, w2ix)
weights = gen_w2ix_weight(w2ix, w2v.wv)

# Length of the longest encoded comment; every sequence is padded to it.
INPUT_LEN = max(len(seq) for seq in comments)

train_x, test_x, train_y, test_y = train_test_split(comments, np.array(all_lable))

# Pad the inputs to a common length and one-hot encode the labels.
train_x, test_x = (sequence.pad_sequences(part, maxlen=INPUT_LEN) for part in (train_x, test_x))
train_y, test_y = (to_categorical(part, num_classes=N_CLASS) for part in (train_y, test_y))

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(len(train_x)), "\nTest set: \t\t{}".format(len(test_x)))

			Feature Shapes:
Train set: 		15843 
Test set: 		5281


In [13]:
# Train the model
# train_lstm returns (model, dens_actv); keep only the model so that
# fitted.predict_classes(...) can be called on it afterwards.
fitted = train_lstm(weights, train_x, train_y, test_x, test_y)[0]

In [379]:
# Test the result
# Sample inputs (kept verbatim so they can be pasted into t_txt):
# pos 热水器挺好的，比实体店优惠多了，关键是好用，发货速度 物流速度都快，还有赠品
# neg 总的感觉前言不搭后语，浪费了银子，呜呜
t_txt = "总的感觉前言不搭后语，浪费了银子，呜呜"
# Pad to INPUT_LEN, the length the model was built with, not a fixed 45.
test = sequence.pad_sequences([get_ix_vec([t_txt], w2ix)[0]], INPUT_LEN)
fitted.predict_classes(test)

array([0])

---

## Demo 2: Sentiment analysis of financial news headlines (three classes)

In [9]:
# Train word2vec on the three news document files and save the vectors
sentences = []
for part in (1, 2, 3):
    with open('./data/news/doc_p'+str(part)+'.txt', 'r', encoding='utf-8') as f:
        # Keep only lines longer than 20 characters.
        doc = [d for d in f.readlines() if len(d) > 20]
    for raw in doc:
        # Segment the stripped line and discard single-space tokens.
        tokens = [w for w in jieba.lcut(raw.strip()) if w != " "]
        sentences.append(tokens)
    del doc

model = Word2Vec(sentences, size=128, min_count=10, window=5)
model.wv.save_word2vec_format("./data/news/news_w2v.bin", binary=True)
# The tokenised corpus is no longer needed once the vectors are saved.
del sentences

In [42]:
# Load the news titles and their sentiment labels
with open('./data/news/titles_sign+3.txt', 'r', encoding='utf-8') as f:
    titles = f.read()
with open('./data/news/sentiments_sign+3.txt', 'r', encoding='utf-8') as f:
    sentiments = f.read()

from string import punctuation
symbols = '！；：？【】★■●↑“”，。、~@#￥%《》……&*（）0123456789' + punctuation
# Delete every character listed in symbols from the titles.
titles = titles.translate(str.maketrans('', '', symbols))

# Both files are split on tabs; the piece after the last tab is dropped.
all_txt = titles.split('\t')[:-1]
all_lable = sentiments.split("\t")[:-1]

In [43]:
# Load the news word vectors from disk and rebuild the word -> index lookup
w2v = load_w2v("data/news/news_w2v.bin")
w2ix = gen_w2ix(w2v)

In [44]:
# Define global variables used by the data-preparation and training cells
N_CLASS = 3                 # number of sentiment classes in this demo
W2IX_DIM = len(w2ix) + 1    # one slot per w2ix entry plus index 0
EPOCH = 5                   # number of training epochs
BATCH_SIZE = 32             # batch size for fitting and evaluation

In [45]:
# Prepare the data: index sequences for the titles and the embedding matrix
titles = get_ix_vec(all_txt, w2ix)
weights = gen_w2ix_weight(w2ix, w2v.wv)

# Length of the longest encoded title; every sequence is padded to it.
INPUT_LEN = max(len(seq) for seq in titles)

train_x, test_x, train_y, test_y = train_test_split(titles, np.array(all_lable))

# Pad the inputs to a common length and one-hot encode the labels.
train_x, test_x = (sequence.pad_sequences(part, maxlen=INPUT_LEN) for part in (train_x, test_x))
train_y, test_y = (to_categorical(part, num_classes=N_CLASS) for part in (train_y, test_y))

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(len(train_x)), "\nTest set: \t\t{}".format(len(test_x)))

			Feature Shapes:
Train set: 		1450 
Test set: 		484


In [1]:
# Train the model
# dens_actv='sigmoid' overrides the default Dense activation; [0] keeps only
# the model from the (model, dens_actv) tuple that train_lstm returns.
fitted = train_lstm(weights, train_x, train_y, test_x, test_y, dens_actv='sigmoid')[0]

In [60]:
# Test the result
# Sample headlines (kept verbatim so they can be pasted into t_txt):
# pos 凯撒旅游与中国光大银行深化合作 打开全渠道发展空间
# neg 小康股份收购美国电池公司，遭上交所问询
# neu 龙韵股份05月29日资金揭秘
t_txt = "贵州茅台已经“失控”国家队都拯救不了"
# Encode the headline, pad it to INPUT_LEN and ask the model for a class
test = sequence.pad_sequences([get_ix_vec([t_txt], w2ix)[0]], INPUT_LEN)
fitted.predict_classes(test)

array([2])