In [5]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM,Dropout,Dense,Embedding,BatchNormalization,Dropout,GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam,SGD
import matplotlib.pyplot as plt
%matplotlib inline


In [6]:
path = ''
pd_all = pd.read_csv(path + 'ChnSentiCorp_htl_all.csv')

pd_positive = pd_all[pd_all.label==1]
pd_negative = pd_all[pd_all.label==0]

def get_balance_corpus(corpus_size, corpus_pos, corpus_neg):
    sample_size = corpus_size // 2
    pd_corpus_balance = pd.concat([corpus_pos.sample(sample_size, replace=corpus_pos.shape[0]<sample_size), \
                                   corpus_neg.sample(sample_size, replace=corpus_neg.shape[0]<sample_size)])
    
    print('评论数目（总体）：%d' % pd_corpus_balance.shape[0])
    print('评论数目（正向）：%d' % pd_corpus_balance[pd_corpus_balance.label==1].shape[0])
    print('评论数目（负向）：%d' % pd_corpus_balance[pd_corpus_balance.label==0].shape[0])    
    
    return pd_corpus_balance

In [7]:
import hanlp
tokenizer = hanlp.load('LARGE_ALBERT_BASE')

ChnSentiCorp_htl_ba_2000 = get_balance_corpus(2000, pd_positive, pd_negative)


评论数目（总体）：2000
评论数目（正向）：1000
评论数目（负向）：1000


In [8]:
X = ChnSentiCorp_htl_ba_2000.review.tolist()
import re

def find_chn(list_of_string):  #提取汉字
    list_s = []
    pattern = re.compile(r'[^\u4e00-\u9fa5]')
    for s in list_of_string:
        s = re.sub(pattern,'',s)
        list_s.append(s)
    return list_s
        
X_chn = find_chn(X)

In [9]:
def trunc_str(list_of_string,length):   # 切割字符串，保留最后100个汉字
    X_cut = []
    for i in list_of_string:
        X_cut.append(i[-1*length:])
    return X_cut

X_cut = trunc_str(X_chn,100)

In [10]:
X_token = tokenizer(X_cut)  # 分词

In [31]:
X_token[0]

['于',
 '给到',
 '我们',
 '的',
 '房间',
 '与',
 '我们',
 '的',
 '要求',
 '有',
 '出入',
 '酒店',
 '人员',
 '立即',
 '想',
 '办法',
 '帮',
 '我们',
 '解决',
 '听说',
 '是',
 '他们',
 '的',
 '财务',
 '经理',
 '把',
 '房间',
 '让给',
 '了',
 '我们',
 '让',
 '我们',
 '很',
 '感动',
 '房间',
 '的',
 '价钱',
 '贵了点',
 '但',
 '物有所值',
 '下次',
 '去',
 '九寨沟',
 '还',
 '住',
 '喜来',
 '登',
 '这次',
 '汶川',
 '地震',
 '应该',
 '没有',
 '影响',
 '到',
 '他们',
 '吧',
 '祝',
 '一切',
 '平安']

In [24]:
from gensim.models import Word2Vec

In [25]:
model = Word2Vec(sentences=X_token, size = 100, window = 5, min_count=1, workers = 4)
model.save('word2vec.model')

In [26]:
model = Word2Vec.load('word2vec.model')

In [27]:
# 查看单词的词向量
vector = model.wv['缺点']
vector

array([-2.83350935e-04, -8.27597231e-02,  3.36558849e-01, -3.60424519e-02,
       -1.86550587e-01, -2.68017985e-02, -2.21636370e-01, -1.11095332e-01,
       -8.84630382e-02,  1.10094801e-01, -1.97244599e-01,  1.56114876e-01,
       -8.80174264e-02,  4.47545387e-02, -1.06314436e-01, -2.27199107e-01,
        5.25917597e-02, -1.73749235e-02, -2.33041599e-01,  1.61826819e-01,
        2.06463665e-01,  1.07508063e-01, -2.33329311e-01,  8.47866833e-02,
        1.36503294e-01, -3.50240548e-03, -8.46235976e-02,  1.85038134e-01,
        8.97972435e-02, -5.57922833e-02,  2.38069490e-01, -1.11047946e-01,
       -3.02047879e-01,  1.59190029e-01, -6.55211359e-02,  8.01820755e-02,
        1.00730203e-01,  1.68091476e-01, -5.44425473e-02, -8.55500326e-02,
       -2.04417184e-01,  8.49214643e-02, -1.00701056e-01, -9.04012248e-02,
       -3.79488128e-03,  1.11045867e-01, -9.65351355e-04, -7.00161234e-02,
       -1.31335765e-01, -4.58520204e-02, -9.54950675e-02,  4.37584892e-02,
       -7.24981129e-02,  

In [36]:
# 给定单词，找出含义最接近的词
request_count = 5
for key in model.wv.similar_by_word('抽烟',topn=10):
        if len(key[0])==2:
            request_count -= -1
            print(key[0],key[1])
            if request_count ==0:
                break

好用 0.930890679359436
晚饭 0.930504560470581
一会 0.9302356243133545
开灯 0.9301743507385254
矿业 0.9299736022949219
妹子 0.9298959970474243
表达 0.9295830726623535
到底 0.9292734861373901


In [28]:
print(model.wv.similarity('豪华', '商务'))  #余弦相似度

0.99982166


In [32]:
print(model.wv.similarity('平安', '听说'))

0.7920965
