In [12]:
#载入需要用到的包
import numpy as np 
import pandas as pd
import re
import itertools
#from LAC import LAC
import jieba #分词
import jieba.posseg as pseg #词性
from collections import Counter

# 数据清洗

In [13]:
# Load the full raw review table from the Excel export.
data = pd.read_excel('京东评论-蓝牙耳机.xlsx')
def get_stopwords(stopwords):
    """Read a UTF-8 text file and return its lines as a list of stripped strings.

    Args:
        stopwords: Path of the file to read.

    Returns:
        One entry per line of the file, with leading/trailing whitespace
        removed. Blank lines are kept as empty strings.
    """
    with open(stopwords, encoding="utf-8") as handle:
        return [line.strip() for line in handle]
# Stop-word lexicon, read line by line from stopwords.txt.
stoplist = get_stopwords("stopwords.txt")


In [14]:
# Sentiment lexicons. get_stopwords is reused here as a generic line-per-entry reader.
# NOTE(review): the ".gb" in the Tsinghua file names suggests a GB encoding while
# get_stopwords opens files as UTF-8 — confirm the files were re-encoded.
sentiment_pos = (
    get_stopwords("./情感词典/清华大学李军中文褒贬义词典/tsinghua.positive.gb.txt")
    + get_stopwords("./情感词典/台湾大学NTUSD简体中文情感词典/ntusd-positive.txt")
)
sentiment_neg = (
    get_stopwords("./情感词典/清华大学李军中文褒贬义词典/tsinghua.negative.gb.txt")
    + get_stopwords("./情感词典/台湾大学NTUSD简体中文情感词典/ntusd-negative.txt")
)
sentiment_ben = pd.read_excel('./情感词典/情感词汇本体/情感词汇本体.xlsx')
ben_words = sentiment_ben['词语']
ben_polarity = sentiment_ben['极性']

# `sentiment`: every lexicon word regardless of polarity, deduplicated.
sentiment = list(set(ben_words.values.tolist() + sentiment_pos + sentiment_neg))
# Rows with 极性 == 1 extend the positive list, rows with 极性 == 2 the negative list.
sentiment_pos = list(set(sentiment_pos + ben_words.loc[ben_polarity == 1].values.tolist()))
sentiment_neg = list(set(sentiment_neg + ben_words.loc[ben_polarity == 2].values.tolist()))

In [15]:
# Remove the columns not used by the analysis, then exact duplicate rows,
# then reviews whose text is missing.
data = (
    data
    .drop(columns=['追评时间', '追评内容', '商品属性', '页面网址', '采集时间'])
    .drop_duplicates()
    .dropna(subset=['评价内容'])
)
# Missing values remaining per column.
data.isna().sum()

会员               0
级别             692
评价星级             0
评价内容             0
时间               0
点赞数              0
评论数            100
页面标题             0
Helpfulness      0
dtype: int64

In [16]:
# Clause delimiters: ASCII and full-width punctuation, the space and the newline.
pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|，|。|、|；|‘|’|【|】|·|！| |…|（|）|：|\n|•|？'
_clause_splitter = re.compile(pattern)


def sub(b):
    """Split the text ``b`` at every delimiter matched by ``pattern``.

    Returns the list produced by the regex split; adjacent delimiters and
    delimiters at either end of the text produce empty strings in it.
    """
    return _clause_splitter.split(b)
# Split every review into clauses and drop the empty strings that the split
# emits when two delimiters are adjacent.
# The column is built in a single assignment: writing through the chained form
# data['Subse'].iloc[i] = ... may target a temporary copy (pandas raises
# SettingWithCopyWarning for it) instead of `data`.
data['Subse'] = data['评价内容'].apply(
    lambda text: [clause for clause in sub(text) if clause != '']
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [21]:
# Spot-check the clause splitting: print one raw review, then show its clause list.
print(data['评价内容'].iloc[19])
data['Subse'].iloc[19]

上午十点下单，下午三点半就到了，快递速度快到离谱。
音质还行，然后连接速度一般，电量可以用很久。


['上午十点下单', '下午三点半就到了', '快递速度快到离谱', '音质还行', '然后连接速度一般', '电量可以用很久']

In [22]:
# Part-of-speech tagging of each full review with jieba (an earlier LAC-based
# variant was disabled). The tags are used to count adjectives, adverbs and verbs.
def _pos_tag(text):
    """Tag ``text`` and return ``[words, flags]`` as two parallel lists.

    Empty input yields ``[[], []]`` so that the flag list can always be read
    at index 1.
    """
    words, flags = [], []
    for word, flag in pseg.cut(text):
        words.append(word)
        flags.append(flag)
    return [words, flags]


# One assignment for the whole column; per-row writes via
# data['评价分词'].iloc[i] = ... are chained assignments and may not reach `data`.
data['评价分词'] = data['评价内容'].apply(_pos_tag)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\DELL\AppData\Local\Temp\jieba.cache
Loading model cost 0.733 seconds.
Prefix dict has been built successfully.


In [23]:
# 评论分词，用于构建语料库，文本分析
# lac2 = LAC(mode='seg')
# data['Segment'] = data['Subse'].apply(lac2.run)
# for i in range(len(data)):
#     a = data['Segment'].iloc[i]
#     a = list(itertools.chain(*a))
#     data['Segment'].iloc[i] = a
def jiebaba(text):
    """Segment ``text`` with jieba and return the tokens as a list."""
    return list(jieba.cut(text))
# Tokenise every clause of a review and flatten the result into one token list.
# Built in one assignment: the per-row chained write data['Segment'].iloc[i] = ...
# may not reach `data`, and the former loop variable named `str` overwrote the
# builtin for the rest of the session.
data['Segment'] = data['Subse'].apply(
    lambda clauses: [token for clause in clauses for token in jiebaba(clause)]
)

In [42]:
# Spot-check the tokenisation: one raw review followed by its token list.
print(data['评价内容'].iloc[24])
print(data['Segment'].iloc[24])

音质还行  带着入耳式还是有点不舒服
['音质', '还', '行', '带', '着', '入耳式', '还是', '有点', '不', '舒服']


# 特征提取

## Readability

In [40]:
# Sub-sentence lengths: for each review, the character count of every clause.
# Assigned as a whole column instead of the chained data[...].iloc[i] = ... write.
data['Subse_Count'] = data['Subse'].apply(
    lambda clauses: [len(clause) for clause in clauses]
)

In [41]:
# Reviews at or above the cap are treated as equally informative, because the
# text is already long enough.
def length(num, cap=50):
    """Clip a review length at ``cap``.

    Args:
        num: Length of the review. In this notebook it is the sum of the
            per-clause character counts.
        cap: Upper bound; any ``num`` at or above it is returned as ``cap``.
            Defaults to 50.

    Returns:
        ``cap`` if ``num >= cap``, otherwise ``num`` unchanged.
    """
    return cap if num >= cap else num

In [42]:
# Review length: total of the per-clause lengths, clipped by length().
data['Words'] = data['Subse_Count'].apply(sum).apply(length)

In [43]:
# Summary statistics of the clipped review length.
pd.to_numeric(data['Words']).describe()

count    1032.000000
mean       21.345930
std        15.024503
min         1.000000
25%        10.000000
50%        17.000000
75%        30.000000
max        50.000000
Name: Words, dtype: float64

In [44]:
# Number of clauses (sub-sentences) in each review.
data['Nsub'] = data['Subse'].map(len)

In [45]:
# Mean clause length per review.
data['Asub'] = data['Subse_Count'].map(np.mean)

In [46]:
# Number of adjectives: tokens whose POS flag is exactly 'a'.
# data['评价分词'] holds [words, flags] per review; a row without a flag list counts as 0.
# Assigned as a whole column instead of the chained data[...].iloc[i] = ... write.
data['adjectives'] = data['评价分词'].apply(
    lambda tagged: tagged[1].count('a') if len(tagged) > 1 else 0
)
pd.to_numeric(data['adjectives']).describe()

count    1032.000000
mean        1.576550
std         1.561526
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max        11.000000
Name: adjectives, dtype: float64

In [47]:
# Show the tagging result of one review: a word list and the parallel POS-flag list.
data['评价分词'].iloc[150]

[['耳机', '还', '不错', '，', '戴', '着', '耳朵', '也', '不疼'],
 ['n', 'd', 'a', 'x', 'v', 'uz', 'n', 'd', 'a']]

In [48]:
# Number of adverbs: tokens whose POS flag is exactly 'd'.
# data['评价分词'] holds [words, flags] per review; a row without a flag list counts as 0.
# Assigned as a whole column instead of the chained data[...].iloc[i] = ... write.
data['adverbs'] = data['评价分词'].apply(
    lambda tagged: tagged[1].count('d') if len(tagged) > 1 else 0
)
pd.to_numeric(data['adverbs']).describe()

count    1032.000000
mean        2.085271
std         2.535585
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max        19.000000
Name: adverbs, dtype: float64

In [49]:
# Number of verbs: tokens whose POS flag is exactly 'v'.
# data['评价分词'] holds [words, flags] per review; a row without a flag list counts as 0.
# Assigned as a whole column instead of the chained data[...].iloc[i] = ... write.
data['verbs'] = data['评价分词'].apply(
    lambda tagged: tagged[1].count('v') if len(tagged) > 1 else 0
)
pd.to_numeric(data['verbs']).describe()

count    1032.000000
mean        3.437016
std         5.124058
min         0.000000
25%         1.000000
50%         2.000000
75%         4.000000
max        59.000000
Name: verbs, dtype: float64

### Entropy

In [50]:
# Entropy: build the corpus from the tokens of all reviews.
# `data2` is another name for the same DataFrame, not a copy.
data2 = data
# Flatten in one pass; growing the list with `corpus = corpus + row` copies the
# whole list on every iteration.
corpus = list(itertools.chain.from_iterable(data2['Segment']))
# Occurrence count of every distinct token in the corpus.
corpus2 = Counter(corpus)

In [51]:
# Probability of each token of a review occurring in the corpus.
# The denominator is the total number of tokens in the corpus. Dividing by
# len(corpus2) — the number of *distinct* tokens — does not give probabilities
# (they would not sum to 1 and can exceed 1 for frequent tokens).
total_tokens = sum(corpus2.values())
# Assigned as a whole column instead of the chained data[...].iloc[i] = ... write.
data['Entropy_p'] = data['Segment'].apply(
    lambda tokens: [corpus2[token] / total_tokens for token in tokens]
)

In [52]:
# log p for every token of every review.
# The lambda is required: passing np.log directly would make pandas apply the
# ufunc to the whole Series of lists instead of to each list.
data['Entropy_logp'] = data['Entropy_p'].apply(lambda probs: np.log(probs))
# Entropy of a review: -sum(p * log p) over its tokens (0 for a review without tokens).
data['Entropy'] = [
    -float(np.sum(np.asarray(probs, dtype=float) * log_probs))
    for probs, log_probs in zip(data['Entropy_p'], data['Entropy_logp'])
]

## Reliability

### Diversity

In [53]:
# Count how many different tokens (words or single characters) a review contains.
def freq(item):
    """Return the number of distinct elements in ``item``."""
    return len(set(item))
# Diversity: distinct tokens divided by total tokens of the review.
data['Diversity'] = data['Segment'].map(freq).div(data['Segment'].map(len))

In [54]:
# Summary statistics of the diversity ratio.
pd.to_numeric(data['Diversity']).describe()

count    1032.000000
mean        0.939000
std         0.093279
min         0.250000
25%         0.894737
50%         1.000000
75%         1.000000
max         1.000000
Name: Diversity, dtype: float64

### Polarity

In [55]:
# Polarity in [0, 1], from positive (0) to negative (1): the share of negative
# words among all sentiment words found in the review.
_pos_lexicon = set(sentiment_pos)  # sets make each membership test O(1)
_neg_lexicon = set(sentiment_neg)


def _polarity(tokens):
    """Return negative / (positive + negative) sentiment-token count for ``tokens``.

    A token listed in both lexicons is counted on both sides. Reviews without
    any sentiment word get 0.5, the same value as equal positive and negative counts.
    """
    n_pos = sum(1 for token in tokens if token in _pos_lexicon)
    n_neg = sum(1 for token in tokens if token in _neg_lexicon)
    n_sentiment = n_pos + n_neg
    return n_neg / n_sentiment if n_sentiment else 0.5


# Assigned as a whole numeric column: the chained data['Polarity'].iloc[i] = ...
# and data['Polarity'].loc[mask] = ... writes may not reach `data`, and the ''
# placeholder is no longer needed.
data['Polarity'] = data['Segment'].apply(_polarity)
data['Polarity'].describe()

## 一些尚未分类的特征

In [56]:
# Timeliness: whole days between each review and the most recent review in the data.
data['时间'] = pd.to_datetime(data['时间'])
latest_review_time = data['时间'].max()
data['Timeliness'] = (latest_review_time - data['时间']).map(lambda delta: delta.days)

In [57]:
# Review rating: replace the 'starN' labels with the integers 1-5.
# Each write goes through data.loc[mask, column], which targets `data` itself;
# the chained form data[column].loc[mask] = value may write to a temporary copy.
star_labels = {'star5': 5, 'star4': 4, 'star3': 3, 'star2': 2, 'star1': 1}
for label, stars in star_labels.items():
    data.loc[data['评价星级'] == label, '评价星级'] = stars
print(data['评价星级'].value_counts())
data.rename(columns={'评价星级': 'Rating'}, inplace=True)

5    762
1    101
3     72
4     59
2     38
Name: 评价星级, dtype: int64


In [58]:
# Membership flag (used as a proxy for frequent shopping):
# 1 when the level is 'PLUS会员', 0 otherwise (including missing levels).
data['VIP'] = data['级别'].eq('PLUS会员').astype('int64')
data['VIP'].value_counts()

0    700
1    332
Name: VIP, dtype: int64

### Relevancy/Subjectivity

In [31]:
# Topic vocabulary: rank the corpus built for the entropy feature by token
# frequency, keep the first 2000 tokens (this cutoff is provisional and still
# needs to be justified), then remove stop words and sentiment words.
corpus30 = pd.DataFrame(corpus).value_counts().rename('counts').reset_index()
# corpus33: all tokens in ranked order.
corpus33 = corpus30[0].tolist()
# corpus34: top tokens without stop words and without sentiment words
# (sentiment is covered by a separate feature).
corpus34 = [
    word for word in corpus33[0:2000]
    if word not in stoplist and word not in sentiment
]

In [32]:
# Export the topic vocabulary, one token per line.
# The context manager closes the file even if the write fails, and no variable
# named `str` is created, so the builtin stays usable.
with open("corpus.txt", "w", encoding='utf-8') as f:
    f.write('\n'.join(corpus34))

In [59]:
# Reload the topic vocabulary from corpus.txt (get_stopwords is reused as a line reader).
corpus34 = get_stopwords("corpus.txt")

In [60]:
# Relevancy helper: how many topic-related words a review contains.
def relate(review,sen_list):
    """Return the number of distinct elements shared by ``review`` and ``sen_list``."""
    return len(set(review) & set(sen_list))

In [61]:
# Relevancy: number of distinct topic words in each review. The topic
# vocabulary of a product is corpus34 plus the tokens of its page title.
title_vocab = {}
for page_title in data['页面标题'].unique():
    title_tokens = [token for token in jiebaba(page_title) if token != ' ']
    title_vocab[page_title] = set(corpus34 + title_tokens)

# Scores are computed in the row order of `data`, so every value lands on the
# review it was computed from. Collecting them product by product and assigning
# the list positionally only lines up if the rows are already grouped by
# product in that same order.
Relevancy = [
    relate(tokens, title_vocab[page_title])
    for tokens, page_title in zip(data['Segment'], data['页面标题'])
]
data['Relevancy'] = Relevancy
data['Relevancy'] = pd.to_numeric(data['Relevancy'])

# 导出数据

In [62]:
# Missing values per column before export.
data.isnull().sum()

会员                0
级别              692
Rating            0
评价内容              0
时间                0
点赞数               0
评论数             100
页面标题              0
Helpfulness       0
Subse             0
评价分词              0
Segment           0
Subse_Count       0
Words             0
Nsub              0
Asub              0
adjectives        0
adverbs           0
verbs             0
Entropy_p         0
Entropy_logp      0
Entropy           0
Diversity         0
Polarity          0
Timeliness        0
VIP               0
Relevancy         0
dtype: int64

In [63]:
# List all columns to decide which ones to keep in the export.
data.columns

Index(['会员', '级别', 'Rating', '评价内容', '时间', '点赞数', '评论数', '页面标题', 'Helpfulness',
       'Subse', '评价分词', 'Segment', 'Subse_Count', 'Words', 'Nsub', 'Asub',
       'adjectives', 'adverbs', 'verbs', 'Entropy_p', 'Entropy_logp',
       'Entropy', 'Diversity', 'Polarity', 'Timeliness', 'VIP', 'Relevancy'],
      dtype='object')

In [64]:
data2 = data.drop(columns=['会员','级别','评价内容','时间','点赞数','评论数','页面标题','Subse','评价分词','Segment','Subse_Count','Entropy_p', 'Entropy_logp'])
data2.to_excel('耳机2.xlsx', index=False)
#data2.to_csv('耳机.csv', index=False)