In [1]:
import numpy as np
import pandas as pd
import re
import itertools
from collections import Counter
import pickle
import jieba
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

# Read Dataset

In [2]:
data = pd.read_csv('./online_shopping_10_cats.csv')
data = data[['cat', 'review']]
data.sample(10)

Unnamed: 0,cat,review
13460,平板,产品不怎么样！！
24725,水果,这苹果，快吃完了居然没有一个好吃的，并且都小的很，和图片严重不同！对京东太失望了，下次还是去...
55199,酒店,总的来说酒店还是很好的，有机会还会住该家酒店。早餐很丰富，而且还是免费的，吃起来很爽，唯一的...
60294,酒店,"酒店实在太差了,房间隔音极差,北塔挨着迪吧，快曲放到半夜3点；换到南塔，又挨着KTV音乐响到..."
34295,洗发水,都特么漏了，瓶子粘糊糊的，口松了，里面的东西流出来了，太恶心了，也不知道过期没有
16386,水果,老潘推荐的苹果确实不错，和新疆冰糖心有一拼，口感不错，甜而且水分足. 京东物流给力，送来的保...
26831,洗发水,第一次购买无硅油洗发水，希望使用效果能好！
39611,衣服,大小合适，棒棒哒！以后还会光顾！赞一个！ 客服服务态度也非常好。
24834,水果,她妈的骗人的，10个就2个是好的，其余全部烂的，而且2个还是非常小，大家千万不要买！
15907,手机,原先在VK520有的Java功能，到VK530不见了实在令人有点失望。无声短片玩起来就太没有...


# Data Preprocess

In [4]:
# Select 1000 items for quick calculation, I selected 100 reviews from each category to make a balanced dataset
# select_data = data.groupby('cat', group_keys=False).apply(pd.DataFrame.sample, 100)

In [3]:
def clean_data(text):
    cleaned_text = re.sub(r"[\s\/\\_$^*(+\"\'+~\-@#&^*\[\]{}【】]+", "", str(text))
    return cleaned_text

In [4]:
data['cleaned_text'] = data.review.apply(clean_data)

In [5]:
data.cleaned_text.apply(len).describe()

count    62774.000000
mean        57.765986
std         78.159904
min          1.000000
25%         20.000000
50%         34.000000
75%         63.000000
max       2876.000000
Name: cleaned_text, dtype: float64

In [8]:
select_data = data[data.cleaned_text.apply(lambda s: 15 < len(s) < 40)]

In [10]:
select_data = select_data.groupby('cat', group_keys=False).apply(pd.DataFrame.sample, 100)

# Build Dictionary

In [13]:
select_data['words'] = select_data.cleaned_text.apply(jieba.lcut)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/1z/qy9y5ltd4_1_fsx9w6pw0x9c0000gn/T/jieba.cache
Loading model cost 0.940 seconds.
Prefix dict has been built succesfully.


In [32]:
import gensim
from gensim.utils import simple_preprocess

dictionary = gensim.corpora.Dictionary(select_data.words)

In [33]:
dictionary.filter_extremes()

# Transform Corpus

In [34]:
bow_corpus = [dictionary.doc2bow(doc) for doc in select_data.words]

# Run

In [35]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary)

In [36]:
topic_dic = dict()
for idx, topic in lda_model.print_topics(-1):
    topic_dic['topic {}'.format(idx)] = topic.split('+')
pd.DataFrame(topic_dic)

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,"0.048*""我""","0.042*""。""","0.047*""了""","0.070*""了""","0.054*""了""","0.084*""！""","0.094*""。""","0.060*""。""","0.067*""！""","0.150*""。"""
1,"0.033*""不""","0.041*""了""","0.035*""。""","0.054*""。""","0.040*""很""","0.054*""了""","0.025*""不""","0.044*""好""","0.044*""很""","0.059*""了"""
2,"0.025*"".""","0.038*""很""","0.025*""是""","0.038*""！""","0.036*"",""","0.048*""很""","0.024*""？""","0.034*""了""","0.039*""是""","0.022*""还"""
3,"0.025*""了""","0.036*""不错""","0.025*""买""","0.030*""很""","0.036*""！""","0.039*""。""","0.023*""不错""","0.032*""很""","0.034*""了""","0.020*""不"""
4,"0.021*""!""","0.030*""也""","0.023*""我""","0.024*""买""","0.031*""。""","0.027*""不错""","0.020*""了""","0.021*""也""","0.032*""。""","0.020*""…"""
5,"0.021*""有""","0.029*""还""","0.023*""还""","0.020*""也""","0.028*""好""","0.017*""就""","0.020*""还""","0.019*""！""","0.024*""蒙牛""","0.017*""也"""
6,"0.016*""用""","0.019*""就""","0.021*""很""","0.018*""不错""","0.024*"".""","0.015*""蒙牛""","0.018*""也""","0.018*""不""","0.019*""不""","0.016*""好"""
7,"0.015*""还是""","0.017*""好""","0.019*""不""","0.017*""还""","0.020*""不错""","0.015*""买""","0.015*""买""","0.016*""都""","0.018*""不错""","0.013*""买"""
8,"0.014*""就""","0.016*""！""","0.018*""应该""","0.015*""不""","0.019*""都""","0.013*""也""","0.014*""好""","0.013*""买""","0.015*"",""","0.012*""不错"""
9,"0.014*"",""","0.012*""质量""","0.015*""用""","0.015*""京东""","0.018*""我""","0.013*""都""","0.014*""没有""","0.012*""是""","0.013*""好""","0.012*""是"""


# Improve it

The result is pretty bad. They are almost stop words. And later I'll do some optimization to improve the results. First put above codes into a function for convenience.

In [37]:
def get_lda(corpus, words_no_above=0.5, num_topics=10):
    dictionary = gensim.corpora.Dictionary(corpus)
    dictionary.filter_extremes(no_above=words_no_above)
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=num_topics, id2word=dictionary)
    topic_dic = dict()
    for idx, topic in lda_model.print_topics(-1):
        topic_dic['topic {}'.format(idx)] = topic.split('+')
    return pd.DataFrame(topic_dic)

## Change parameters

Try with a different threshold of word frequence

In [38]:
get_lda(select_data.words, words_no_above=0.1, num_topics=10)

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,"0.022*""没""","0.044*"",""","0.044*"",""","0.025*""蒙牛""","0.023*""差""","0.033*""有""","0.022*""在""","0.045*""蒙牛""","0.031*""…""","0.020*""感觉"""
1,"0.022*""用""","0.020*""～""","0.035*""蒙牛""","0.024*""!""","0.019*""?""","0.020*""给""","0.017*""没有""","0.028*"",""","0.019*""、""","0.018*""有"""
2,"0.015*""可以""","0.016*""比较""","0.023*""？""","0.019*""在""","0.017*""服务""","0.018*""到""","0.016*""一样""","0.021*""没有""","0.018*""质量""","0.016*""没有"""
3,"0.015*""还是""","0.016*"".""","0.020*""就是""","0.018*"".""","0.015*""用""","0.017*""就是""","0.015*""喜欢""","0.019*"".""","0.018*""非常""","0.015*""用"""
4,"0.013*""感觉""","0.015*""？""","0.016*""没有""","0.018*""…""","0.013*""一个""","0.017*""来""","0.015*""用""","0.017*""喝""","0.018*"".""","0.015*""什么"""
5,"0.012*""方便""","0.015*""京东""","0.015*"".""","0.015*""没""","0.013*""安装""","0.017*""质量""","0.015*""可以""","0.014*""就是""","0.013*""不好""","0.015*""说"""
6,"0.011*""说""","0.015*""在""","0.011*""挺""","0.015*""又""","0.013*""挺""","0.015*""再""","0.014*""、""","0.011*""比较""","0.012*""满意""","0.014*""小"""
7,"0.011*""在""","0.014*""比""","0.010*""有点""","0.013*""比较""","0.011*""给""","0.015*""京东""","0.012*""不是""","0.011*""小""","0.012*""价格""","0.013*""可以"""
8,"0.010*""质量""","0.013*""没有""","0.010*""喜欢""","0.013*""支持""","0.011*""裤子""","0.013*""蒙牛""","0.012*""安装""","0.011*""头发""","0.011*""外观""","0.013*""一般"""
9,"0.010*""速度""","0.013*""有点""","0.009*""味道""","0.013*""太""","0.011*""和""","0.013*""物流""","0.011*""价格""","0.010*""安装""","0.011*""用""","0.013*""和"""


The stop words are almost filtered, but I still cannot see any topic. Try with a smaller threshold.

In [45]:
get_lda(select_data.words, words_no_above=0.01, num_topics=10)

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,"0.024*""月""","0.082*""～""","0.030*""铃声""","0.041*""；""","0.026*""好用""","0.032*""价位""","0.035*""出""","0.026*""机器""","0.027*""；""","0.039*""够"""
1,"0.024*""上网""","0.039*""哈哈哈""","0.030*""待机时间""","0.025*""孩子""","0.021*""月""","0.026*""把""","0.026*""牛奶""","0.023*""实用""","0.023*""被""","0.020*""环境"""
2,"0.020*""抵制""","0.024*""期待""","0.026*""驱动""","0.021*""低""","0.021*""怎么""","0.021*""抵制""","0.022*""超级""","0.021*""不如""","0.023*""太小""","0.020*""介绍"""
3,"0.020*""无""","0.020*""优点""","0.022*""分""","0.021*""：""","0.021*""价钱""","0.021*""它""","0.018*""～""","0.021*""一段时间""","0.018*""完全""","0.020*""划算"""
4,"0.020*""XP""","0.020*""甜""","0.017*""～""","0.017*""设计""","0.021*""大小""","0.021*""为""","0.018*""只是""","0.021*""超值""","0.018*""宝贝""","0.020*""你们"""
5,"0.020*""还好""","0.020*""本来""","0.017*""下载""","0.017*""看起来""","0.021*""声音""","0.021*""只""","0.018*""好像""","0.021*""像""","0.018*""网上""","0.020*""稍"""
6,"0.020*""等""","0.020*""超市""","0.017*""完美""","0.017*""之后""","0.021*""环境""","0.021*""款式""","0.018*""元""","0.016*""开""","0.018*""优惠""","0.020*""相当"""
7,"0.015*""大小""","0.015*""为什么""","0.017*""清晰""","0.017*""总体""","0.021*""别人""","0.021*""师傅""","0.018*""你们""","0.016*""拿到""","0.014*""～""","0.020*""更"""
8,"0.015*""伊利""","0.015*""洗""","0.017*""不想""","0.017*""味""","0.016*""无""","0.016*""奶""","0.018*""哦""","0.016*""机""","0.014*""假货""","0.017*""性能"""
9,"0.015*""蓝牙""","0.015*""心""","0.017*""实用""","0.017*""一天""","0.016*""鼠标""","0.016*""电影""","0.013*""开始""","0.016*""选择""","0.014*""几天""","0.016*""别的"""


It makes much more sense now. 

In [57]:
get_lda(select_data.words, words_no_above=0.017, num_topics=10)

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,"0.020*""需要""","0.019*""怎么""","0.022*""华为""","0.023*""你""","0.027*""～""","0.023*""新鲜""","0.043*""…""","0.026*""哈哈哈""","0.075*""…""","0.055*""～"""
1,"0.015*""内存""","0.016*""那么""","0.019*""贵""","0.021*""颜色""","0.019*""更""","0.021*""头发""","0.024*""功能""","0.026*""完""","0.023*""；""","0.019*""奶"""
2,"0.013*""没用""","0.016*""想""","0.016*""算""","0.020*""慢""","0.016*""系统""","0.018*""1""","0.020*""2""","0.019*""系统""","0.023*""电池""","0.019*""有些"""
3,"0.013*""孩子""","0.016*""你""","0.016*""显示""","0.019*""元""","0.014*""划算""","0.018*""；""","0.017*""?""","0.016*""品牌""","0.015*""这么""","0.018*""做"""
4,"0.013*""图片""","0.016*""做工""","0.015*""会""","0.017*""行""","0.014*""下""","0.018*""位置""","0.017*""一下""","0.016*""推荐""","0.015*""较""","0.016*""早餐"""
5,"0.013*""会""","0.013*""散热""","0.014*""早餐""","0.017*""实用""","0.014*""像素""","0.018*""不过""","0.017*""为什么""","0.013*""分""","0.015*""内容""","0.016*""最"""
6,"0.013*""差评""","0.013*""月""","0.014*""差评""","0.014*""实惠""","0.014*""实惠""","0.016*""够""","0.016*""真""","0.013*""过""","0.013*""假货""","0.016*""能"""
7,"0.013*""购物""","0.013*""?""","0.014*""MP3""","0.014*""时间""","0.014*""到位""","0.016*""你""","0.015*""过""","0.013*""这样""","0.013*""垃圾""","0.016*""酸奶"""
8,"0.010*""后悔""","0.013*""书""","0.014*""想象""","0.014*""甜""","0.014*""客服""","0.016*""牛奶""","0.014*""烂""","0.013*""书""","0.013*""设计""","0.013*""…"""
9,"0.010*""有些""","0.013*""环境""","0.014*""清晰""","0.014*""不过""","0.012*""活动""","0.013*""赠品""","0.014*""拍照""","0.013*""你们""","0.013*""不行""","0.013*""时候"""


Try with a different number of topic

In [58]:
get_lda(select_data.words, words_no_above=0.017, num_topics=15)

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9,topic 10,topic 11,topic 12,topic 13,topic 14
0,"0.022*""假货""","0.020*""系统""","0.022*""洗发水""","0.028*""…""","0.132*""…""","0.025*""系统""","0.037*""散热""","0.033*""甜""","0.020*""时""","0.051*""?""","0.034*""书""","0.077*""～""","0.017*""驱动""","0.022*""不会""","0.028*""超级"""
1,"0.022*""实惠""","0.020*""很多""","0.018*""孩子""","0.024*""分""","0.027*""哈哈哈""","0.017*""时""","0.021*""配置""","0.024*""够""","0.016*""1""","0.023*""配置""","0.021*""；""","0.046*""你""","0.017*""电池""","0.022*""这样""","0.023*""头发"""
2,"0.022*""产品""","0.020*""这么""","0.018*""拍照""","0.020*""MP3""","0.016*""～""","0.017*""；""","0.021*""推荐""","0.024*""跟""","0.016*""做工""","0.017*""客服""","0.021*""送货""","0.020*""?""","0.017*""发货""","0.017*""颜色""","0.023*""坏"""
3,"0.017*""头皮屑""","0.020*""还会""","0.014*""内容""","0.016*""为什么""","0.016*""内存""","0.017*""稍微""","0.021*""算""","0.021*""颜色""","0.016*""；""","0.017*""现在""","0.021*""划算""","0.020*""被""","0.017*""好多""","0.017*""不如""","0.019*""不过"""
4,"0.017*""时候""","0.016*""性能""","0.014*""不会""","0.016*""华为""","0.016*""较""","0.013*""～""","0.021*""住""","0.019*""不要""","0.012*""送货""","0.017*""衣服""","0.021*""售后""","0.016*""鼠标""","0.017*""3""","0.017*""第二次""","0.014*""有些"""
5,"0.017*""手机""","0.016*""那么""","0.014*""无""","0.016*""衣服""","0.016*""还好""","0.013*""麻烦""","0.021*""价钱""","0.019*""真""","0.012*""内存""","0.017*""对""","0.017*""内容""","0.016*""后悔""","0.017*""真是""","0.014*""抵制""","0.014*""去"""
6,"0.013*""最""","0.016*""会""","0.014*""过""","0.012*""现在""","0.016*""期待""","0.013*""会""","0.018*""老公""","0.017*""着""","0.012*""力""","0.017*""大家""","0.017*""购物""","0.012*""不行""","0.017*""很大""","0.013*""…""","0.014*""过"""
7,"0.013*""第一次""","0.016*""声音""","0.014*""月""","0.012*""孩子""","0.016*""优点""","0.013*""相当""","0.016*""太小""","0.014*""大床""","0.012*""3""","0.017*""时间""","0.017*""能""","0.012*""发货""","0.017*""早餐""","0.013*""里面""","0.014*""你们"""
8,"0.013*""差评""","0.016*""新鲜""","0.014*""牌子""","0.012*""够""","0.016*""不行""","0.013*""功能""","0.016*""款式""","0.014*""图片""","0.012*""漂亮""","0.017*""评价""","0.015*""设计""","0.012*""想""","0.013*""键盘""","0.013*""产品""","0.014*""连"""
9,"0.013*""：""","0.012*""因为""","0.013*""行""","0.012*""字""","0.016*""完""","0.013*""像素""","0.016*""新""","0.014*""只""","0.012*""后""","0.017*""不过""","0.013*""?""","0.012*""牛奶""","0.013*""儿子""","0.013*""实在""","0.014*""这么"""


In [60]:
get_lda(select_data.words, words_no_above=0.017, num_topics=8)

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7
0,"0.024*""实惠""","0.024*""…""","0.047*""?""","0.019*""做工""","0.023*""～""","0.028*""得""","0.016*""配置""","0.052*""…"""
1,"0.017*""不行""","0.022*""不要""","0.017*""怎么""","0.017*""手机""","0.015*""电池""","0.024*""你""","0.014*""内存""","0.031*""～"""
2,"0.012*""头发""","0.022*""不过""","0.015*""低""","0.017*""去""","0.014*""你们""","0.017*""抵制""","0.014*""早餐""","0.017*""键盘"""
3,"0.012*""更""","0.022*""产品""","0.015*""；""","0.014*""看起来""","0.013*""时候""","0.017*""较""","0.014*""内容""","0.017*""头发"""
4,"0.012*""这次""","0.019*""好多""","0.015*""旧""","0.014*""超级""","0.013*""时间""","0.015*""哈哈哈""","0.012*""…""","0.015*""头皮屑"""
5,"0.012*""第二次""","0.014*""月""","0.015*""做""","0.014*""赞""","0.013*""需要""","0.015*""这样""","0.012*""想象""","0.015*""差评"""
6,"0.010*""鼠标""","0.014*""后悔""","0.013*""很大""","0.014*""新鲜""","0.011*""现在""","0.015*""等""","0.012*""吗""","0.013*""假货"""
7,"0.010*""能""","0.011*""1""","0.010*""手感""","0.012*""很多""","0.011*""内存""","0.013*""…""","0.012*""那么""","0.013*""女儿"""
8,"0.010*""系统""","0.011*""开始""","0.010*""中""","0.012*""书""","0.011*""最""","0.011*""功能""","0.012*""平板""","0.013*""有些"""
9,"0.010*""散热""","0.011*""过""","0.010*""发票""","0.012*""不值""","0.011*""你""","0.011*""会""","0.012*""少""","0.012*""客服"""


## Simplify Corpus
I'll use POS Tagging to remove unnecessary words

In [71]:
import jieba.posseg as pseg

In [93]:
def keep_words(s, to_keep=['ns', 'v', 'n', 'a']):
    s_ = []
    pos_list = pseg.lcut(s)
    for item in pos_list:
        if item.flag in to_keep:
            s_.append(item.word)
    return s_

In [94]:
keep_words('这苹果，快吃完了居然没有一个好吃的，并且都小的很，和图片严重不同！对京东太失望了，下次还是去...')

['苹果', '快', '吃', '完', '没有', '好吃', '小', '很', '图片', '严重', '不同', '京东', '失望', '去']

In [95]:
select_data['words_lite'] = select_data.cleaned_text.apply(keep_words)

In [106]:
get_lda(select_data.words_lite, words_no_above=0.027, num_topics=9)

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8
0,"0.033*""贵""","0.035*""好吃""","0.031*""去""","0.029*""问题""","0.030*""酒店""","0.023*""显示""","0.030*""时候""","0.039*""垃圾""","0.046*""支持"""
1,"0.022*""起来""","0.029*""还有""","0.031*""好评""","0.020*""坏""","0.024*""手感""","0.022*""电池""","0.027*""朋友""","0.035*""装""","0.038*""要"""
2,"0.019*""速度""","0.029*""吃""","0.025*""人""","0.020*""评""","0.024*""送""","0.020*""速度""","0.027*""速度""","0.027*""好看""","0.030*""功能"""
3,"0.019*""应该""","0.027*""能""","0.022*""酒店""","0.017*""不能""","0.024*""适合""","0.020*""起来""","0.023*""装""","0.023*""便宜""","0.026*""吃"""
4,"0.019*""手机""","0.024*""支持""","0.022*""时间""","0.017*""人""","0.024*""售后""","0.020*""穿""","0.023*""知道""","0.023*""性价比""","0.017*""性价比"""
5,"0.019*""送""","0.019*""头发""","0.019*""不行""","0.017*""头发""","0.023*""性价比""","0.017*""人""","0.020*""使用""","0.020*""好评""","0.017*""裤子"""
6,"0.019*""旧""","0.016*""知道""","0.019*""不值""","0.017*""看着""","0.020*""住""","0.017*""机子""","0.019*""想""","0.020*""人""","0.017*""颜色"""
7,"0.016*""裤子""","0.016*""书""","0.019*""内容""","0.017*""甜""","0.020*""散热""","0.017*""发现""","0.017*""做工""","0.016*""环境""","0.017*""衣服"""
8,"0.016*""物流""","0.016*""正品""","0.016*""要""","0.017*""洗""","0.020*""位置""","0.017*""裤子""","0.017*""发货""","0.016*""内存""","0.017*""产品"""
9,"0.016*""产品""","0.016*""态度""","0.016*""应该""","0.014*""酒店""","0.020*""效果""","0.017*""觉得""","0.017*""酒店""","0.016*""颜色""","0.015*""觉得"""


It seems better than before.