In [4]:
import os
import pyLDAvis
import pyLDAvis.sklearn
import jieba
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Locate the input data: every file ending in .csv directly under base_dir,
# as full paths in sorted order.
base_dir = './data'
files = sorted(os.path.join(base_dir, name)
               for name in os.listdir(base_dir)
               if name.endswith('.csv'))

# Fail with an explicit message rather than an opaque IndexError from
# files[0] when the directory contains no CSV files.
if not files:
    raise FileNotFoundError('no .csv files found in %r' % base_dir)

# Only the first CSV (in sorted path order) is analysed.
filename = files[0]
df = pd.read_csv(filename)

# Domain-specific terms that should survive segmentation as single words.
# suggest_freq(word, True) tunes jieba's dictionary frequency for each term
# so that it can be cut out as one token.
suggest_words = ['拍拍贷', '上海拍拍贷', '合肥拍拍贷', '长沙拍拍贷', '拍拍贷法务部']
for word in suggest_words:
    jieba.suggest_freq(word, True)

# Segment every document and re-join its tokens with single spaces, the
# whitespace-delimited form the vectorizer consumes later.
# Empty CSV cells are loaded by pandas as NaN (a float), which is not a string
# and would make jieba.cut fail, so missing values become '' and every
# remaining value is coerced to str before segmentation.
df['content_cut_words'] = (
    df['content']
    .fillna('')
    .astype(str)
    .map(lambda s: ' '.join(jieba.cut(s)))
)

# Load the stop-word list: one entry per line of stopwords.txt, with
# surrounding whitespace (including the trailing newline) stripped.
# The `with` block guarantees the file handle is closed once the list is
# built; iterating over a bare open() call left it open.
with open('stopwords.txt', encoding='utf8', mode='r') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

# Space-joined token strings, one per document.
corpus = df['content_cut_words'].values

# Build the document-term count matrix. The vocabulary is capped at
# n_features terms and the loaded stop words are excluded.
n_features = 1000
cntVector = CountVectorizer(strip_accents='unicode',
                            stop_words=stopwords,
                            max_features=n_features)
cntTf = cntVector.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(). Prefer the new method when it exists
# and fall back to the legacy one on older installations. The result is kept
# as a plain list so it can be indexed by term id exactly as before.
if hasattr(cntVector, 'get_feature_names_out'):
    featureNames = list(cntVector.get_feature_names_out())
else:
    featureNames = cntVector.get_feature_names()

# Configure a 5-topic LDA model. random_state is pinned to 0 so that repeated
# runs on the same count matrix are reproducible.
lda_options = {
    'n_components': 5,
    'learning_method': 'batch',
    'learning_offset': 50.,
    'random_state': 0,
}
lda = LatentDirichletAllocation(**lda_options)

# Fit on the term-count matrix and keep the transformed output for the
# training documents (per scikit-learn, the document-topic distribution).
docres = lda.fit_transform(cntTf)

# For each fitted topic, print its index followed by its ten highest-weighted
# terms formatted as "weight*term" and joined with " + ".
for topic_id, weights in enumerate(lda.components_):
    print('topic %s: ' % topic_id)
    # Pair every weight with its term id, then order by weight, largest first.
    ranked = sorted(enumerate(weights), key=lambda pair: pair[1], reverse=True)
    pieces = []
    for term_id, weight in ranked[:10]:
        pieces.append(str(weight) + '*' + featureNames[term_id])
    print(' + '.join(pieces))


# Turn on pyLDAvis's notebook integration.
pyLDAvis.enable_notebook()

# Build the visualisation data once. A previous duplicate call discarded its
# result and only repeated the computation.
data = pyLDAvis.sklearn.prepare(lda, cntTf, cntVector)

# Serve the visualisation over HTTP on the given address and open a browser.
# NOTE(review): inside a notebook this call blocks until the kernel is
# interrupted; pyLDAvis itself suggests pyLDAvis.display(data) there.
# NOTE(review): the recorded run served on port 8889 although 8888 was
# requested -- presumably 8888 was already in use; confirm the intended port.
pyLDAvis.show(data, open_browser=True, ip='172.20.7.199', port=8888)


topic 0: 
438.4093724240194*拍拍贷法务部 + 155.54185297897166*抱歉 + 35.11733078845706*打错 + 29.358144580075127*一下张 + 17.197861065603895*杨佳伟 + 15.99814427660592*微信 + 15.36277208275262*号码 + 15.19660423933441*李涛 + 13.197078616482974*王志伟 + 11.464812992719823*王家
topic 1: 
311.42318451382255*贷款 + 259.66048658398074*转告 + 199.54998518338152*拍拍贷法务部 + 158.42446697114812*派出所 + 147.03307346881704*告诉 + 146.0033710332369*签收 + 142.91087481943958*欠款 + 137.5751711386423*律师 + 131.80858340465423*信函 + 126.45198969060412*户籍地
topic 2: 
1076.4621358228617*贷款 + 628.0508525805459*转告 + 434.9715737057026*派出所 + 392.1309542011366*拍拍贷法务部 + 383.02713613920304*签收 + 377.8413711279729*告诉 + 361.6038575725574*号码 + 339.9443595585528*户籍地 + 321.8209785230032*律师 + 314.5881210417666*信函
topic 3: 
247.3634631827102*欠款 + 161.34152769279572*理解 + 148.11747449511694*还款 + 146.90385594601398*时间 + 145.09895497261192*块钱 + 130.7309783438638*电话 + 102.40087073320235*一共 + 99.19229514138974*金额 + 97.45837428248458*情况 + 90.18975557485541*借款
topic 4: 

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://172.20.7.199:8889/    [Ctrl-C to exit]


172.20.7.199 - - [17/Jan/2019 16:35:35] "GET / HTTP/1.1" 200 -
172.20.7.199 - - [17/Jan/2019 16:35:35] "GET /LDAvis.css HTTP/1.1" 200 -
172.20.7.199 - - [17/Jan/2019 16:35:35] "GET /d3.js HTTP/1.1" 200 -
172.20.7.199 - - [17/Jan/2019 16:35:35] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...
