In [1]:
import re
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt  
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
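# One-time NLTK data downloads (uncomment on the first run):
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')  # required by wordnet.morphy in lemmaWord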
from tqdm import tqdm
from nltk.corpus import stopwords
stopwordEn = stopwords.words('english')
from nltk.corpus import wordnet
import pickle
import multiprocessing as mp
import gensim
from gensim import corpora
import pyLDAvis.gensim

# Report how many CPU cores multiprocessing sees on this machine.
print('CPU numbers:',mp.cpu_count())
def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)
def apply_by_multiprocessing(df, func, **kwargs):
    """Run ``df.apply(func, ...)`` in parallel across a pool of worker processes.

    ``df`` is split into ``workers`` chunks with ``np.array_split``; each chunk
    is processed by ``_apply_df`` in its own process and the partial results
    are concatenated in chunk order.

    Parameters
    ----------
    df : object with ``.apply`` that ``np.array_split`` can split
        (used with a pandas Series / DataFrame in this notebook).
    func : callable
        Forwarded to ``.apply`` of every chunk. It is shipped to the worker
        processes by ``Pool.map``, so it has to be picklable (a module-level
        function rather than a lambda).
    **kwargs
        ``workers`` (int, optional) is consumed here: the number of processes
        and of chunks. Defaults to ``mp.cpu_count()`` when omitted or ``None``.
        All remaining keywords are forwarded unchanged to ``.apply``.

    Returns
    -------
    The result of ``pd.concat`` over the per-chunk results.

    Raises
    ------
    ValueError
        If ``workers`` is smaller than 1.
    """
    # A missing ``workers`` used to surface as a bare KeyError; fall back to
    # one process per CPU instead.
    workers = kwargs.pop('workers', None)
    if workers is None:
        workers = mp.cpu_count()
    if workers < 1:
        raise ValueError('workers must be at least 1, got %r' % (workers,))

    jobs = [(chunk, func, kwargs) for chunk in np.array_split(df, workers)]

    # The context manager shuts the pool down even when a worker raises, so
    # failed runs do not leave orphaned processes behind in the kernel.
    with mp.Pool(processes=workers) as pool:
        results = pool.map(_apply_df, jobs)
    return pd.concat(results)
# Example usage: apply_by_multiprocessing(fullset['Text'], processText, workers=mp.cpu_count())
def lemmaWord(word):
    """Return the base form ``wordnet.morphy`` finds for ``word``.

    When ``wordnet.morphy`` returns ``None`` (no base form found) the word is
    returned unchanged.
    """
    base_form = wordnet.morphy(word)
    return word if base_form is None else base_form
def processText(text,lemma=False, gram=1):
    """Turn raw text into a list of cleaned tokens or n-grams.

    Steps: strip URL-like substrings, tokenize with ``word_tokenize``, drop
    tokens that are English stopwords (compared in lower case) or that are not
    purely alphabetic, and lower-case what remains.

    Parameters
    ----------
    text : str
        Raw input text.
    lemma : bool
        When true, every kept token is passed through ``lemmaWord``.
    gram : int
        ``gram <= 1`` returns the token list itself; larger values return the
        list of n-grams produced by ``nltk.ngrams(tokens, gram)``.
    """
    url_free = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)

    cleaned = []
    for raw in word_tokenize(url_free):
        lowered = raw.lower()
        # Skip stopwords and anything containing digits / punctuation.
        if lowered in stopwordEn or not raw.isalpha():
            continue
        cleaned.append(lemmaWord(lowered) if lemma else lowered)

    if gram<=1:
        return cleaned
    return list(nltk.ngrams(cleaned, gram))

CPU numbers: 32


# Small Sample

In [2]:
# Toy corpus: nine short sentences used to try out the pipeline.
test_sub = ["My name is Marshall","how are you today?", "you are alex, the bad guy", "Marshall is a happy name",
            "alex is your name","alex is really bad","I am happy, marshall said","you're bad, so alex is bad","Bad guy alex finally meet happy marshall"]
# Clean and lemmatize every sentence into a list of unigram tokens.
tokens = [processText(text, lemma=True) for text in test_sub] #1-gram
# Alternative: bigram tokens, each bigram joined into one space-separated string.
# tokens = [[' '.join(i) for i in processText(text, lemma=True, gram=2)] for text in test_sub] #2-gram
# Map every distinct token to an integer id.
dictionary = corpora.Dictionary(tokens)
# Bag-of-words encoding: one list of (token_id, count) pairs per document.
corpus = [dictionary.doc2bow(text) for text in tqdm(tokens)]

100%|██████████| 9/9 [00:00<00:00, 14266.34it/s]


In [3]:
# Inspect the cleaned token lists, one per input sentence.
tokens

[['name', 'marshall'],
 ['today'],
 ['alex', 'bad', 'guy'],
 ['marshall', 'happy', 'name'],
 ['alex', 'name'],
 ['alex', 'really', 'bad'],
 ['happy', 'marshall', 'say'],
 ['bad', 'alex', 'bad'],
 ['bad', 'guy', 'alex', 'finally', 'meet', 'happy', 'marshall']]

# LDA

In [86]:
# Plain (single-core) LDA with the parameters that worked best in these experiments.
# passes: the higher it is, the more clearly the topics (clusters) separate; 15 is fairly stable.
# alpha: 'auto' learns an asymmetric prior from the data; 'asymmetric' is a fixed asymmetric prior
# (documents cluster together more easily); the default 'symmetric' prior is the more reliable choice.

NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0,alpha='symmetric')
# num_words=30 exceeds the size of this toy vocabulary, so every term is listed for each topic.
topics = ldamodel.print_topics(num_words=30)
for topic in topics:
    print(topic)

(0, '0.286*"marshall" + 0.197*"name" + 0.196*"happy" + 0.107*"say" + 0.107*"today" + 0.018*"alex" + 0.018*"bad" + 0.018*"really" + 0.018*"guy" + 0.018*"meet" + 0.018*"finally"')
(1, '0.180*"guy" + 0.180*"alex" + 0.180*"bad" + 0.098*"finally" + 0.098*"meet" + 0.098*"happy" + 0.098*"marshall" + 0.016*"today" + 0.016*"name" + 0.016*"say" + 0.016*"really"')
(2, '0.091*"today" + 0.091*"name" + 0.091*"alex" + 0.091*"bad" + 0.091*"marshall" + 0.091*"happy" + 0.091*"say" + 0.091*"really" + 0.091*"guy" + 0.091*"finally" + 0.091*"meet"')
(3, '0.091*"today" + 0.091*"name" + 0.091*"alex" + 0.091*"bad" + 0.091*"marshall" + 0.091*"happy" + 0.091*"say" + 0.091*"really" + 0.091*"guy" + 0.091*"meet" + 0.091*"finally"')
(4, '0.314*"bad" + 0.314*"alex" + 0.118*"really" + 0.117*"name" + 0.020*"today" + 0.020*"marshall" + 0.020*"happy" + 0.020*"say" + 0.020*"guy" + 0.020*"finally" + 0.020*"meet"')


In [87]:
# Build the interactive pyLDAvis view of the model; sort_topics=False keeps the
# model's own topic order instead of re-ordering topics by their share of tokens.
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# Multi-core

In [94]:
# Parallel speed-up with LdaMulticore (training is spread over `workers` worker processes).
# alpha='auto' is not available here, so the default 'symmetric' prior is used;
# the results are close to those of the single-core LdaModel.
NUM_TOPICS = 5
ldamodel =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=10,random_state=0,workers=4,alpha='symmetric')
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.286*"marshall" + 0.197*"name" + 0.196*"happy" + 0.107*"say" + 0.107*"today" + 0.018*"alex" + 0.018*"bad" + 0.018*"guy" + 0.018*"really" + 0.018*"meet"')
(1, '0.225*"alex" + 0.155*"guy" + 0.155*"bad" + 0.085*"meet" + 0.085*"finally" + 0.085*"happy" + 0.084*"marshall" + 0.084*"name" + 0.014*"today" + 0.014*"say"')
(2, '0.091*"today" + 0.091*"name" + 0.091*"alex" + 0.091*"bad" + 0.091*"marshall" + 0.091*"happy" + 0.091*"say" + 0.091*"guy" + 0.091*"finally" + 0.091*"meet"')
(3, '0.231*"alex" + 0.231*"really" + 0.230*"bad" + 0.038*"name" + 0.038*"today" + 0.038*"marshall" + 0.038*"happy" + 0.038*"say" + 0.038*"guy" + 0.038*"meet"')
(4, '0.424*"bad" + 0.231*"alex" + 0.038*"name" + 0.038*"today" + 0.038*"marshall" + 0.038*"happy" + 0.038*"say" + 0.038*"guy" + 0.038*"really" + 0.038*"finally"')


In [89]:
# Same pyLDAvis view, built for whichever model `ldamodel` currently refers to;
# sort_topics=False keeps the model's own topic order.
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# Functions related to Topic Modelling

In [43]:
# Table of (term, collection frequency) for every dictionary entry, most frequent
# first; transposed so the terms run across the columns.
pd.DataFrame([(i,dictionary.cfs[k]) for k,i in dictionary.iteritems()]).sort_values(by=1,ascending=False).T

Unnamed: 0,3,4,0,1,6,5,2,7,8,9,10
0,alex,bad,marshall,name,happy,guy,today,really,say,finally,meet
1,5,5,4,3,3,2,1,1,1,1,1


In [9]:
# Show every topic's word distribution with the actual words spelled out;
# with the defaults, up to 20 topics are listed with their top 10 words each.
# NOTE(review): the saved output below lists only 3 topics although NUM_TOPICS = 5
# above, and the execution counts are out of order, so it presumably comes from an
# earlier 3-topic model - re-run the notebook top to bottom to refresh it.
ldamodel.print_topics()

[(0,
  '0.263*"marshall" + 0.228*"name" + 0.185*"happy" + 0.092*"say" + 0.091*"today" + 0.024*"alex" + 0.024*"meet" + 0.024*"finally" + 0.023*"guy" + 0.023*"bad"'),
 (1,
  '0.269*"bad" + 0.268*"alex" + 0.118*"guy" + 0.067*"really" + 0.067*"finally" + 0.067*"meet" + 0.049*"happy" + 0.043*"marshall" + 0.018*"name" + 0.017*"today"'),
 (2,
  '0.093*"today" + 0.091*"name" + 0.091*"alex" + 0.091*"marshall" + 0.091*"really" + 0.091*"happy" + 0.091*"bad" + 0.090*"guy" + 0.090*"say" + 0.090*"finally"')]

In [10]:
# Get the term-probability vector of every topic as an array
# (one row per topic, one column per dictionary term).
ldamodel.get_topics()

array([[0.26324803, 0.2275979 , 0.09083208, 0.02428105, 0.02322719,
        0.02336232, 0.18528856, 0.02308698, 0.09169419, 0.02369042,
        0.02369124],
       [0.04274272, 0.01822507, 0.01696878, 0.26838884, 0.2692721 ,
        0.1175331 , 0.04933495, 0.06707782, 0.01688803, 0.06678449,
        0.06678429],
       [0.090932  , 0.09135655, 0.09327401, 0.09126303, 0.0906753 ,
        0.09036507, 0.09082185, 0.09084722, 0.09035214, 0.09005748,
        0.09005538]], dtype=float32)

In [11]:
# For one topic (here topic 0), list its top terms as (term_id, probability) pairs.
ldamodel.get_topic_terms(0)

[(0, 0.26324806),
 (1, 0.22759792),
 (6, 0.18528858),
 (8, 0.0916942),
 (2, 0.090832084),
 (3, 0.024281053),
 (10, 0.02369124),
 (9, 0.023690425),
 (5, 0.023362326),
 (4, 0.023227187)]

In [12]:
# Look up one dictionary term (id 0) and how strongly the model associates it with
# each topic; the result is a list of (topic_id, probability) pairs, not a distance.
dictionary[0],ldamodel.get_term_topics(0)

('marshall', [(0, 0.23775995), (1, 0.021836447), (2, 0.014256414)])

In [13]:
# Additional corpus of new documents, encoded with the existing dictionary.
# Tokens that are not in the dictionary (e.g. 'hi', 'day', 'people') are dropped by doc2bow.
new_text = ['hi, marshall',"my name is marshall",'happy day','bad people','hi guys']
new_tokens = [processText(text, lemma=True) for text in new_text]
other_corpus = [dictionary.doc2bow(text) for text in new_tokens]
other_corpus

[[(0, 1)], [(0, 1), (1, 1)], [(6, 1)], [(4, 1)], [(5, 1)]]

In [14]:
# Topic distribution the model assigns to each document of the additional corpus.
for i in other_corpus:
    print(ldamodel[i]) # shorthand for .get_document_topics

[(0, 0.6627141), (1, 0.16907607), (2, 0.16820979)]
[(0, 0.7758506), (1, 0.11196737), (2, 0.11218204)]
[(0, 0.6594241), (1, 0.1715518), (2, 0.16902407)]
[(0, 0.16698891), (1, 0.6648994), (2, 0.16811173)]
[(0, 0.16753846), (1, 0.66180587), (2, 0.17065568)]


In [15]:
# Train the existing model further on the additional corpus.
# NOTE(review): this changes `ldamodel` in place, so re-running the cell trains on
# the same documents again and the cells above will no longer match their outputs.
ldamodel.update(other_corpus)

In [16]:
# Topic-by-topic difference between two topic models (normalised distance matrix
# plus word annotations). Here the model is compared with itself for demonstration;
# pass a second model as the first argument to compare two different models.
ldamodel.diff(ldamodel,annotation=True,normed=True)

(array([[0.        , 0.88177261, 0.39799445],
        [1.        , 0.        , 0.33821746],
        [0.41967284, 0.29940789, 0.        ]]),
 array([[list([['guy', 'bad', 'meet', 'name', 'happy', 'really', 'finally', 'marshall', 'alex', 'today'], []]),
         list([['guy', 'bad', 'meet', 'happy', 'name', 'really', 'finally', 'marshall', 'alex', 'today'], []]),
         list([['guy', 'bad', 'meet', 'happy', 'name', 'really', 'finally', 'marshall', 'alex', 'today'], []])],
        [list([['guy', 'bad', 'meet', 'name', 'happy', 'really', 'finally', 'marshall', 'alex', 'today'], []]),
         list([['guy', 'bad', 'meet', 'happy', 'name', 'really', 'finally', 'marshall', 'alex', 'today'], []]),
         list([['guy', 'bad', 'meet', 'happy', 'name', 'really', 'finally', 'marshall', 'alex', 'today'], []])],
        [list([['guy', 'bad', 'meet', 'name', 'happy', 'really', 'finally', 'marshall', 'alex', 'today'], []]),
         list([['guy', 'bad', 'meet', 'happy', 'name', 'really', 'finally'

In [17]:
# Estimate the variational bound of the additional corpus under the current model.
ldamodel.bound(other_corpus)

-22.18306356759244