In [1]:
import re
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt  
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from tqdm import tqdm
from nltk.corpus import stopwords
stopwordEn = stopwords.words('english')
from nltk.corpus import wordnet
import pickle
import multiprocessing as mp
import gensim
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from gensim.test.utils import datapath
import pyLDAvis.gensim
import time

print('CPU numbers:',mp.cpu_count())
def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)
def apply_by_multiprocessing(df, func, **kwargs):
#     print(kwargs)
    workers = kwargs.pop('workers')
    pool = mp.Pool(processes=workers)
    result = pool.map(_apply_df, [(d, func, kwargs) for d in np.array_split(df, workers)])
    pool.close()
    return pd.concat(list(result))
#apply_by_multiprocessing(fullset['Text'], process_text, workers=cores)
def lemmaWord(word):
    lemma = wordnet.morphy(word)
    if lemma is not None:
        return lemma
    else:
        return word
def processText(text,lemma=False, gram=1):
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b|@\w+|#', '', text, flags=re.MULTILINE) #delete URL, # , and @xxx
#     text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
    tokens = word_tokenize(text)
#     whitelist = []
    whitelist = ["n't", "not", "no"]
    tokens = [lemmaWord(i.lower()) if lemma else i.lower() for i in tokens if (i.lower() not in stopwordEn or i.lower() in whitelist) and i.isalpha()]
    if gram<=1:
        return tokens
    else:
        return [' '.join(i) for i in nltk.ngrams(tokens, gram)]

CPU numbers: 32


# Load Data

In [2]:
dates = range(23,30)
dfs_news, dfs_tweets={},{}

for d in tqdm(dates):
    with open('./IEEE_news/df_03{}_news.pickle'.format(d), 'rb') as handle:
        dfs_news[d] = pickle.load(handle)
    with open('./IEEE_tweets/df_03{}_tweets.pickle'.format(d), 'rb') as handle:
        dfs_tweets[d] = pickle.load(handle)

100%|██████████| 7/7 [00:41<00:00,  5.86s/it]


# Subset (lemma)

In [3]:
sampling_rate_news = 0.01
sampling_rate_tweets = 0.0635771

news_subset,tweets_subset = [],[]
for d in dates:
    unique_news = dfs_news[d].text.unique()
    retweets = dfs_tweets[d][dfs_tweets[d]['re_full_text']!='']['re_full_text'] # 转发的原文
    orig_tweets = dfs_tweets[d][dfs_tweets[d]['re_full_text']=='']['full_text'] #原创的且未在这个时间段被转发过
    unique_tweets = pd.concat([retweets,orig_tweets]).unique()
    
    np.random.seed(1)
    news_subset.extend(np.random.choice(unique_news, size=int(len(unique_news)*sampling_rate_news), replace=False))
    np.random.seed(1)
    tweets_subset.extend(np.random.choice(unique_tweets, size=int(len(unique_tweets)*sampling_rate_tweets), replace=False))
    
print('subset:',len(news_subset),len(tweets_subset))

subset: 6583 147223


In [123]:
# 2gram
tokens_news = apply_by_multiprocessing(pd.Series(news_subset), processText, workers=32, lemma=True, gram=2)
tokens_tweets = apply_by_multiprocessing(pd.Series(tweets_subset), processText, workers=32, lemma=True, gram=2)

In [7]:
tokens_news = apply_by_multiprocessing(pd.Series(news_subset), processText, workers=32, lemma=True)
tokens_tweets = apply_by_multiprocessing(pd.Series(tweets_subset), processText, workers=32, lemma=True)

In [8]:
(sum([len(i) for i in tokens_news])),(sum([len(i) for i in tokens_tweets]))

(1805923, 1805755)

In [9]:
(sum([len(i) for i in tokens_news]))/(sum([len(i) for i in tokens_tweets]))

1.00009303587696

# Tweets

In [60]:
from gensim import corpora
dictionary = corpora.Dictionary(tokens_tweets)
corpus = [dictionary.doc2bow(text) for text in tqdm(tokens_tweets)]

100%|██████████| 147223/147223 [00:02<00:00, 54444.35it/s]


## Multicore LDA Test (20-Cluster)

#### Asymmetric 15-pass

In [12]:
# asymmetric，15 passes 301 s
start = time.time()
NUM_TOPICS = 20
ldamodel_asy =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=1,random_state=0,workers=30,alpha='asymmetric')
topics = ldamodel_asy.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.097*"corona" + 0.034*"virus" + 0.008*"people" + 0.008*"time" + 0.008*"get"')
(1, '0.063*"corona" + 0.039*"case" + 0.025*"death" + 0.019*"total" + 0.018*"update"')
(2, '0.058*"corona" + 0.014*"people" + 0.009*"virus" + 0.007*"week" + 0.007*"right"')
(3, '0.094*"corona" + 0.027*"get" + 0.026*"virus" + 0.016*"go" + 0.012*"like"')
(4, '0.051*"corona" + 0.017*"money" + 0.017*"people" + 0.016*"virus" + 0.014*"news"')
(5, '0.051*"corona" + 0.017*"people" + 0.017*"u" + 0.016*"amp" + 0.013*"virus"')
(6, '0.065*"corona" + 0.014*"soon" + 0.010*"well" + 0.009*"one" + 0.008*"go"')
(7, '0.048*"corona" + 0.009*"good" + 0.009*"virus" + 0.006*"thread" + 0.005*"cure"')
(8, '0.073*"corona" + 0.046*"virus" + 0.020*"test" + 0.013*"positive" + 0.008*"say"')
(9, '0.064*"corona" + 0.059*"virus" + 0.035*"world" + 0.031*"china" + 0.014*"people"')
(10, '0.048*"corona" + 0.017*"virus" + 0.015*"us" + 0.009*"people" + 0.008*"government"')
(11, '0.049*"corona" + 0.031*"ji" + 0.023*"god" + 0.021*"india" + 0.02

In [13]:
# 15-pass
lda_display_asy = pyLDAvis.gensim.prepare(ldamodel_asy, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_asy)

#### Symmetric_15-pass

In [20]:
# 默认的symmetric 对称  1个pass 303s
start = time.time()
NUM_TOPICS = 20
ldamodel_sy =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0,workers=30,alpha='symmetric')
topics = ldamodel_sy.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.073*"corona" + 0.030*"virus" + 0.015*"doctor" + 0.009*"mask" + 0.009*"people"')
(1, '0.064*"case" + 0.059*"corona" + 0.037*"death" + 0.031*"total" + 0.029*"update"')
(2, '0.056*"corona" + 0.012*"boris" + 0.010*"people" + 0.008*"virus" + 0.007*"johnson"')
(3, '0.099*"corona" + 0.028*"get" + 0.027*"virus" + 0.015*"go" + 0.013*"like"')
(4, '0.060*"corona" + 0.018*"time" + 0.017*"news" + 0.015*"virus" + 0.013*"money"')
(5, '0.055*"corona" + 0.020*"u" + 0.015*"amp" + 0.014*"people" + 0.012*"virus"')
(6, '0.085*"corona" + 0.015*"soon" + 0.013*"get" + 0.012*"well" + 0.011*"lol"')
(7, '0.067*"corona" + 0.010*"virus" + 0.009*"place" + 0.008*"visit" + 0.007*"pandemic"')
(8, '0.084*"corona" + 0.053*"virus" + 0.018*"test" + 0.012*"positive" + 0.010*"say"')
(9, '0.065*"corona" + 0.054*"virus" + 0.033*"world" + 0.024*"china" + 0.016*"people"')
(10, '0.051*"corona" + 0.023*"us" + 0.018*"virus" + 0.009*"please" + 0.008*"people"')
(11, '0.049*"corona" + 0.028*"god" + 0.024*"ji" + 0.018*"virus" +

In [11]:
#  15-passes
lda_display_sy = pyLDAvis.gensim.prepare(ldamodel_sy, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_sy)

In [22]:
ldamodel_sy.save('./TopicModels/ldamodel_sy_tweets.model')
with open('./TopicModels/lda_display_sy_tweets.pickle', 'wb') as handle:
    pickle.dump(lda_display_sy, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### 2G_Asymmetric_15-pass

In [45]:
# 2-g asymmetric，15 passes 301 s
start = time.time()
NUM_TOPICS = 20
ldamodel_asy =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0,workers=30,alpha='asymmetric')
topics = ldamodel_asy.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.035*"corona virus" + 0.006*"stay home" + 0.005*"fight corona" + 0.003*"gon na" + 0.003*"corona shit"')
(1, '0.030*"corona virus" + 0.010*"get corona" + 0.001*"corona coronavirusoutbreak" + 0.001*"corona get" + 0.001*"chinese virus"')
(2, '0.049*"corona virus" + 0.005*"test positive" + 0.005*"positive corona" + 0.002*"fight corona" + 0.002*"corona case"')
(3, '0.015*"corona virus" + 0.003*"due corona" + 0.001*"corona crisis" + 0.001*"fight corona" + 0.001*"like corona"')
(4, '0.019*"corona virus" + 0.003*"new case" + 0.003*"stayhome staysafe" + 0.002*"new death" + 0.002*"case coronavirus"')
(5, '0.006*"corona update" + 0.005*"total death" + 0.005*"case total" + 0.005*"active case" + 0.005*"total recover"')
(6, '0.006*"corona virus" + 0.003*"relief fund" + 0.003*"corona relief" + 0.001*"fight corona" + 0.001*"cm relief"')
(7, '0.003*"corona virus" + 0.001*"fight corona" + 0.001*"corona stop" + 0.001*"symptom take" + 0.001*"please support"')
(8, '0.006*"corona virus" + 0.002*"coron

In [46]:
lda_display_asy = pyLDAvis.gensim.prepare(ldamodel_asy, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_asy)

In [113]:
with open('./TopicModels/2g/ldamodel_asy_2g.pickle', 'wb') as handle:
    pickle.dump(ldamodel_asy, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./TopicModels/2g/lda_display_2g_asy.pickle', 'wb') as handle:
    pickle.dump(lda_display_asy, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [44]:
# 2-g asy 1-pass
lda_display_asy = pyLDAvis.gensim.prepare(ldamodel_asy, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_asy)

#### 2G_Symmetric_15-pass

In [47]:
# symmetric，15 passes 301 s
start = time.time()
NUM_TOPICS = 20
ldamodel_sy_2g =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0,workers=30,alpha='symmetric')
topics = ldamodel_sy_2g.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.007*"corona virus" + 0.001*"corona situation" + 0.001*"get better" + 0.001*"worst corona" + 0.001*"first corona"')
(1, '0.011*"corona virus" + 0.003*"corona coronavirusoutbreak" + 0.002*"rt follow" + 0.002*"coronavirus モンストフェアリーテイル" + 0.002*"prepper survival"')
(2, '0.034*"corona virus" + 0.009*"test positive" + 0.008*"positive corona" + 0.004*"via youtube" + 0.003*"catch corona"')
(3, '0.016*"corona virus" + 0.007*"fuck corona" + 0.002*"corona crisis" + 0.002*"corona patient" + 0.001*"due corona"')
(4, '0.027*"corona virus" + 0.016*"stay home" + 0.010*"stay safe" + 0.004*"home stay" + 0.003*"fight corona"')
(5, '0.006*"corona time" + 0.006*"corona update" + 0.006*"corona virus" + 0.005*"total death" + 0.005*"case total"')
(6, '0.012*"corona virus" + 0.007*"fight corona" + 0.004*"relief fund" + 0.003*"corona relief" + 0.001*"india fight"')
(7, '0.006*"corona virus" + 0.005*"corona shit" + 0.003*"fight corona" + 0.002*"due corona" + 0.002*"wan na"')
(8, '0.019*"corona virus" + 0.

In [48]:
lda_display_sy_2g = pyLDAvis.gensim.prepare(ldamodel_sy_2g, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_sy_2g)

In [123]:
with open('./TopicModels/2g/ldamodel_sy_2g.pickle', 'wb') as handle:
    pickle.dump(ldamodel_sy_2g, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./TopicModels/2g/lda_display_2g_sy.pickle', 'wb') as handle:
    pickle.dump(lda_display_sy_2g, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Original 12-cluster LDA

In [15]:
# original auto
start = time.time()
NUM_TOPICS = 12
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0)
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.062*"corona" + 0.030*"time" + 0.020*"love" + 0.015*"allah" + 0.014*"may"')
(1, '0.064*"case" + 0.049*"corona" + 0.048*"death" + 0.037*"coronavirus" + 0.031*"patient"')
(2, '0.037*"via" + 0.034*"video" + 0.029*"corona" + 0.025*"thanks" + 0.024*"check"')
(3, '0.113*"corona" + 0.031*"get" + 0.028*"virus" + 0.018*"go" + 0.016*"like"')
(4, '0.028*"news" + 0.027*"hands" + 0.027*"mask" + 0.023*"police" + 0.019*"fake"')
(5, '0.064*"corona" + 0.029*"people" + 0.025*"amp" + 0.018*"virus" + 0.017*"take"')
(6, '0.029*"food" + 0.027*"corona" + 0.016*"donation" + 0.014*"coronavirusoutbreak" + 0.014*"coronavirus"')
(7, '0.078*"stay" + 0.076*"home" + 0.066*"corona" + 0.036*"safe" + 0.030*"lockdown"')
(8, '0.074*"corona" + 0.033*"virus" + 0.031*"test" + 0.019*"u" + 0.018*"positive"')
(9, '0.119*"corona" + 0.084*"virus" + 0.035*"world" + 0.024*"china" + 0.017*"country"')
(10, '0.055*"corona" + 0.021*"help" + 0.017*"money" + 0.016*"virus" + 0.016*"fuck"')
(11, '0.054*"narendramodi" + 0.046*"india"

In [16]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

# News

In [126]:
from gensim import corpora
dictionary = corpora.Dictionary(tokens_news)
corpus = [dictionary.doc2bow(text) for text in tqdm(tokens_news)]

100%|██████████| 6583/6583 [00:01<00:00, 3440.18it/s]


### 12-Cluster-Sym-LDA-15p

In [246]:
import gensim
NUM_TOPICS = 12
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.011*"people" + 0.008*"one" + 0.008*"like" + 0.008*"get" + 0.008*"time"')
(1, '0.017*"game" + 0.014*"season" + 0.011*"player" + 0.011*"sport" + 0.011*"team"')
(2, '0.011*"online" + 0.011*"email" + 0.011*"information" + 0.010*"student" + 0.009*"school"')
(3, '0.020*"say" + 0.011*"march" + 0.009*"home" + 0.008*"coronavirus" + 0.007*"close"')
(4, '0.026*"say" + 0.017*"state" + 0.016*"new" + 0.014*"coronavirus" + 0.010*"trump"')
(5, '0.021*"say" + 0.019*"coronavirus" + 0.015*"case" + 0.013*"people" + 0.011*"government"')
(6, '0.016*"china" + 0.010*"country" + 0.010*"iran" + 0.010*"trump" + 0.009*"chinese"')
(7, '0.027*"virus" + 0.017*"drug" + 0.016*"disease" + 0.016*"patient" + 0.012*"study"')
(8, '0.017*"health" + 0.009*"public" + 0.007*"say" + 0.007*"test" + 0.006*"include"')
(9, '0.011*"business" + 0.010*"company" + 0.008*"market" + 0.007*"say" + 0.006*"million"')
(10, '0.032*"hospital" + 0.023*"patient" + 0.019*"mask" + 0.018*"medical" + 0.017*"say"')
(11, '0.067*"de" + 0.046*"la

In [247]:
# 5-word 12 topics, default alpha, 15 passes
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

### 20-Cluster-Sym-Mul-15p

In [25]:
# 默认的symmetric 对称  15个pass 303s
start = time.time()
NUM_TOPICS = 20
ldamodel_news =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0,workers=30,alpha='symmetric')
topics = ldamodel_news.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.006*"d" + 0.006*"news" + 0.005*"die" + 0.005*"sa" + 0.005*"iha"')
(1, '0.020*"say" + 0.008*"people" + 0.006*"trump" + 0.006*"coronavirus" + 0.006*"would"')
(2, '0.006*"drug" + 0.006*"season" + 0.006*"million" + 0.005*"game" + 0.005*"team"')
(3, '0.007*"coronavirus" + 0.005*"new" + 0.005*"time" + 0.005*"school" + 0.005*"march"')
(4, '0.017*"say" + 0.013*"health" + 0.013*"state" + 0.012*"county" + 0.010*"test"')
(5, '0.021*"say" + 0.016*"case" + 0.015*"coronavirus" + 0.014*"new" + 0.012*"people"')
(6, '0.008*"coronavirus" + 0.008*"bill" + 0.007*"vote" + 0.007*"say" + 0.006*"house"')
(7, '0.009*"season" + 0.009*"march" + 0.009*"say" + 0.007*"game" + 0.007*"player"')
(8, '0.011*"business" + 0.009*"say" + 0.007*"market" + 0.006*"government" + 0.005*"coronavirus"')
(9, '0.010*"march" + 0.007*"coronavirus" + 0.007*"fullscreen" + 0.007*"democrat" + 0.006*"chronicle"')
(10, '0.010*"coronavirus" + 0.008*"people" + 0.008*"say" + 0.007*"get" + 0.006*"mask"')
(11, '0.011*"say" + 0.009*"time"

In [26]:
#  15-passes
lda_display_news = pyLDAvis.gensim.prepare(ldamodel_news, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_news)

In [27]:
ldamodel_news.save('./TopicModels/ldamodel_news.model')
with open('./TopicModels/lda_display_news.pickle', 'wb') as handle:
    pickle.dump(lda_display_news, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 2G_20C-Sym-Mul-15p

In [133]:
# 2-g symmetric，15 passes
start = time.time()
NUM_TOPICS = 20
ldamodel_sy_2g_news =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0,workers=30,alpha='symmetric')
topics = ldamodel_sy_2g_news.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.001*"per cent" + 0.001*"coronavirus pandemic" + 0.001*"new york" + 0.001*"hedge fund" + 0.001*"hedge funds"')
(1, '0.001*"new york" + 0.001*"coronavirus pandemic" + 0.001*"test positive" + 0.001*"social distance" + 0.001*"rhode island"')
(2, '0.002*"ap content" + 0.002*"content ap" + 0.001*"public health" + 0.001*"coronavirus pandemic" + 0.001*"associate press"')
(3, '0.001*"stay home" + 0.001*"last week" + 0.001*"social distance" + 0.001*"coronavirus pandemic" + 0.000*"test positive"')
(4, '0.001*"new york" + 0.001*"coronavirus pandemic" + 0.001*"social distance" + 0.001*"test positive" + 0.001*"stay home"')
(5, '0.001*"public health" + 0.001*"social distance" + 0.001*"coronavirus pandemic" + 0.001*"stay home" + 0.001*"confirm case"')
(6, '0.001*"social distance" + 0.001*"test positive" + 0.001*"new york" + 0.001*"public health" + 0.000*"coronavirus pandemic"')
(7, '0.001*"coronavirus pandemic" + 0.001*"new york" + 0.000*"tuesday march" + 0.000*"oakridge elementary" + 0.000*"co

In [130]:
lda_display_sy_2g_news = pyLDAvis.gensim.prepare(ldamodel_sy_2g_news, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_sy_2g_news)

In [131]:
with open('./TopicModels/2g/ldamodel_sy_2g_news.pickle', 'wb') as handle:
    pickle.dump(ldamodel_sy_2g_news, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./TopicModels/2g/lda_display_sy_2g_news.pickle', 'wb') as handle:
    pickle.dump(lda_display_sy_2g_news, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 2G_20C-Asym-Mul-15p

In [134]:
# 2-g asymmetric，15 passes
start = time.time()
NUM_TOPICS = 20
ldamodel_asy_2g_news =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0,workers=30,alpha='asymmetric')
topics = ldamodel_asy_2g_news.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.003*"new york" + 0.002*"social distance" + 0.002*"public health" + 0.001*"stay home" + 0.001*"coronavirus pandemic"')
(1, '0.002*"new york" + 0.001*"test positive" + 0.001*"coronavirus pandemic" + 0.001*"social distance" + 0.001*"small business"')
(2, '0.002*"ap content" + 0.002*"content ap" + 0.001*"coronavirus pandemic" + 0.001*"public health" + 0.001*"stay home"')
(3, '0.001*"stay home" + 0.001*"social distance" + 0.001*"test positive" + 0.001*"last week" + 0.001*"coronavirus pandemic"')
(4, '0.001*"new york" + 0.001*"social distance" + 0.001*"coronavirus pandemic" + 0.001*"test positive" + 0.001*"stay home"')
(5, '0.001*"public health" + 0.001*"social distance" + 0.001*"coronavirus pandemic" + 0.001*"stay home" + 0.000*"test positive"')
(6, '0.001*"social distance" + 0.001*"test positive" + 0.000*"coronavirus pandemic" + 0.000*"new york" + 0.000*"public health"')
(7, '0.001*"coronavirus pandemic" + 0.001*"oakridge elementary" + 0.001*"tuesday march" + 0.000*"school tuesday" 

In [135]:
lda_display_asy_2g_news = pyLDAvis.gensim.prepare(ldamodel_asy_2g_news, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_asy_2g_news)

In [136]:
with open('./TopicModels/2g/ldamodel_asy_2g_news.pickle', 'wb') as handle:
    pickle.dump(ldamodel_asy_2g_news, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./TopicModels/2g/lda_display_asy_2g_news.pickle', 'wb') as handle:
    pickle.dump(lda_display_asy_2g_news, handle, protocol=pickle.HIGHEST_PROTOCOL)

# News + Tweets

In [10]:
tokens = list(pd.concat([tokens_tweets,tokens_news]))
np.random.seed(1)
np.random.shuffle(tokens)

In [11]:
from gensim import corpora
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(text) for text in tqdm(tokens)]
with open('./TopicModels/dictionary_all.pickle', 'wb') as handle:
    pickle.dump(dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
len(dictionary.keys())

140570

### 12C-Sym-LDA-15p

In [250]:
import gensim
NUM_TOPICS = 12
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.031*"say" + 0.011*"state" + 0.011*"coronavirus" + 0.011*"march" + 0.009*"home"')
(1, '0.012*"time" + 0.010*"video" + 0.009*"game" + 0.009*"new" + 0.009*"cancel"')
(2, '0.145*"corona" + 0.041*"get" + 0.027*"go" + 0.026*"virus" + 0.016*"u"')
(3, '0.256*"corona" + 0.151*"virus" + 0.012*"news" + 0.012*"distance" + 0.011*"hands"')
(4, '0.044*"via" + 0.035*"police" + 0.022*"youtube" + 0.019*"app" + 0.017*"boris"')
(5, '0.057*"corona" + 0.043*"china" + 0.038*"world" + 0.023*"virus" + 0.020*"chinese"')
(6, '0.013*"pay" + 0.012*"bill" + 0.011*"market" + 0.011*"million" + 0.010*"business"')
(7, '0.032*"people" + 0.021*"like" + 0.017*"know" + 0.017*"get" + 0.015*"would"')
(8, '0.099*"corona" + 0.034*"india" + 0.028*"lockdown" + 0.021*"narendramodi" + 0.019*"sir"')
(9, '0.009*"company" + 0.009*"health" + 0.009*"provide" + 0.009*"need" + 0.007*"support"')
(10, '0.032*"case" + 0.027*"coronavirus" + 0.026*"test" + 0.019*"hospital" + 0.019*"death"')
(11, '0.099*"corona" + 0.038*"virus" + 0.035*

In [251]:
# 5-word 12 topics
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

### 20C-Sym-Mul-15p

In [30]:
# 默认的symmetric 对称  15个pass 303s
start = time.time()
NUM_TOPICS = 20
ldamodel_all =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0,workers=30,alpha='symmetric')
topics = ldamodel_all.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.123*"corona" + 0.062*"virus" + 0.009*"coronavirus" + 0.008*"cure" + 0.005*"stay"')
(1, '0.072*"corona" + 0.028*"virus" + 0.017*"people" + 0.011*"world" + 0.011*"na"')
(2, '0.042*"corona" + 0.021*"virus" + 0.013*"hands" + 0.008*"mask" + 0.008*"wash"')
(3, '0.015*"say" + 0.011*"health" + 0.010*"coronavirus" + 0.009*"people" + 0.007*"government"')
(4, '0.043*"corona" + 0.036*"virus" + 0.023*"china" + 0.022*"trump" + 0.011*"realdonaldtrump"')
(5, '0.007*"business" + 0.007*"company" + 0.007*"say" + 0.006*"market" + 0.005*"coronavirus"')
(6, '0.052*"case" + 0.046*"corona" + 0.037*"death" + 0.026*"total" + 0.024*"update"')
(7, '0.035*"corona" + 0.005*"rt" + 0.005*"us" + 0.004*"n" + 0.004*"virus"')
(8, '0.054*"corona" + 0.016*"virus" + 0.009*"due" + 0.007*"hai" + 0.004*"se"')
(9, '0.019*"corona" + 0.012*"test" + 0.011*"virus" + 0.009*"people" + 0.009*"patient"')
(10, '0.101*"corona" + 0.033*"get" + 0.024*"virus" + 0.014*"like" + 0.010*"shit"')
(11, '0.017*"say" + 0.013*"state" + 0.010*"

In [31]:
#  15-passes
lda_display_all = pyLDAvis.gensim.prepare(ldamodel_all, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_all)

In [32]:
ldamodel_all.save(datapath('./TopicModels/ldamodel_all.model'))
with open('./TopicModels/lda_display_all.pickle', 'wb') as handle:
    pickle.dump(lda_display_all, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 2G_20C-Sym-Mul-15p_All

In [139]:
# symmetric 对称  15个pass 
start = time.time()
NUM_TOPICS = 20
ldamodel_sy_2g_all =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0,workers=30,alpha='symmetric')
topics = ldamodel_sy_2g_all.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.002*"corona virus" + 0.002*"corona say" + 0.001*"fight corona" + 0.001*"berkaitan pada" + 0.001*"papar berkaitan"')
(1, '0.008*"corona virus" + 0.002*"corona pandemic" + 0.001*"thanks corona" + 0.001*"visit corona" + 0.001*"breaking news"')
(2, '0.002*"corona virus" + 0.001*"spread coronavirus" + 0.001*"corona coronavirusoutbreak" + 0.001*"curbside pickup" + 0.001*"rt follow"')
(3, '0.001*"corona virus" + 0.001*"corona baby" + 0.001*"corona check" + 0.001*"fight corona" + 0.001*"social distance"')
(4, '0.004*"corona virus" + 0.004*"corona time" + 0.001*"coronavirus corona" + 0.001*"fight corona" + 0.001*"corona real"')
(5, '0.005*"fuck corona" + 0.002*"corona virus" + 0.001*"corona beer" + 0.001*"corona fuck" + 0.001*"news weather"')
(6, '0.003*"corona virus" + 0.002*"cure corona" + 0.001*"corona shit" + 0.001*"march cancel" + 0.001*"zacks rank"')
(7, '0.074*"corona virus" + 0.007*"stay home" + 0.005*"stay safe" + 0.003*"due corona" + 0.003*"spread corona"')
(8, '0.011*"corona v

In [140]:
lda_display_sy_2g_all = pyLDAvis.gensim.prepare(ldamodel_sy_2g_all, corpus, dictionary, sort_topics=False)
with open('./TopicModels/2g/ldamodel_sy_2g_all.pickle', 'wb') as handle:
    pickle.dump(ldamodel_sy_2g_all, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./TopicModels/2g/lda_display_sy_2g_all.pickle', 'wb') as handle:
    pickle.dump(lda_display_sy_2g_all, handle, protocol=pickle.HIGHEST_PROTOCOL)
pyLDAvis.display(lda_display_sy_2g_all)

### 2G_20C-Asym-Mul-15p_All

In [141]:
# asymmetric 非对称  15个pass 
start = time.time()
NUM_TOPICS = 20
ldamodel_asy_2g_all =  gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15,random_state=0,workers=30,alpha='asymmetric')
topics = ldamodel_asy_2g_all.print_topics(num_words=5)
for topic in topics:
    print(topic)
print(time.time()-start)

(0, '0.032*"corona virus" + 0.004*"get corona" + 0.004*"stay home" + 0.002*"gon na" + 0.002*"fight corona"')
(1, '0.023*"corona virus" + 0.003*"fight corona" + 0.002*"due corona" + 0.002*"stay home" + 0.002*"corona pandemic"')
(2, '0.003*"new york" + 0.002*"coronavirus pandemic" + 0.002*"public health" + 0.002*"social distance" + 0.002*"last week"')
(3, '0.002*"health care" + 0.002*"social distance" + 0.001*"public health" + 0.001*"confirm case" + 0.001*"corona virus"')
(4, '0.002*"corona virus" + 0.001*"infectious disease" + 0.001*"viral threat" + 0.001*"metre apart" + 0.001*"environmental health"')
(5, '0.001*"corona virus" + 0.001*"per cent" + 0.001*"product line" + 0.000*"fight corona" + 0.000*"mild disease"')
(6, '0.004*"corona update" + 0.004*"total death" + 0.004*"case total" + 0.004*"active case" + 0.004*"total recover"')
(7, '0.006*"corona virus" + 0.000*"due corona" + 0.000*"new york" + 0.000*"biological warfare" + 0.000*"virus wake"')
(8, '0.003*"corona virus" + 0.001*"new o

In [143]:
lda_display_asy_2g_all = pyLDAvis.gensim.prepare(ldamodel_asy_2g_all, corpus, dictionary, sort_topics=False)
with open('./TopicModels/2g/ldamodel_asy_2g_all.pickle', 'wb') as handle:
    pickle.dump(ldamodel_asy_2g_all, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('./TopicModels/2g/lda_display_asy_2g_all.pickle', 'wb') as handle:
    pickle.dump(lda_display_asy_2g_all, handle, protocol=pickle.HIGHEST_PROTOCOL)
pyLDAvis.display(lda_display_asy_2g_all)

# Load Model

In [75]:
ldamodel_all= LdaMulticore.load('./TopicModels/ldamodel_all.model')

In [88]:
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(text) for text in tqdm(tokens)]
len(corpus),len(tokens)

100%|██████████| 153806/153806 [00:04<00:00, 34226.39it/s]


In [103]:
new_text = dfs_tweets[23].full_text[10]
new_token = processText(new_text)
new_corpus = dictionary.doc2bow(new_token)
new_corpus

[(2, 1),
 (58, 1),
 (931, 1),
 (1376, 1),
 (2175, 1),
 (2742, 1),
 (4285, 1),
 (4796, 1),
 (6044, 1),
 (17952, 1)]

In [102]:
new_token

['rt',
 'holmesjosh',
 'democrat',
 'filibuster',
 'corona',
 'relief',
 'single',
 'irresponsible',
 'act',
 'seen',
 'years',
 'around']

In [104]:
ldamodel_all[new_corpus]

[(4, 0.55283713), (7, 0.11619392), (16, 0.25325277)]

# Sentiment Test
两个数据集中，每一个doc都对应一个1-20维度的向量，表示其与哪个topic更靠近，这个可以作为weight存储下来。
对于每一个doc，tweets按照整个doc，news按照句子级别切割为多个doc，每一个doc计算一个sentiment指数[-1,0,1], 每一个doc最靠近的那个topic得到这个doc的指数，
(也可直接取sentiment为【-1,0】连续概率值，这个值乘以topic vector的weight,分别个多个topic的sentiment值进行更新)
最终统计每个topic有多少个（或者是连续值）什么样sentiment的指数？

- sentiment for all text
- topic vectors as weight

In [80]:
np.array(tokens_tweets[0:10])

array([list(['billion', 'indian', 'let', 'work', 'together', 'best', 'stop', 'virus', 'corona', 'stayhome']),
       list(['abpnews', 'stop', 'corona', 'say', 'go', 'corona', 'whereby', 'ready', 'corona', 'spread', 'yeahnewschannelelectiontaqbetukikhabreinfailayega', 'abhinoelectiongroundnahibananahai', 'sachchaidikhao', 'deshkeliyekaamkro', 'bekarnahikehlaoge', 'channelleduboge']),
       list(['irupnd', 'whole', 'world', 'fighting', 'corona', 'pakistan', 'karuna']),
       list(['sant', 'rampal', 'ji', 'maharaj', 'ji', 'end', 'corona', 'epidemic', 'appeal', 'prime', 'minister', 'india', 'pray', 'saint', 'rampal', 'ji', 'maharaj']),
       list(['corona', 'hand', 'sanitizer', 'get']),
       list(['coronavirus', 'victim', 'say', 'expect', 'die', 'instead', 'say', 'hydroxychloroquine', 'save', 'life', 'via', 'gatewaypundit']),
       list(['qhl', 'penalty', 'kill', 'leaders', 'newmarket', 'brew', 'dog', 'parry', 'sound', 'pine', 'carlton', 'steak', 'wuhan', 'corona', 'lime', 'flamborou

In [27]:
with open('../SentimentAnalysis/tokenizer.pickle', 'rb') as handle:
    t = pickle.load(handle)

In [9]:
from tensorflow.keras.models import save_model,load_model
model_ge20 = load_model('../SentimentAnalysis/models/model_ge20')

In [85]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

text_seq = t.texts_to_sequences(tokens_tweets)
max_length = 21
text_pad = pad_sequences(text_seq, maxlen=max_length, padding='post')

In [128]:
sent_tweets = pd.Series([np.argmax(i) for i in model_ge20.predict(text_pad)])
sent_tweets.value_counts()

0    101611
1     33681
2     11931
dtype: int64

In [127]:
pd.Series(tweets_subset)[list(sent_tweets[sent_tweets==2].index)[:10]]

25     @StefanMolyneux is their image of genetic make...
26             This corona weather is ridiculously nice.
27     They closed the barbershop my dad works at bec...
34     Corona couldn’t wait til April fools? https://...
38     @DarjiRasikbhai @AshokPa72976701 Indian Sages ...
47            omg the emojis???? https://t.co/t98AknR0oi
61     Blessed Monday everyone 🙏 , Corona del Rosario...
98     I love my family, friends &amp; my nation. I a...
104    Okay great! hope you are sha practicing social...
105    GUJRAT STATE NEWS \n\nઆજ રાત ૧૨:૦૦ થી ૩૧/૦૩/૨૦...
dtype: object

In [125]:
tokens_tweets[sent_tweets[sent_tweets==2].index].head(20)

25     [image, genetic, makeup, explain, seeing, comb...
26                 [corona, weather, ridiculously, nice]
27     [close, barbershop, dad, works, barbershop, st...
34                      [corona, wait, til, april, fool]
38     [indian, sage, amp, present, great, yogi, amp,...
47                                         [omg, emojis]
61     [bless, monday, everyone, corona, del, rosario...
98     [love, family, friend, amp, nation, following,...
104    [okay, great, hope, sha, practice, social, dis...
105    [gujrat, state, news, આજ, dgp, thank, much, gu...
107    [really, zone, shame, race, see, following, sa...
131                               [corona, hmmmm, sweet]
143    [surely, not, celebrate, apocalypse, weird, ta...
147    [wish, make, course, financial, accounting, un...
162    [well, wishers, include, first, lady, sent, pr...
177    [guy, mom, big, nerd, ever, meet, giving, coro...
195    [real, nigga, following, twitter, street, happ...
198                       [nice

In [126]:
np.array(tokens_tweets[sent_tweets[sent_tweets==2].index])[:10]

array([list(['image', 'genetic', 'makeup', 'explain', 'seeing', 'combination', 'sars', 'corona', 'hiv', 'genome', 'together']),
       list(['corona', 'weather', 'ridiculously', 'nice']),
       list(['close', 'barbershop', 'dad', 'works', 'barbershop', 'street', 'one', 'catch', 'corona', 'dad', 'no', 'job', 'cause', 'wan', 'na', 'bring', 'client', 'house', 'risk', 'getting', 'us', 'infect']),
       list(['corona', 'wait', 'til', 'april', 'fool']),
       list(['indian', 'sage', 'amp', 'present', 'great', 'yogi', 'amp', 'visionary', 'spiritual', 'power', 'amp', 'theforesightednessofsaints', 'well', 'know', 'whole', 'world', 'sant', 'ravidas', 'composition', 'write', 'hundred', 'years', 'ago', 'prophecy', 'today', 'corona', 'pandemic']),
       list(['omg', 'emojis']),
       list(['bless', 'monday', 'everyone', 'corona', 'del', 'rosario', 'means', 'rosary', 'anothermondayanotheradventue', 'quarentena', 'yomequedoencasa', 'stayathome', 'timeofprayer', 'spreadlove']),
       list(['love

In [97]:
tweets_subset[27]

'They closed the barbershop my dad works at because a barbershop more down street one of them caught corona so now my dad has no job cause he doesn’t wanna bring clients to our house to risk getting us and himself infected'