In [2]:
import re
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt  
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from tqdm import tqdm
from nltk.corpus import stopwords
stopwordEn = stopwords.words('english')
from nltk.corpus import wordnet
import pickle
import multiprocessing as mp
import gensim
from gensim import corpora

# import pyLDAvis
# import pyLDAvis.gensim_models as gensim
# pyLDAvis.enable_notebook()

# import pyLDAvis.gensim
import time

# Report how many CPUs multiprocessing sees on this machine
# (the LDA cells further down hard-code workers=30).
print('CPU numbers:',mp.cpu_count())
def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)
def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to ``df`` in parallel by splitting it across processes.

    Parameters
    ----------
    df : pandas Series or DataFrame
        Data to process; it is cut into ``workers`` chunks with
        ``np.array_split``.
    func : callable
        Function handed to ``chunk.apply``. It is sent to worker processes,
        so it must be picklable.
    **kwargs
        ``workers`` (int, optional): number of processes and of chunks.
        Defaults to ``mp.cpu_count()`` when omitted.
        Every other keyword is forwarded unchanged to ``chunk.apply``.

    Returns
    -------
    The per-chunk results concatenated with ``pd.concat`` in chunk order.
    """
    workers = kwargs.pop('workers', mp.cpu_count())
    # Split before the pool exists so a bad `workers` value cannot leak processes.
    chunks = np.array_split(df, workers)
    pool = mp.Pool(processes=workers)
    try:
        result = pool.map(_apply_df, [(chunk, func, kwargs) for chunk in chunks])
    finally:
        # Always release the worker processes, including when a task raised.
        pool.close()
        pool.join()
    return pd.concat(list(result))
#apply_by_multiprocessing(fullset['Text'], process_text, workers=cores)
def lemmaWord(word):
    """Return ``wordnet.morphy(word)``, or ``word`` itself when that is None."""
    base_form = wordnet.morphy(word)
    return word if base_form is None else base_form
def processText(text, lemma=False, gram=1):
    """Turn raw text into a list of cleaned tokens or n-grams.

    Steps: strip URLs, tokenize with ``word_tokenize``, keep only purely
    alphabetic tokens whose lower-cased form is not in ``stopwordEn``,
    lower-case them and, when ``lemma`` is truthy, pass them through
    ``lemmaWord``.

    Returns the token list when ``gram <= 1``; otherwise a list of
    ``gram``-sized tuples produced by ``nltk.ngrams``.
    """
    url_free = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
    kept = []
    for raw in word_tokenize(url_free):
        lowered = raw.lower()
        # Skip stop words and anything containing digits or punctuation.
        if lowered in stopwordEn or not raw.isalpha():
            continue
        kept.append(lemmaWord(lowered) if lemma else lowered)
    if gram <= 1:
        return kept
    return list(nltk.ngrams(kept, gram))

CPU numbers: 32


# Topic Modelling

In [3]:
# Day numbers 19..30 (range() excludes 31); each one is substituted into the
# 'df_03{}' file names below.
dates = range(19,31)
# Per-day data keyed by day number.
# NOTE(review): the pickled objects are presumably pandas DataFrames with a
# `tokens` column — confirm against the code that wrote them.
dfs_news, dfs_tweets={},{}

# filtered data without lemmatization
# NOTE: pickle.load can execute arbitrary code embedded in a file, so only load
# pickles that this project produced itself.
# The loop variable `d` remains bound to the last date once the loop finishes.
for d in tqdm(dates):
    with open('./IEEE_news/filtered_data/df_03{}_news_filtered.pickle'.format(d), 'rb') as handle:
        dfs_news[d] = pickle.load(handle)
    with open('./IEEE_tweets/filtered_data/df_03{}_tweets_filtered.pickle'.format(d), 'rb') as handle:
        dfs_tweets[d] = pickle.load(handle)

100%|██████████| 12/12 [02:59<00:00, 14.92s/it]


In [8]:
# Number of news rows for the last date. Indexing with dates[-1] instead of the
# leaked loop variable `d` makes the result independent of which loop ran last.
len(dfs_news[dates[-1]])

193

In [13]:
# Total document counts across all dates, printing the tweet count per day.
len_news = len_tweets = 0
for d in dates:
    daily_tweets = len(dfs_tweets[d])
    len_news += len(dfs_news[d])
    len_tweets += daily_tweets
    print(daily_tweets)
len_news, len_tweets

61449
247324
270573
292984
304474
299275
269272
275437
430836
238116
90919
149866


(899960, 2930525)

## Subset

Subset sizes: 8994 news documents and 200658 tweets.

In [3]:
# Fraction of each day's documents kept in the subset.
sampling_rate_news = 0.01
# The tweet rate equals 0.01 * 2776332/405460; the bare ratio is about 6.85,
# which would not be a valid sampling fraction.
# NOTE(review): the two counts are presumably corpus sizes chosen so that both
# subsets carry a similar number of tokens — confirm.
sampling_rate_tweets = 0.06847363488 # 0.01 * 2776332/405460

tokens_tweets,tokens_news = [],[]
for d in dates:    
    # The seed is reset before every draw, so each day's sample is reproducible
    # on its own. int() truncates the per-day sample size.
    np.random.seed(1)
    tokens_news.extend(np.random.choice(dfs_news[d].tokens, size=int(len(dfs_news[d])*sampling_rate_news), replace=False))
    np.random.seed(1)
    tokens_tweets.extend(np.random.choice(dfs_tweets[d].tokens, size=int(len(dfs_tweets[d])*sampling_rate_tweets), replace=False))
    
# Printed order is news first, then tweets.
print('subset:',len(tokens_news),len(tokens_tweets))

subset: 8994 200658


In [6]:
# Wrap the sampled token lists in Series, then drop the notebook's references
# to the per-day data.
# Re-running this cell raises NameError on the `del`, because both names are
# already gone after the first run.
tokens_tweets = pd.Series(tokens_tweets)
tokens_news = pd.Series(tokens_news)
del dfs_news, dfs_tweets

In [7]:
# Ratio of total token counts: sampled tweets over sampled news.
sum(map(len, tokens_tweets)) / sum(map(len, tokens_news))

0.9991751706928422

## Tweets

In [31]:
from gensim import corpora
# Vocabulary over the sampled tweets, then one bag-of-words vector per tweet.
dictionary = corpora.Dictionary(tokens_tweets)
corpus = list(map(dictionary.doc2bow, tqdm(tokens_tweets)))

100%|██████████| 146612/146612 [00:05<00:00, 24982.71it/s]


In [34]:
# Default 'symmetric' alpha; 15 passes, 303 s.
start = time.time()
NUM_TOPICS = 20
ldamodel_tweets = gensim.models.ldamulticore.LdaMulticore(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=0,
    workers=30,
    alpha='symmetric',
)
# Show the five highest-weighted words of every topic, one topic per line.
topics = ldamodel_tweets.print_topics(num_words=5)
for topic in topics:
    print(topic)
# Elapsed wall-clock seconds for training plus printing.
print(time.time() - start)

(0, '0.061*"corona" + 0.036*"virus" + 0.021*"ji" + 0.014*"india" + 0.012*"modi"')
(1, '0.052*"corona" + 0.024*"amp" + 0.021*"coronavirus" + 0.015*"virus" + 0.008*"lockdown"')
(2, '0.051*"corona" + 0.009*"share" + 0.008*"app" + 0.008*"amp" + 0.007*"please"')
(3, '0.058*"corona" + 0.018*"new" + 0.016*"virus" + 0.011*"gt" + 0.008*"ass"')
(4, '0.059*"corona" + 0.044*"cases" + 0.028*"total" + 0.021*"deaths" + 0.020*"coronavirus"')
(5, '0.049*"corona" + 0.046*"not" + 0.021*"people" + 0.019*"please" + 0.015*"virus"')
(6, '0.085*"corona" + 0.036*"na" + 0.021*"gon" + 0.020*"u" + 0.020*"virus"')
(7, '0.057*"corona" + 0.016*"help" + 0.014*"us" + 0.013*"virus" + 0.012*"fight"')
(8, '0.068*"corona" + 0.023*"virus" + 0.017*"get" + 0.013*"time" + 0.011*"would"')
(9, '0.055*"corona" + 0.027*"god" + 0.014*"virus" + 0.012*"us" + 0.011*"people"')
(10, '0.069*"corona" + 0.052*"virus" + 0.012*"death" + 0.012*"people" + 0.010*"china"')
(11, '0.068*"corona" + 0.021*"us" + 0.015*"virus" + 0.013*"coronavirus" 

In [35]:
# Interactive pyLDAvis view of the 15-pass tweet model.
# pyLDAvis is imported here because the imports near the top of the notebook
# are commented out and no earlier cell brings it into scope; without this the
# cell raises NameError on a fresh kernel.
import pyLDAvis.gensim
# `corpus` and `dictionary` must still be the tweet versions built above; the
# News section rebinds both names.
lda_display_tweets = pyLDAvis.gensim.prepare(ldamodel_tweets, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_tweets)

## News

In [30]:
from gensim import corpora
# Vocabulary over the sampled news, then one bag-of-words vector per document.
# This rebinds `dictionary` and `corpus`, replacing any earlier versions.
dictionary = corpora.Dictionary(tokens_news)
corpus = list(map(dictionary.doc2bow, tqdm(tokens_news)))

100%|██████████| 8994/8994 [00:02<00:00, 3159.79it/s]


In [31]:
# Default 'symmetric' alpha; 15 passes, 303 s.
start = time.time()
NUM_TOPICS = 20
ldamodel_news = gensim.models.ldamulticore.LdaMulticore(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=0,
    workers=30,
    alpha='symmetric',
)
# Show the five highest-weighted words of every topic, one topic per line.
topics = ldamodel_news.print_topics(num_words=5)
for topic in topics:
    print(topic)
# Elapsed wall-clock seconds for training plus printing.
print(time.time() - start)

(0, '0.015*"said" + 0.011*"coronavirus" + 0.010*"people" + 0.010*"government" + 0.008*"lockdown"')
(1, '0.015*"said" + 0.014*"students" + 0.010*"school" + 0.009*"not" + 0.006*"schools"')
(2, '0.016*"coronavirus" + 0.016*"cases" + 0.015*"said" + 0.015*"new" + 0.011*"people"')
(3, '0.008*"not" + 0.007*"people" + 0.007*"time" + 0.007*"like" + 0.006*"said"')
(4, '0.011*"not" + 0.010*"coronavirus" + 0.010*"said" + 0.010*"people" + 0.009*"virus"')
(5, '0.008*"market" + 0.006*"sales" + 0.005*"company" + 0.005*"coronavirus" + 0.004*"price"')
(6, '0.020*"said" + 0.015*"health" + 0.011*"county" + 0.008*"coronavirus" + 0.007*"department"')
(7, '0.013*"said" + 0.008*"coronavirus" + 0.005*"billion" + 0.005*"market" + 0.005*"march"')
(8, '0.007*"not" + 0.005*"people" + 0.005*"us" + 0.005*"like" + 0.005*"coronavirus"')
(9, '0.007*"support" + 0.007*"services" + 0.006*"health" + 0.005*"help" + 0.005*"business"')
(10, '0.015*"said" + 0.009*"not" + 0.009*"trump" + 0.008*"would" + 0.008*"coronavirus"')
(1

In [32]:
# pyLDAvis is imported in this cell because the imports near the top of the
# notebook are commented out.
import pyLDAvis.gensim
# Uses whichever `corpus` and `dictionary` are currently bound; they must be
# the ones ldamodel_news was trained on.
# sort_topics=False: per the pyLDAvis docs this keeps the model's own topic
# order instead of re-sorting topics.
lda_display_news = pyLDAvis.gensim.prepare(ldamodel_news, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_news)

## News + Tweets

In [8]:
# Pool the sampled tweets and news into one list of token lists (tweets first),
# then shuffle in place with a fixed seed so the mixed order is reproducible.
tokens = list(pd.concat([tokens_tweets,tokens_news]))
np.random.seed(1)
np.random.shuffle(tokens)

In [9]:
from gensim import corpora
# Vocabulary over the pooled tweets + news, then one bag-of-words vector per document.
# This rebinds `dictionary` and `corpus`, replacing any earlier versions.
dictionary = corpora.Dictionary(tokens)
corpus = list(map(dictionary.doc2bow, tqdm(tokens)))

100%|██████████| 209652/209652 [00:08<00:00, 24244.63it/s]


In [12]:
# Persist the currently bound `dictionary` (the pooled tweets + news vocabulary
# when cells run top to bottom).
# NOTE(review): assumes ./TopicModels/ already exists — open() does not create
# directories.
with open('./TopicModels/dictionary_all_filtered.pickle', 'wb') as handle:
    pickle.dump(dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Train a 20-topic LDA model on the currently bound corpus and time the run.
start = time.time()
NUM_TOPICS = 20
ldamodel_all = gensim.models.ldamulticore.LdaMulticore(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=0,
    workers=30,
    alpha='symmetric',
)
# Show the five highest-weighted words of every topic, one topic per line.
topics = ldamodel_all.print_topics(num_words=5)
for topic in topics:
    print(topic)
# Elapsed wall-clock seconds for training plus printing.
print(time.time() - start)

(0, '0.075*"corona" + 0.023*"virus" + 0.023*"stay" + 0.016*"fight" + 0.014*"india"')
(1, '0.051*"corona" + 0.010*"virus" + 0.009*"doctors" + 0.008*"us" + 0.005*"not"')
(2, '0.009*"said" + 0.006*"not" + 0.005*"also" + 0.004*"health" + 0.004*"work"')
(3, '0.056*"china" + 0.054*"corona" + 0.044*"virus" + 0.023*"chinese" + 0.018*"world"')
(4, '0.090*"corona" + 0.027*"virus" + 0.014*"not" + 0.013*"get" + 0.013*"like"')
(5, '0.032*"corona" + 0.008*"music" + 0.007*"one" + 0.005*"us" + 0.005*"time"')
(6, '0.085*"corona" + 0.038*"na" + 0.024*"virus" + 0.022*"gon" + 0.013*"go"')
(7, '0.013*"people" + 0.011*"coronavirus" + 0.009*"not" + 0.007*"use" + 0.006*"data"')
(8, '0.071*"corona" + 0.032*"virus" + 0.022*"cases" + 0.016*"total" + 0.016*"deaths"')
(9, '0.051*"corona" + 0.019*"virus" + 0.014*"not" + 0.013*"sir" + 0.012*"people"')
(10, '0.068*"corona" + 0.033*"virus" + 0.019*"people" + 0.019*"us" + 0.018*"no"')
(11, '0.017*"students" + 0.016*"school" + 0.012*"not" + 0.007*"said" + 0.006*"schools

In [14]:
# pyLDAvis is imported in this cell because the imports near the top of the
# notebook are commented out.
import pyLDAvis.gensim
# Uses whichever `corpus` and `dictionary` are currently bound; they must be
# the ones ldamodel_all was trained on.
lda_display_all = pyLDAvis.gensim.prepare(ldamodel_all, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_all)

In [9]:
# NOTE(review): this cell repeats the preceding cell verbatim; running it only
# recomputes and rebinds lda_display_all. Consider deleting one of the two.
import pyLDAvis.gensim
lda_display_all = pyLDAvis.gensim.prepare(ldamodel_all, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display_all)

In [15]:
# Persist the combined model with its own save() method, and pickle the
# prepared pyLDAvis data so the visualisation need not be recomputed.
# NOTE(review): assumes ./TopicModels/ already exists — confirm.
ldamodel_all.save('./TopicModels/ldamodel_all_filtered.model')
with open('./TopicModels/lda_display_all_filtered.pickle', 'wb') as handle:
    pickle.dump(lda_display_all, handle, protocol=pickle.HIGHEST_PROTOCOL)