In [1]:
# prepare env
import os
import json
import warnings

from tqdm import tqdm
import gensim
from gensim.models.wrappers import LdaMallet
import pyLDAvis
import pyLDAvis.gensim as gensimvis
# Ignore every warning category from this point on. The filter is installed
# after the imports above, so it applies to warnings raised by later cells.
warnings.filterwarnings('ignore')

# Put pyLDAvis into notebook mode so a prepared visualization that is the last
# expression of a cell is displayed inline.
pyLDAvis.enable_notebook()


In [2]:
# load data

def load_corpus(dataset_dir):
    """Collect the ``'candidates'`` value of every tweet found under ``dataset_dir``.

    Each immediate sub-directory of ``dataset_dir`` is scanned (one level deep,
    not recursively). Files whose path ends with ``.jsonl`` and contains
    ``annotated`` are read as UTF-8 JSON Lines: every non-blank line is parsed
    as one JSON object and its ``'candidates'`` value is appended to the result.

    Sub-directories and files are visited in sorted name order so that the
    order of the returned corpus is reproducible across runs and machines.

    Args:
        dataset_dir: Path of the dataset root. Entries in it that are not
            directories are skipped.

    Returns:
        list: One item per tweet (its ``'candidates'`` value), in traversal order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed tweet has no ``'candidates'`` key.
    """
    corpus = []
    # os.listdir() yields entries in arbitrary order; sort for determinism.
    for month_dir in sorted(os.listdir(dataset_dir)):
        month_path = os.path.join(dataset_dir, month_dir)
        if not os.path.isdir(month_path):
            continue
        data_files = sorted(os.listdir(month_path))
        for filename in tqdm(data_files, total=len(data_files), desc=month_dir):
            path = os.path.join(month_path, filename)
            # NOTE: 'annotated' is matched against the full path (directory
            # names included), exactly as before, to stay backward-compatible.
            if not (path.endswith('.jsonl') and 'annotated' in path):
                continue
            # Explicit UTF-8: do not let the platform locale decide how
            # non-ASCII tweet text is decoded.
            with open(path, encoding='utf-8') as f:
                for line in f:
                    # Tolerate blank lines instead of crashing in json.loads().
                    if not line.strip():
                        continue
                    tweet = json.loads(line)
                    corpus.append(tweet['candidates'])
    return corpus

# Root directory of the tweet dataset; load_corpus() scans each of its
# sub-directories for annotated .jsonl files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
DATASET_DIR = "/home/handsome/research/COVID19-Twitter-Topics/data/COVID-19-Tweets-geo-lda"
# flush=True writes the message out immediately — presumably so it shows up
# before the progress bars drawn during loading.
print(f'\nLoading data from {DATASET_DIR}', flush=True)

# One list entry per tweet: the value stored under that tweet's 'candidates' key.
corpus = load_corpus(DATASET_DIR)


Loading data from /home/handsome/research/COVID19-Twitter-Topics/data/COVID-19-Tweets-geo-lda
2020-01: 100%|██████████| 484/484 [00:02<00:00, 208.12it/s]
2020-02: 100%|██████████| 1312/1312 [00:08<00:00, 161.85it/s]
2020-03: 100%|██████████| 1486/1486 [00:09<00:00, 162.50it/s]
2020-04: 100%|██████████| 1440/1440 [00:08<00:00, 160.44it/s]


In [3]:
# topics-10: load the saved model, build its pyLDAvis view, export it as HTML

DUMP_DIR = "/home/handsome/research/COVID19-Twitter-Topics/dump/lda/topics-10-mallet-iter2000-6.2"
model_path = os.path.join(DUMP_DIR, 'lda.model')
print(f'Loading model from {model_path}')

# Restore the persisted Mallet wrapper and convert it to a gensim LDA model,
# which is what the pyLDAvis gensim adapter is given below.
mallet_model = LdaMallet.load(model_path)
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_model)

# Bag-of-words encode every document with this model's own dictionary.
corpus_bow = list(map(model.id2word.doc2bow, corpus))

topic_10_vis = gensimvis.prepare(
    topic_model=model,
    corpus=corpus_bow,
    dictionary=model.id2word,
)
# Keep a standalone HTML copy next to the model files.
pyLDAvis.save_html(topic_10_vis, os.path.join(DUMP_DIR, 'vis.html'))
topic_10_vis

Loading model from /home/handsome/research/COVID19-Twitter-Topics/dump/lda/topics-10-mallet-iter2000-6.2/lda.model


In [4]:
# topics-20: load the saved model, build its pyLDAvis view, export it as HTML

DUMP_DIR = "/home/handsome/research/COVID19-Twitter-Topics/dump/lda/topics-20-mallet-iter2000-6.2"
model_path = os.path.join(DUMP_DIR, 'lda.model')
print(f'Loading model from {model_path}')

# Restore the persisted Mallet wrapper and convert it to a gensim LDA model,
# which is what the pyLDAvis gensim adapter is given below.
mallet_model = LdaMallet.load(model_path)
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_model)

# Bag-of-words encode every document with this model's own dictionary.
corpus_bow = list(map(model.id2word.doc2bow, corpus))

topic_20_vis = gensimvis.prepare(
    topic_model=model,
    corpus=corpus_bow,
    dictionary=model.id2word,
)
# Keep a standalone HTML copy next to the model files.
pyLDAvis.save_html(topic_20_vis, os.path.join(DUMP_DIR, 'vis.html'))
topic_20_vis

Loading model from /home/handsome/research/COVID19-Twitter-Topics/dump/lda/topics-20-mallet-iter2000-6.2/lda.model


In [5]:
# topics-100: load the saved model, build its pyLDAvis view, export it as HTML

DUMP_DIR = "/home/handsome/research/COVID19-Twitter-Topics/dump/lda/topics-100-mallet-iter2000-6.2"
model_path = os.path.join(DUMP_DIR, 'lda.model')
print(f'Loading model from {model_path}')

# Restore the persisted Mallet wrapper and convert it to a gensim LDA model,
# which is what the pyLDAvis gensim adapter is given below.
mallet_model = LdaMallet.load(model_path)
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_model)

# Bag-of-words encode every document with this model's own dictionary.
corpus_bow = list(map(model.id2word.doc2bow, corpus))

topic_100_vis = gensimvis.prepare(
    topic_model=model,
    corpus=corpus_bow,
    dictionary=model.id2word,
)
# Keep a standalone HTML copy next to the model files.
pyLDAvis.save_html(topic_100_vis, os.path.join(DUMP_DIR, 'vis.html'))
topic_100_vis

Loading model from /home/handsome/research/COVID19-Twitter-Topics/dump/lda/topics-100-mallet-iter2000-6.2/lda.model
