## Part 2: Topic Modelling

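This part fits an LDA topic model to the `body` text of the documents in `Monash_crawled.csv`. The shared preprocessing is: lowercase and tokenise each document, drop purely numeric and single-character tokens, lemmatise, append frequent bigrams, and build a filtered dictionary and bag-of-words corpus. The model is then trained, its topic coherence is reported, and the topics are visualised with pyLDAvis.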


In [None]:
import pandas as pd
import logging
import nltk
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases
from gensim.corpora import Dictionary




In [None]:
# Read the CSV and keep the 'body' column as a plain Python list,
# one entry per row; the printed number is the document count.
CSV_PATH = 'Monash_crawled.csv'
df_uni = pd.read_csv(CSV_PATH)
body_column = df_uni['body']
docs = body_column.tolist()
print(len(docs))


366


In [None]:
# Lowercase each document and split it into word tokens (runs of \w characters),
# then, in the same pass, drop tokens that are purely numeric or only one
# character long. Tokens that merely contain digits (e.g. '42c') are kept.
tokenizer = RegexpTokenizer(r'\w+')
docs = [
    [
        word
        for word in tokenizer.tokenize(text.lower())
        if len(word) > 1 and not word.isnumeric()
    ]
    for text in docs
]


In [None]:
# Replace every token with its WordNet lemma (default part of speech).
nltk.download('wordnet')  # fetch the WordNet data used by the lemmatizer
lemmatizer = WordNetLemmatizer()
lemmatize = lemmatizer.lemmatize
docs = [list(map(lemmatize, doc)) for doc in docs]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\GHOST\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Learn bigrams that occur at least 20 times and append them to each document.
# Only a single Phrases pass is run, so only bigrams (not trigrams) are added.
# The original unigrams are kept; joined tokens are recognised by the '_' in them.
bigram = Phrases(docs, min_count=20)
for doc in docs:
    doc.extend(token for token in bigram[doc] if '_' in token)

2020-05-09 07:06:01,875 : INFO : collecting all words and their counts
2020-05-09 07:06:01,876 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-05-09 07:06:02,294 : INFO : collected 134249 word types from a corpus of 318836 words (unigram + bigrams) and 366 sentences
2020-05-09 07:06:02,294 : INFO : using 134249 counts as vocab in Phrases<0 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>


In [None]:
# Build the token -> id mapping over all documents.
dictionary = Dictionary(docs)

# Prune the vocabulary: drop tokens that occur in fewer than 20 documents
# or in more than 50% of the documents.
MIN_DOC_COUNT = 20
MAX_DOC_FRACTION = 0.5
dictionary.filter_extremes(no_below=MIN_DOC_COUNT, no_above=MAX_DOC_FRACTION)

2020-05-09 07:06:03,211 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-05-09 07:06:03,453 : INFO : built Dictionary(15592 unique tokens: ['1am', '42c', 'a', 'a_well', 'aavoid']...) from 366 documents (total 338108 corpus positions)
2020-05-09 07:06:03,470 : INFO : discarding 14180 tokens: [('1am', 4), ('42c', 1), ('a', 340), ('aavoid', 1), ('acertainly', 1), ('actas', 1), ('acting', 16), ('advise', 7), ('aextremely', 1), ('air_pollution', 8)]...
2020-05-09 07:06:03,470 : INFO : keeping 1412 tokens which were in no less than 20 and no more than 183 (=50.0%) documents
2020-05-09 07:06:03,475 : INFO : resulting dictionary: Dictionary(1412 unique tokens: ['a_well', 'able', 'able_to', 'above', 'according']...)


In [None]:
# Convert every document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Report the vocabulary size after filtering and the number of documents.
print(f'Number of unique tokens: {len(dictionary):d}')
print(f'Number of documents: {len(corpus):d}')

Number of unique tokens: 1412
Number of documents: 366


In [None]:
from gensim.models import LdaModel

# Train an LDA topic model on the bag-of-words corpus and save it to disk.

# Set training parameters.
NUM_TOPICS = 30
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
# LDA training is stochastic. Fix the seed so that re-running the notebook on
# the same data and library versions gives the same topics.
RANDOM_STATE = 42

# Make an index-to-word dictionary.
# dictionary.id2token is filled lazily; indexing the dictionary once forces it
# to be built before it is handed to the model.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',  # learn the document-topic prior from the data
    eta='auto',    # learn the topic-word prior from the data
    iterations=iterations,
    num_topics=NUM_TOPICS,
    passes=passes,
    eval_every=eval_every,
    random_state=RANDOM_STATE
)

# Save the trained model; the file name encodes the number of topics.
outputfile = f'model{NUM_TOPICS}.gensim'
print("Saving model in " + outputfile)
print("")
model.save(outputfile)

2020-05-09 07:06:03,609 : INFO : using autotuned alpha, starting with [0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335]
2020-05-09 07:06:03,610 : INFO : using serial LDA version on this node
2020-05-09 07:06:03,619 : INFO : running online (multi-pass) LDA training, 30 topics, 20 passes over the supplied corpus of 366 documents, updating model once every 366 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2020-05-09 07:06:03,620 : INFO : PROGRESS: pass 0, at document #366/366
2020-05-09 07:06:08,650 : INFO : optimized alpha [0.027370028, 0.03198991, 0.030434292, 0.03052945, 0.02800653, 0.02938

2020-05-09 07:06:14,306 : INFO : topic #12 (0.021): 0.019*"his" + 0.015*"mr" + 0.012*"we_re" + 0.011*"trade" + 0.010*"impact" + 0.009*"japan" + 0.009*"re" + 0.008*"ship" + 0.008*"what" + 0.008*"hour"
2020-05-09 07:06:14,306 : INFO : topic #17 (0.029): 0.013*"mask" + 0.010*"face" + 0.010*"face_mask" + 0.007*"school" + 0.006*"spread" + 0.005*"covid" + 0.005*"state" + 0.005*"area" + 0.005*"our" + 0.005*"indonesia"
2020-05-09 07:06:14,307 : INFO : topic #23 (0.032): 0.010*"symptom" + 0.009*"wuhan" + 0.009*"spread" + 0.008*"patient" + 0.007*"outbreak" + 0.007*"disease" + 0.006*"may" + 0.006*"human" + 0.006*"hospital" + 0.006*"infected"
2020-05-09 07:06:14,307 : INFO : topic #26 (0.038): 0.014*"wuhan" + 0.013*"flight" + 0.010*"confirmed" + 0.009*"passenger" + 0.009*"hospital" + 0.007*"city" + 0.006*"she" + 0.006*"japan" + 0.006*"outbreak" + 0.006*"january_january"
2020-05-09 07:06:14,308 : INFO : topic diff=0.993429, rho=0.408248
2020-05-09 07:06:14,312 : INFO : PROGRESS: pass 5, at document

2020-05-09 07:06:20,171 : INFO : topic #12 (0.017): 0.024*"his" + 0.018*"mr" + 0.017*"trade" + 0.016*"we_re" + 0.012*"re" + 0.011*"impact" + 0.010*"what" + 0.010*"technology" + 0.009*"friend" + 0.009*"hour"
2020-05-09 07:06:20,171 : INFO : topic #22 (0.031): 0.020*"wuhan" + 0.018*"island" + 0.014*"flight" + 0.014*"christmas" + 0.014*"christmas_island" + 0.011*"mr" + 0.011*"pictured" + 0.011*"mask" + 0.010*"evacuee" + 0.009*"february_february"
2020-05-09 07:06:20,172 : INFO : topic #23 (0.032): 0.012*"symptom" + 0.011*"spread" + 0.010*"wuhan" + 0.009*"patient" + 0.008*"outbreak" + 0.008*"disease" + 0.007*"may" + 0.007*"human" + 0.006*"animal" + 0.006*"sars"
2020-05-09 07:06:20,173 : INFO : topic #26 (0.043): 0.015*"wuhan" + 0.014*"flight" + 0.012*"confirmed" + 0.010*"hospital" + 0.008*"passenger" + 0.007*"man" + 0.007*"infected" + 0.007*"melbourne" + 0.007*"city" + 0.007*"outbreak"
2020-05-09 07:06:20,174 : INFO : topic diff=0.701830, rho=0.301511
2020-05-09 07:06:20,177 : INFO : PROGRE

2020-05-09 07:06:25,772 : INFO : topic #12 (0.016): 0.028*"his" + 0.020*"mr" + 0.020*"trade" + 0.016*"we_re" + 0.013*"re" + 0.012*"friend" + 0.011*"impact" + 0.011*"what" + 0.010*"technology" + 0.010*"hour"
2020-05-09 07:06:25,773 : INFO : topic #13 (0.016): 0.038*"flight" + 0.036*"zealand" + 0.036*"new_zealand" + 0.024*"cruise" + 0.019*"airline" + 0.019*"way" + 0.016*"air" + 0.015*"sydney" + 0.015*"qantas" + 0.013*"travel"
2020-05-09 07:06:25,774 : INFO : topic #23 (0.033): 0.013*"symptom" + 0.012*"spread" + 0.010*"wuhan" + 0.009*"patient" + 0.009*"outbreak" + 0.008*"disease" + 0.008*"may" + 0.008*"human" + 0.007*"sars" + 0.007*"animal"
2020-05-09 07:06:25,775 : INFO : topic #22 (0.033): 0.021*"wuhan" + 0.018*"island" + 0.015*"flight" + 0.014*"christmas" + 0.014*"christmas_island" + 0.011*"mr" + 0.011*"pictured" + 0.011*"evacuee" + 0.009*"february_february" + 0.009*"mask"
2020-05-09 07:06:25,775 : INFO : topic #26 (0.046): 0.015*"wuhan" + 0.013*"confirmed" + 0.013*"flight" + 0.011*"ho

2020-05-09 07:06:31,239 : INFO : topic #12 (0.014): 0.030*"his" + 0.022*"trade" + 0.022*"mr" + 0.016*"we_re" + 0.014*"re" + 0.013*"friend" + 0.011*"impact" + 0.011*"what" + 0.010*"economic" + 0.010*"hour"
2020-05-09 07:06:31,240 : INFO : topic #13 (0.016): 0.045*"zealand" + 0.045*"new_zealand" + 0.042*"flight" + 0.021*"cruise" + 0.020*"airline" + 0.019*"qantas" + 0.018*"way" + 0.017*"air" + 0.015*"sydney" + 0.012*"travel"
2020-05-09 07:06:31,240 : INFO : topic #23 (0.033): 0.013*"symptom" + 0.013*"spread" + 0.009*"wuhan" + 0.009*"outbreak" + 0.009*"patient" + 0.009*"disease" + 0.008*"may" + 0.008*"human" + 0.007*"sars" + 0.007*"animal"
2020-05-09 07:06:31,241 : INFO : topic #22 (0.036): 0.021*"wuhan" + 0.019*"island" + 0.016*"flight" + 0.015*"christmas" + 0.014*"christmas_island" + 0.012*"mr" + 0.011*"pictured" + 0.011*"evacuee" + 0.009*"february_february" + 0.009*"quarantine"
2020-05-09 07:06:31,242 : INFO : topic #26 (0.048): 0.016*"wuhan" + 0.014*"confirmed" + 0.012*"flight" + 0.011

Saving model in model30.gensim



In [None]:
from pprint import pprint

# Each entry of top_topics is a (word list, coherence score) pair, where the
# word list holds (probability, word) tuples.
top_topics = model.top_topics(corpus)

# Average topic coherence: the sum of the per-topic coherence scores divided
# by the number of topics.
avg_topic_coherence = sum(score for _, score in top_topics) / NUM_TOPICS
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -1.1036.
[([(0.021346565, 'wuhan'),
   (0.01871146, 'island'),
   (0.016116045, 'flight'),
   (0.014505855, 'christmas'),
   (0.014258624, 'christmas_island'),
   (0.011681635, 'mr'),
   (0.011357654, 'pictured'),
   (0.0110079385, 'evacuee'),
   (0.009418573, 'february_february'),
   (0.009232303, 'quarantine'),
   (0.008890045, 'january_january'),
   (0.007933898, 'mask'),
   (0.007540776, 'his'),
   (0.0074587017, 'qantas'),
   (0.006839525, 'passenger'),
   (0.006557935, 'taken'),
   (0.0065198136, 'city'),
   (0.0062943576, 'citizen'),
   (0.0062512457, 'confirmed'),
   (0.005646144, 'home')],
  -0.6381938720396149),
 ([(0.03742969, 'ship'),
   (0.03247695, 'princess'),
   (0.03065182, 'passenger'),
   (0.030460846, 'diamond'),
   (0.030444162, 'diamond_princess'),
   (0.02874058, 'japan'),
   (0.027082168, 'cruise'),
   (0.018726109, 'cruise_ship'),
   (0.018040119, 'tested'),
   (0.017303823, 'positive'),
   (0.015935129, 'tested_positive'),
   (0.013230

In [None]:
# Interactive topic visualization with pyLDAvis.
# Requires the pyLDAvis package to be installed in the kernel's environment.
import pyLDAvis
try:
    # Newer pyLDAvis releases (3.3+) ship the gensim adapter as `gensim_models`.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older releases expose the same adapter as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensimvis

lda_display = gensimvis.prepare(model, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_display)

2020-05-09 07:06:32,723 : INFO : Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-05-09 07:06:32,723 : INFO : NumExpr defaulting to 8 threads.
