In [1]:
# reference: 
# https://markroxor.github.io/gensim/tutorials/index.html
# https://radimrehurek.com/gensim/models/ldamodel.html
# https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

import pandas as pd
import re
import os
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaMallet

In [2]:
# Load the article table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
# Pre-process every article into a list of stemmed, stop-word-free tokens.
# `texts` ends up holding one token list per article that has text content.
texts = []
# Row label (index value in `df`) of each entry in `texts`. Rows without text
# are skipped below, so position k in `texts` (and later in the bag-of-words
# corpus) is not necessarily row k of `df`; use this list to map back.
text_row_labels = []

tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
stop_words.update({
    'nbsp',  # to exclude '&nbsp;' in the text
    'say',   # 'say' and 'says' dominate topic words if not excluded
    'says',  # (because our corpus is magazine articles?)
})
p_stemmer = PorterStemmer()

# pattern for html tags; compiled once instead of on every iteration
html_tag = re.compile('<.*?>')

for row_label, content in df['Content'].items():
    # some contents are 'nan' (not a string) so exclude them
    if not isinstance(content, str):
        continue

    # strip html tags, then lowercase
    raw = html_tag.sub('', content).lower()

    # tokenize
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [tok for tok in tokens if tok not in stop_words]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]

    # add tokens to list, remembering which row they came from
    texts.append(stemmed_tokens)
    text_row_labels.append(row_label)

In [4]:
# Build the vocabulary: Dictionary walks over `texts` and gives every distinct
# token its own integer id, gathering word counts and related statistics.
# (print(dictionary.token2id) shows the token -> id mapping.)
dictionary = corpora.Dictionary(texts)

# Turn every document into its bag-of-words vector.
# `corpus` holds one vector per document in `texts`; each vector is a
# sequence of (term ID, term frequency) tuples.
corpus = list(map(dictionary.doc2bow, texts))

In [5]:
%%time

# path to the mallet
os.environ['MALLET_HOME'] = 'C:/mallet-2.0.8'
mallet_path = 'C:/mallet-2.0.8/bin/mallet'

# LDA with Mallet implementation (uses Gibbs Sampling)
for i in range(4, 20):
    # construct a model
    ldamodel = LdaMallet(mallet_path, corpus=corpus, num_topics=i, id2word=dictionary)
    # save the model as files for later use
    ldamodel.save('models/lda_mallet_'+str(i)+'_topics.model')
    # print the topics, each represented as 5 words, just to check
    print('Lda model with', i, 'topics:', ldamodel.print_topics(num_topics=i, num_words=5))

Lda model with 4 topics: [(0, '0.016*"peopl" + 0.009*"work" + 0.006*"studi" + 0.006*"make" + 0.006*"thing"'), (1, '0.031*"student" + 0.027*"univers" + 0.012*"toronto" + 0.011*"program" + 0.011*"year"'), (2, '0.012*"research" + 0.010*"health" + 0.006*"develop" + 0.005*"work" + 0.005*"professor"'), (3, '0.009*"year" + 0.006*"ba" + 0.005*"toronto" + 0.005*"hous" + 0.005*"time"')]
Lda model with 5 topics: [(0, '0.012*"year" + 0.011*"univers" + 0.011*"toronto" + 0.010*"colleg" + 0.009*"student"'), (1, '0.027*"student" + 0.022*"univers" + 0.013*"program" + 0.010*"school" + 0.009*"year"'), (2, '0.008*"citi" + 0.007*"design" + 0.007*"compani" + 0.007*"make" + 0.007*"engin"'), (3, '0.014*"research" + 0.013*"health" + 0.012*"peopl" + 0.009*"studi" + 0.007*"care"'), (4, '0.009*"time" + 0.009*"peopl" + 0.008*"work" + 0.008*"day" + 0.007*"book"')]
Lda model with 6 topics: [(0, '0.042*"student" + 0.031*"univers" + 0.016*"program" + 0.014*"year" + 0.012*"faculti"'), (1, '0.011*"hous" + 0.010*"ba" + 0

Lda model with 15 topics: [(0, '0.026*"book" + 0.017*"write" + 0.013*"work" + 0.012*"english" + 0.011*"stori"'), (1, '0.019*"music" + 0.015*"play" + 0.015*"art" + 0.012*"show" + 0.012*"film"'), (2, '0.040*"univers" + 0.023*"student" + 0.021*"research" + 0.013*"educ" + 0.012*"world"'), (3, '0.029*"school" + 0.020*"work" + 0.019*"children" + 0.015*"year" + 0.014*"food"'), (4, '0.037*"health" + 0.020*"care" + 0.012*"peopl" + 0.012*"hospit" + 0.012*"medic"'), (5, '0.034*"research" + 0.015*"cell" + 0.011*"diseas" + 0.011*"brain" + 0.011*"cancer"'), (6, '0.019*"canada" + 0.014*"polit" + 0.012*"govern" + 0.012*"countri" + 0.009*"world"'), (7, '0.031*"toronto" + 0.030*"citi" + 0.024*"build" + 0.016*"design" + 0.012*"hous"'), (8, '0.082*"student" + 0.026*"univers" + 0.025*"program" + 0.022*"year" + 0.019*"faculti"'), (9, '0.032*"peopl" + 0.014*"thing" + 0.012*"differ" + 0.012*"make" + 0.010*"feel"'), (10, '0.024*"univers" + 0.019*"ba" + 0.017*"colleg" + 0.012*"caption" + 0.011*"canada"'), (11, 

In [None]:
%%time

# LDA with Gensim implementation (uses Variational Bayes sampling)
# I haven't learn this yet; I may compare the result with that of Mallet! 
for i in range(4, 20):
    # construct a model
    # passes value (the number of laps the model takes through the corpus) can be changed;
    # the larger the number of passes, the more accurate the model will be
    ldamodel = LdaModel(corpus=corpus, num_topics=i, id2word=dictionary, passes=20)
    # save the model as files for later use
    ldamodel.save('models/lda_gensim_'+str(i)+'_topics.model')
    # print the topics, each represented as 5 words, just to check
    print('Lda model with', i, 'topics:', ldamodel.print_topics(num_topics=i, num_words=5))

## LDA model with 10 topics (topics are named by me):
### topic 0 health research: '0.023*"health" + 0.017*"research" + 0.011*"medic" + 0.010*"care" + 0.010*"patient" + 0.009*"diseas" + 0.008*"hospit" + 0.008*"cell" + 0.008*"dr" + 0.008*"studi"'
### topic 1 politics and history: '0.015*"women" + 0.012*"canada" + 0.010*"world" + 0.010*"polit" + 0.009*"war" + 0.008*"canadian" + 0.007*"countri" + 0.006*"peopl" + 0.006*"histori" + 0.006*"intern"'
### topic 2 city of toronto: '0.026*"toronto" + 0.022*"build" + 0.022*"citi" + 0.012*"project" + 0.012*"design" + 0.010*"plan" + 0.010*"commun" + 0.009*"centr" + 0.009*"hous" + 0.009*"space"'
### topic 3 cultural works: '0.017*"book" + 0.013*"write" + 0.011*"music" + 0.010*"art" + 0.010*"work" + 0.009*"stori" + 0.009*"play" + 0.008*"year" + 0.008*"english" + 0.008*"writer"'
### topic 4 university: '0.023*"univers" + 0.020*"colleg" + 0.014*"ba" + 0.013*"year" + 0.012*"student" + 0.011*"award" + 0.009*"toronto" + 0.008*"presid" + 0.008*"000" + 0.008*"alumni"'
### topic 5 time: '0.013*"day" + 0.009*"caption" + 0.009*"time" + 0.007*"year" + 0.005*"room" + 0.005*"back" + 0.005*"night" + 0.005*"food" + 0.005*"align" + 0.005*"id"'
### topic 6 people: '0.025*"peopl" + 0.017*"work" + 0.012*"thing" + 0.012*"time" + 0.012*"life" + 0.011*"make" + 0.010*"year" + 0.010*"experi" + 0.010*"school" + 0.009*"differ"'
### topic 7 student: '0.057*"student" + 0.040*"univers" + 0.022*"program" + 0.016*"faculti" + 0.016*"research" + 0.013*"year" + 0.013*"educ" + 0.011*"school" + 0.010*"support" + 0.010*"academ"'
### topic 8 business: '0.012*"busi" + 0.010*"compani" + 0.008*"manag" + 0.008*"canada" + 0.008*"peopl" + 0.007*"cent" + 0.006*"chang" + 0.006*"govern" + 0.006*"market" + 0.006*"polici"'
### topic 9 science and technology: '0.012*"engin" + 0.012*"comput" + 0.011*"technolog" + 0.011*"research" + 0.009*"scienc" + 0.008*"team" + 0.007*"make" + 0.007*"world" + 0.007*"system" + 0.007*"work"'

In [10]:
# load the Mallet LDA model with 10 topics
# (the training cell saves its models inside the models/ directory)
ldamodel = LdaMallet.load("models/lda_mallet_10_topics.model")

### What is the most likely topic of a document? Let's take an example to see how well the model works. Article #20 in the data file is about a book on World War II written by a U of T professor. Link to the article: https://magazine.utoronto.ca/research-ideas/culture-society/rosemary-sullivan-nazis-in-france-wwii-villa-airbel-emergency-rescue-committee/

In [11]:
# bag-of-words vector of the example document (position 20 in `corpus`)
example_bow = corpus[20]
# indexing the model with a bag-of-words vector yields (topic, weight) pairs
ldamodel[example_bow]

[(0, 0.029817953546767105),
 (1, 0.35091023226616447),
 (2, 0.04896421845574388),
 (3, 0.16415568110483364),
 (4, 0.08505963590709353),
 (5, 0.10106716886377903),
 (6, 0.06559949780288764),
 (7, 0.0354676710608914),
 (8, 0.09635907093534213),
 (9, 0.022598870056497175)]

### The tags attached to the article are BOOKS, HISTORY, and SECOND WORLD WAR. Our model says it is most likely about topic 1 (politics and history) and second most likely about topic 3 (cultural works), which does not look too bad!

### We can also infer the topics of unseen documents and update the model with them (via `ldamodel.update(other_corpus)`).