# Topic Modeling
Topic modeling is necessary so that news articles can be grouped by topic. This will provide additional ways to filter through the articles within the app.

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel # for evaluating LDA model
import json
import re

In [2]:
# only need to run this once
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/markus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
with open('../secrets.json') as file:
    secrets = json.load(file)
    connection_string = secrets['connection_string']
db = create_engine(connection_string)
df = pd.read_sql('select * from news_article', con=db)

### code for exporting data from AWS database
This was only needed once to move the data to the new database so this can be ignored

In [4]:
# df['date_published'].ffill(inplace=True)
# df = df.drop(columns='id')
# df = df[df['content'].str.len() > 0]
# df = df[df['headline'].str.len() > 0]
# df.to_csv('articles.csv', index=False)

### Get the content of the articles

In [5]:
# article1 = df.iloc[-1]['content']
# article2 = df.iloc[-2]['content']
# articles = [article1, article2]
articles = [df.iloc[i]['content'] for i in range(len(df))]

### Get words from the article

In [6]:
# tokens = word_tokenize(article)
# text = nltk.Text(tokens)
# text

### Show the collacations
These are words that appeared consecutively in the text. More specifically, words that appear consecutively and not by chance, so they have meaning when put together.

In [7]:
# text.collocations()

### Tokenize, lemmatize, remove stopwords, stem and discard words fewer than 3 chars
- Tokenization involves splitting the article into words.
- Lemmatization is getting words into a standard form. Words in third person are changed to first person and verbs are converted to present tense.
    - ex: disapperances -> disappearance
- Stemming is reducing words to their root form. This also converts all words to lower case.
    - ex: disappearance -> disappear
- Stopwords are words like "the", "a", "an", etc.

In [8]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [9]:
def preprocess(article):
    tokens = word_tokenize(article.lower()) # make all articles lower case
    words = [] # words resulting from applying the filters

    for token in tokens:
        if len(token) > 3 and token not in stop_words:
            words.append(lemmatizer.lemmatize(token))
    
    return words

In [10]:
preprocessed_articles = [preprocess(article) for article in articles]
# preprocessed_articles

### Create a dictionary of text and bag of words
A bag of words is a list of tuples of the form (token id, count of token)

In [11]:
dictionary = Dictionary(preprocessed_articles)
corpus = [dictionary.doc2bow(article) for article in preprocessed_articles]

# for each tuple in the corpus, the first element is the word index and the second element
# is the number of times it appears in the text
# for c in corpus:
#     for item in c:
#         print(f'{item} -- {dictionary[item[0]]}')

### Create the LDA model for topic modeling
This trains a model and creates however many topics are specified. It doesn't assign names to the topics, so these need to be inferred.

In [12]:
model = LdaMulticore(corpus, num_topics=4, id2word=dictionary, passes=10, workers=2, chunksize=100)

### Structure the topics as a dict and parse out the words for each topic

In [13]:
model.print_topics()

[(0,
  '0.016*"said" + 0.006*"china" + 0.006*"government" + 0.006*"would" + 0.005*"state" + 0.005*"trump" + 0.005*"also" + 0.005*"country" + 0.005*"president" + 0.004*"year"'),
 (1,
  '0.008*"said" + 0.008*"year" + 0.006*"climate" + 0.004*"world" + 0.004*"change" + 0.004*"also" + 0.003*"water" + 0.003*"animal" + 0.003*"area" + 0.003*"found"'),
 (2,
  '0.013*"said" + 0.011*"police" + 0.007*"people" + 0.006*"protest" + 0.005*"navalny" + 0.005*"woman" + 0.004*"2020" + 0.004*"protester" + 0.004*"right" + 0.004*"officer"'),
 (3,
  '0.016*"said" + 0.010*"people" + 0.009*"covid-19" + 0.009*"health" + 0.008*"vaccine" + 0.008*"coronavirus" + 0.007*"case" + 0.006*"country" + 0.006*"pandemic" + 0.005*"virus"')]

In [14]:
raw_topics = model.print_topics()
topics = {}

for topic in raw_topics:
    topic_no = topic[0]
    topic_words = topic[1].split('+')
    
    # find words using regex and remove double quotes
    topic_words = [re.search('\".+\"', words).group().replace('"', '') for words in topic_words]
    
    # add topic_no and words to topics
    topics.update({topic_no: topic_words})

### Display the topics

In [15]:
for key in topics.keys():
    print(f'topic {key}')
    print(topics[key])
    print()

topic 0
['said', 'china', 'government', 'would', 'state', 'trump', 'also', 'country', 'president', 'year']

topic 1
['said', 'year', 'climate', 'world', 'change', 'also', 'water', 'animal', 'area', 'found']

topic 2
['said', 'police', 'people', 'protest', 'navalny', 'woman', '2020', 'protester', 'right', 'officer']

topic 3
['said', 'people', 'covid-19', 'health', 'vaccine', 'coronavirus', 'case', 'country', 'pandemic', 'virus']



### Topics
0. government/politics
1. science
2. social
3. coronavirus

### Save the model to a file

In [16]:
model.save('models/news_lda_model')

### Making predictions with the trained model

In [17]:
test_article = df.iloc[84]['content'] # nature
# test_article = df.iloc[3238]['content'] # tech
# test_article = df.iloc[5592]['content'] # science
# test_article = df.iloc[1339]['content'] # science
# test_article = df.iloc[4992]['content'] # business
# test_article = df.iloc[27]['content'] # 

In [18]:
preprocessed = preprocess(test_article)
# preprocessed

### turn the test article into a bag of words using the existing dicitonary

In [19]:
# model.id2word is the dictionary that was used to train the model
# this is useful so that I don't have to pickle the dictionary separately
bow = model.id2word.doc2bow(preprocessed)

In [20]:
pred = model[bow]
pred

[(0, 0.27824578), (1, 0.010174701), (2, 0.087160416), (3, 0.6244191)]

### Find the topic with the best match

In [21]:
predicted_topic = pred[0][0]
best_match = pred[0][1]

for p in pred:
    if p[1] > best_match:
        predicted_topic = p[0]
        best_match = p[1]

### The predicted topic

In [22]:
predicted_topic

3

In [23]:
# df[df['content'].str.contains('canada')]

### some helpful methods to use with the model

In [24]:
# show topics for a specific document
model.get_document_topics(bow)

[(0, 0.27820295), (1, 0.010230508), (2, 0.08716508), (3, 0.6244015)]

In [25]:
# get probability of each word in each topic
model.get_topics()

array([[3.29945651e-05, 1.31194794e-03, 2.41260670e-04, ...,
        9.85758675e-07, 3.64825723e-07, 3.64825723e-07],
       [1.13724534e-04, 1.08553912e-03, 9.24471649e-04, ...,
        8.75822025e-07, 8.76245963e-07, 8.76245963e-07],
       [8.34769526e-05, 5.16337877e-07, 6.42332743e-05, ...,
        1.67864425e-06, 3.42393264e-06, 3.42393264e-06],
       [3.48626927e-04, 1.81667809e-03, 1.08105596e-03, ...,
        5.58041961e-07, 6.70242457e-07, 6.70242457e-07]], dtype=float32)

In [26]:
# Show top words for a specific topic.
# I believe the number along with each word is the probability that a document would be part
# of that topic if it contained that word, but I would need to double check that in the docs.
model.show_topic(0, topn=20)

[('said', 0.015941896),
 ('china', 0.006209723),
 ('government', 0.0061553097),
 ('would', 0.005607035),
 ('state', 0.0051090037),
 ('trump', 0.0049358923),
 ('also', 0.0047460306),
 ('country', 0.004741364),
 ('president', 0.00473833),
 ('year', 0.004197876),
 ('u.s.', 0.0041286964),
 ('company', 0.0039096377),
 ('report', 0.003684636),
 ('biden', 0.0035926218),
 ('chinese', 0.0033445929),
 ('official', 0.0031176542),
 ('right', 0.0030737636),
 ('time', 0.002836811),
 ('people', 0.0027904343),
 ('united', 0.002661332)]

In [27]:
# Show each topic and it's coherence score. This will be useful for evaluating the model.
model.top_topics(corpus)

[([(0.015775748, 'said'),
   (0.009961982, 'people'),
   (0.008805061, 'covid-19'),
   (0.008625574, 'health'),
   (0.007893125, 'vaccine'),
   (0.00754997, 'coronavirus'),
   (0.0065441034, 'case'),
   (0.0061498303, 'country'),
   (0.005538134, 'pandemic'),
   (0.0054097585, 'virus'),
   (0.004563001, 'government'),
   (0.0040496625, 'first'),
   (0.0039926535, 'death'),
   (0.003987456, 'also'),
   (0.0038391184, 'would'),
   (0.0038302927, 'week'),
   (0.0034949782, 'number'),
   (0.0033850733, 'time'),
   (0.00330971, 'home'),
   (0.0032336428, 'public')],
  -0.8262141936458304),
 ([(0.015941896, 'said'),
   (0.006209723, 'china'),
   (0.0061553097, 'government'),
   (0.005607035, 'would'),
   (0.0051090037, 'state'),
   (0.0049358923, 'trump'),
   (0.0047460306, 'also'),
   (0.004741364, 'country'),
   (0.00473833, 'president'),
   (0.004197876, 'year'),
   (0.0041286964, 'u.s.'),
   (0.0039096377, 'company'),
   (0.003684636, 'report'),
   (0.0035926218, 'biden'),
   (0.00334459

### Model evaluation

In [28]:
# create several LDA models with different k values
model1 = LdaMulticore(corpus, num_topics=3, id2word=dictionary, passes=5, workers=2, chunksize=100)
model2 = LdaMulticore(corpus, num_topics=4, id2word=dictionary, passes=5, workers=2, chunksize=100)
model3 = LdaMulticore(corpus, num_topics=5, id2word=dictionary, passes=5, workers=2, chunksize=100)
model4 = LdaMulticore(corpus, num_topics=6, id2word=dictionary, passes=5, workers=2, chunksize=100)
model5 = LdaMulticore(corpus, num_topics=7, id2word=dictionary, passes=5, workers=2, chunksize=100)
model6 = LdaMulticore(corpus, num_topics=8, id2word=dictionary, passes=5, workers=2, chunksize=100)

In [29]:
# create coherence models to evaluate each LDA model
cm1 = CoherenceModel(model=model1, corpus=corpus, coherence='u_mass')
cm2 = CoherenceModel(model=model2, corpus=corpus, coherence='u_mass')
cm3 = CoherenceModel(model=model3, corpus=corpus, coherence='u_mass')
cm4 = CoherenceModel(model=model4, corpus=corpus, coherence='u_mass')
cm5 = CoherenceModel(model=model5, corpus=corpus, coherence='u_mass')
cm6 = CoherenceModel(model=model6, corpus=corpus, coherence='u_mass')

In [30]:
# show the coherence score for each model, the score closest to 0 is the best model
print(f'3 topics: {cm1.get_coherence()}')
print(f'4 topics: {cm2.get_coherence()}')
print(f'5 topics: {cm3.get_coherence()}')
print(f'6 topics: {cm4.get_coherence()}')
print(f'7 topics: {cm5.get_coherence()}')
print(f'8 topics: {cm6.get_coherence()}')

3 topics: -1.230026945660962
4 topics: -1.1466544784621742
5 topics: -1.3507580140997066
6 topics: -1.4155287392165115
7 topics: -2.9326326634684228
8 topics: -1.47439138870618
