# Load a trained model and use it to predict an article's topic

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
import json

In [2]:
# Read the database connection string from the secrets file one directory up.
with open('../secrets.json') as secrets_file:
    secrets = json.load(secrets_file)
connection_string = secrets['connection_string']

# Load every row of the news_article table into a DataFrame.
db = create_engine(connection_string)
df = pd.read_sql('select * from news_article', con=db)

### Preprocessing articles to remove stopwords and lemmatize

In [3]:
# English stopwords, stored as a set because preprocess() tests membership
# once per token; a set lookup is O(1) whereas scanning the list is O(n).
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

In [4]:
def preprocess(article):
    """Turn raw article text into a list of lemmatized tokens.

    The text is lower-cased and tokenized. Tokens of three characters or
    fewer, and tokens found in ``stop_words``, are dropped; every remaining
    token is lemmatized.

    Args:
        article: The article text as a string.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    lowered = article.lower()
    return [
        lemmatizer.lemmatize(tok)
        for tok in word_tokenize(lowered)
        if len(tok) > 3 and tok not in stop_words
    ]

### Load the model

In [6]:
# Location of the saved LDA model (relative path).
model_path = 'models/lda/news_lda_model'
model = LdaMulticore.load(model_path)

### Use the model to make a prediction

In [7]:
# Take the text of the article at row position 84 as the sample to classify.
test_article = df['content'].iloc[84]

In [8]:
# Apply the same filtering/lemmatization defined in preprocess() to the sample.
preprocessed = preprocess(test_article)
preprocessed  # bare expression so the notebook displays the token list

['resident',
 'wearing',
 'mask',
 'coronavirus',
 'collect',
 'food',
 'supply',
 'wuhan',
 'central',
 'china',
 'hubei',
 'province',
 'monday',
 'april',
 '2020.',
 'people',
 'coronavirus',
 'cause',
 'mild',
 'moderate',
 'symptom',
 'cause',
 'severe',
 'illness',
 'photo/ng',
 'guan',
 'resident',
 'wearing',
 'mask',
 'coronavirus',
 'collect',
 'food',
 'supply',
 'wuhan',
 'central',
 'china',
 'hubei',
 'province',
 'monday',
 'april',
 '2020.',
 'people',
 'coronavirus',
 'cause',
 'mild',
 'moderate',
 'symptom',
 'cause',
 'severe',
 'illness',
 'photo/ng',
 'guan',
 'associated',
 'press',
 'bangkok',
 'singapore',
 'reported',
 'biggest',
 'daily',
 'jump',
 'coronavirus',
 'infection',
 'linked',
 'foreign',
 'worker',
 'living',
 'crowded',
 'dormitory',
 'foreigner',
 'account',
 'third',
 'singapore',
 'workforce',
 'many',
 'people',
 'poorer',
 'asian',
 'country',
 'working',
 'construction',
 'shipping',
 'maintenance',
 'job',
 'support',
 'singapore',
 'trade

In [9]:
# Build the bag-of-words for the preprocessed article using the dictionary
# stored on the loaded model.
vocabulary = model.id2word
bow = vocabulary.doc2bow(preprocessed)

In [10]:
# make the prediction: indexing the model with a bag of words yields
# (topic id, probability) pairs for that document
pred = model[bow]
pred  # bare expression so the notebook displays the pairs

[(0, 0.8745189), (2, 0.11079805), (3, 0.014026196)]

In [11]:
# find the topic with the best match: take the (topic id, probability) pair
# with the highest probability. max() returns the first maximal pair, so ties
# resolve to the earliest topic, exactly as a strict ">" scan would.
predicted_topic, best_match = max(pred, key=lambda topic_prob: topic_prob[1])

In [12]:
# Display the id of the best-matching topic.
predicted_topic

0

In [13]:
# List each topic's highest-weighted words so the predicted topic id can be
# interpreted.
model.print_topics()

[(0,
  '0.009*"people" + 0.008*"covid-19" + 0.008*"health" + 0.007*"coronavirus" + 0.006*"vaccine" + 0.006*"country" + 0.006*"case" + 0.005*"pandemic" + 0.005*"government" + 0.005*"would"'),
 (1,
  '0.010*"police" + 0.007*"people" + 0.005*"protest" + 0.005*"woman" + 0.004*"navalny" + 0.004*"2020" + 0.004*"protester" + 0.004*"news" + 0.003*"right" + 0.003*"officer"'),
 (2,
  '0.008*"china" + 0.006*"government" + 0.006*"trump" + 0.006*"state" + 0.005*"would" + 0.005*"president" + 0.005*"also" + 0.005*"u.s." + 0.005*"biden" + 0.004*"country"'),
 (3,
  '0.008*"year" + 0.007*"climate" + 0.005*"world" + 0.004*"change" + 0.004*"also" + 0.003*"area" + 0.003*"water" + 0.003*"scientist" + 0.003*"found" + 0.003*"animal"')]