# Load a trained LDA model and use it to predict an article's topic

In [8]:
import pandas as pd
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
import json

In [9]:
# Read the database connection string from the secrets file one directory up.
# The encoding is given explicitly so that reading the JSON does not depend on
# the platform's default locale encoding.
with open('../secrets.json', encoding='utf-8') as file:
    secrets = json.load(file)

# Only the read needs the open handle; the lookup happens after it is closed.
connection_string = secrets['connection_string']

# Load every row of the news_article table into a DataFrame.
db = create_engine(connection_string)
df = pd.read_sql('select * from news_article', con=db)

### Preprocessing articles to remove stopwords and lemmatize

In [16]:
# A set gives constant-time membership tests. preprocess() checks every token
# against this collection, and a list would be scanned linearly on each check.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [17]:
def preprocess(article):
    """Turn raw article text into a list of lemmatized words.

    The text is lower-cased and tokenized. Tokens of three characters or
    fewer, and tokens present in ``stop_words``, are discarded; every
    remaining token is lemmatized. The length check is applied to the token
    before lemmatization, so a returned lemma can be shorter than four
    characters.

    Args:
        article: The article text as a string.

    Returns:
        The lemmas of the surviving tokens, in their original order
        (duplicates are kept).
    """
    lowered = article.lower()
    return [
        lemmatizer.lemmatize(tok)
        for tok in word_tokenize(lowered)
        if len(tok) > 3 and tok not in stop_words
    ]

### Load the model

In [18]:
# Restore the previously trained LDA model from disk; the path is relative to
# the notebook's working directory.
model = LdaMulticore.load('models/news_lda_model')

### Use the model to make a prediction

In [19]:
# Pick one article by position (the 85th row) as a sample to classify.
# NOTE(review): the query that fills df has no ORDER BY, so which article sits
# at position 84 may vary between runs — confirm this is acceptable.
test_article = df.iloc[84]['content']

In [20]:
# Run the sample article through preprocess() and display the resulting
# list of tokens.
preprocessed = preprocess(test_article)
preprocessed

['read',
 'time',
 'larger',
 'part',
 'amazon',
 'rainforest',
 'risk',
 'crossing',
 'tipping',
 'point',
 'could',
 'become',
 'savanna-type',
 'ecosystem',
 'previously',
 'thought',
 'according',
 'research',
 'research',
 'based',
 'computer',
 'model',
 'data',
 'analysis',
 'published',
 'journal',
 'nature',
 'communication',
 'rainforest',
 'sensitive',
 'change',
 'affect',
 'rainfall',
 'extended',
 'period',
 'rainfall',
 'drop',
 'certain',
 'threshold',
 'area',
 'shift',
 'savanna',
 'state',
 'around',
 'percent',
 'amazon',
 'rainfall',
 'level',
 'forest',
 'could',
 'exist',
 'either',
 'state',
 'rainforest',
 'savanna',
 'according',
 'finding',
 'say',
 'lead',
 'author',
 'arie',
 'staal',
 'formerly',
 'postdoctoral',
 'researcher',
 'stockholm',
 'resilience',
 'centre',
 'copernicus',
 'institute',
 'utrecht',
 'university',
 'conclusion',
 'concerning',
 'part',
 'amazon',
 'region',
 'currently',
 'receiving',
 'le',
 'rain',
 'previously',
 'trend',
 'expe

In [21]:
# Create a bag-of-words for the preprocessed article, using the id-to-word
# mapping stored on the loaded model (model.id2word).
bow = model.id2word.doc2bow(preprocessed)

In [22]:
# Make the prediction: indexing the model with the bag-of-words yields
# (topic_id, probability) pairs, as the displayed output shows.
pred = model[bow]
pred

[(0, 0.042757567), (1, 0.3867029), (3, 0.56937546)]

In [23]:
# Find the topic with the best match.
# Each entry of pred is a (topic_id, probability) pair. max() keyed on the
# probability replaces the hand-written scan; on ties it returns the first
# maximal entry, which matches the strict ``>`` comparison it replaces.
# An empty pred raises ValueError.
predicted_topic, best_match = max(pred, key=lambda pair: pair[1])

In [24]:
# Display the id of the best-matching topic.
predicted_topic

3