# Topic Modeling
Topic modeling is necessary so that news articles can be grouped by topic. This will provide additional ways to filter through the articles within the app.

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
import json

In [2]:
# only need to run this once
# nltk.download('wordnet')

In [3]:
with open('../secrets.json') as file:
    secrets = json.load(file)
    connection_string = secrets['connection_string']
db = create_engine(connection_string)
df = pd.read_sql('SELECT * FROM NAP.article', con=db)

df

Unnamed: 0,id,post_id,post_title,url,score,publisher,headline,date_published,content
0,1,fra8wl,Top US general resists Trump administration?s ...,https://mondoweiss.net/2020/03/top-u-s-general...,10285,mondoweiss,Top U.S. general resists Trump administration?...,2020-03-28 15:44:00,A brave U.S. army lieutenant general may be ri...
1,2,frcvgj,Experts believe the explosion of coronavirus c...,https://www.si.com/soccer/2020/03/25/atalanta-...,2854,si,Atalanta vs Valencia linked to accelerating co...,2020-03-25 00:00:00,ROME (AP) ? It was the biggest soccer game in ...
2,3,fr5uqd,Boris Johnson's government is reportedly furio...,https://www.businessinsider.com/coronavirus-bo...,79397,businessinsider,Boris Johnson's government is reportedly furio...,2020-03-29 00:00:00,"UK government officials say there'll be ""recko..."
3,4,fr7uzc,Toyota Gearing Up To Build Ventilators And Fac...,https://www.carscoops.com/2020/03/toyota-geari...,4988,carscoops,Toyota Gearing Up To Build Ventilators And Fac...,2020-03-27 22:36:00,"The United States will soon have over 100,000 ..."
4,5,frbkqr,Prime Minister Justin Trudeau says Health Cana...,https://www.ctvnews.ca/health/coronavirus/trud...,2341,ctvnews,Trudeau vows 'no corners cut' in accepting mas...,2020-03-29 13:04:00,TORONTO -- Prime Minister Justin Trudeau says ...
...,...,...,...,...,...,...,...,...,...
5550,5567,m4cvt7,Far-Right Bolivia Coup Leader Jeanine Añez Arr...,https://www.commondreams.org/news/2021/03/13/f...,1791,commondreams,Far-Right Bolivia Coup Leader Jeanine Añez Arr...,2021-03-13 00:00:00,Far-right Bolivian politician Jeanine Añez was...
5551,5568,m4a8ri,Men in uniform slaughtered dozens of people in...,https://observers.france24.com/en/africa/20210...,1362,france24,Ethiopia: video of Tigray massacre lifts lid o...,2021-03-12 00:00:00,"Screengrab of the video, filmed in the village..."
5552,5569,m4ai8h,Western Australia Election 2021: Labor has sta...,https://www.news.com.au/national/western-austr...,622,news,WA Election 2021: Historic landslide win retur...,2021-03-13 15:31:00,Re-elected Western Australian Premier Mark McG...
5553,5570,m44l44,Indigenous People Not Invited to UN Biodiversi...,https://www.ecowatch.com/un-biodiversity-talks...,1362,ecowatch,Indigenous People Not Invited to UN Biodiversi...,2021-03-12 16:19:06,Aiming to preserve 30 percent of the world's l...


### Only using a couple article for testing

In [20]:
# article1 = df.iloc[-1]['content']
# article2 = df.iloc[-2]['content']
# articles = [article1, article2]
articles = [df.iloc[i]['content'] for i in range(len(articles))]

### get words from the article

In [5]:
# tokens = word_tokenize(article)
# text = nltk.Text(tokens)
# text

### Show the collacations
These are words that appeared consecutively in the text. More specifically, words that appear consecutively and not by chance, so they have meaning when put together.

In [6]:
# text.collocations()

### Tokenize, lemmatize, remove stopwords, stem and discard words fewer than 3 chars
- Tokenization involves splitting the article into words.
- Lemmatization is getting words into a standard form. Words in third person are changed to first person and verbs are converted to present tense.
    - ex: disapperances -> disappearance
- Stemming is reducing words to their root form. This also converts all words to lower case.
    - ex: disappearance -> disappear
- Stopwords are words like "the", "a", "an", etc.

In [21]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [22]:
def preprocess(article):
    tokens = word_tokenize(article)
    words = [] # words resulting from applying the filters

    for token in tokens:
        if len(token) > 3 and token not in stop_words:
            words.append(stemmer.stem(lemmatizer.lemmatize(token)))
    
    return words

In [23]:
preprocessed_articles = [preprocess(article) for article in articles]
preprocessed_articles

[['brave',
  'u.s.',
  'armi',
  'lieuten',
  'gener',
  'risk',
  'career',
  'resist',
  'trump',
  'administr',
  'effort',
  'provok',
  'iran',
  'robert',
  'white',
  'command',
  'american',
  'forc',
  'iraq',
  'last',
  'week',
  'wrote',
  'york',
  'time',
  'call',
  'blunt',
  'memo',
  'oppos',
  'plan',
  'secretari',
  'state',
  'mike',
  'pompeo',
  'other',
  'attack',
  'iranian-',
  'militia',
  'insid',
  'iraq',
  'most',
  'world',
  'concentr',
  'fight',
  'coronaviru',
  'pandem',
  'small',
  'group',
  'power',
  'take',
  'advantag',
  'huge',
  'distract',
  'instig',
  'iran',
  'appar',
  'hope',
  'logic',
  'conflict',
  'prompt',
  'regim',
  'chang',
  'neutral',
  'teheran',
  'influenc',
  'mideast',
  'thi',
  'warmong',
  'group',
  'includ',
  'pompeo',
  'robert',
  'brien',
  'u.s.',
  'nation',
  'secur',
  'advis',
  'isra',
  'includ',
  'benjamin',
  'netanyahu',
  'element',
  'israel',
  'lobbi',
  'insid',
  'u.s.',
  'saudi',
  'ara

### Create a dictionary of text and bag of words

In [24]:
dictionary = Dictionary(preprocessed_articles)
corpus = [dictionary.doc2bow(article) for article in preprocessed_articles]

# for each tuple in the corpus, the first element is the word index and the second element
# is the number of times it appears in the text
for c in corpus:
    for item in c:
        print(f'{item} -- {dictionary[item[0]]}')

(0, 1) -- 17th
(1, 1) -- 5000
(2, 2) -- administr
(3, 1) -- advantag
(4, 1) -- advis
(5, 1) -- alli
(6, 1) -- although
(7, 3) -- american
(8, 1) -- appar
(9, 1) -- approach
(10, 1) -- arabia
(11, 1) -- argu
(12, 1) -- armi
(13, 1) -- assassin
(14, 2) -- attack
(15, 1) -- benjamin
(16, 1) -- blunt
(17, 1) -- brave
(18, 1) -- brien
(19, 1) -- broader
(20, 1) -- call
(21, 1) -- campaign
(22, 1) -- career
(23, 1) -- chang
(24, 1) -- combat
(25, 1) -- command
(26, 1) -- concentr
(27, 1) -- conflict
(28, 1) -- coronaviru
(29, 1) -- critic
(30, 1) -- crown
(31, 1) -- dismiss
(32, 1) -- distract
(33, 1) -- divert
(34, 1) -- donald
(35, 1) -- effort
(36, 1) -- element
(37, 1) -- emerg
(38, 1) -- enough
(39, 1) -- enter
(40, 1) -- escal
(41, 1) -- expert
(42, 1) -- face
(43, 1) -- facto
(44, 1) -- fight
(45, 1) -- find
(46, 2) -- forc
(47, 1) -- futur
(48, 1) -- gen.
(49, 4) -- gener
(50, 1) -- genuin
(51, 3) -- group
(52, 1) -- hand
(53, 2) -- hezbollah
(54, 2) -- hope
(55, 1) -- huge
(56, 4) -

(1661, 1) -- nevertheless
(1662, 1) -- non-invas
(1663, 1) -- overnight
(1664, 1) -- oxygen
(1665, 1) -- parliament
(1666, 1) -- pilgrim
(1667, 1) -- plateau
(1668, 2) -- raab
(1669, 1) -- rose
(1670, 1) -- shrine
(1671, 1) -- souri
(1672, 1) -- suggest
(1673, 1) -- taskforc
(1674, 1) -- tehran
(1675, 2) -- toll
(1676, 1) -- trend
(1677, 1) -- unlik
(1678, 1) -- verifi
(1679, 1) -- worsen
(218, 1) -- block
(325, 1) -- happen
(372, 1) -- make
(497, 1) -- support
(498, 1) -- sure
(638, 1) -- review
(998, 1) -- servic
(1163, 1) -- polici
(1680, 1) -- browser
(1681, 2) -- cooki
(1682, 1) -- inform
(1683, 1) -- javascript
(1684, 1) -- load
(1685, 1) -- pleas
(1686, 1) -- term
(37, 2) -- emerg
(49, 1) -- gener
(73, 1) -- last
(77, 1) -- lieuten
(98, 1) -- pandem
(156, 1) -- week
(179, 1) -- accord
(246, 1) -- close
(248, 1) -- come
(281, 1) -- effect
(323, 2) -- govern
(364, 1) -- local
(366, 1) -- look
(405, 1) -- night
(410, 1) -- offer
(422, 1) -- peopl
(444, 1) -- provinc
(524, 2) -- tue

### Create the LDA model for topic modeling
This trains a model and creates however many topics are specified. It doesn't assign names to the topics, so these need to be inferred.

In [33]:
model = LdaMulticore(corpus, num_topics=8, id2word=dictionary, passes=100, workers=2)

In [34]:
for topic in model.print_topics():
    print(f'Topic: {topic[0]}')
    print(f'Words: {topic[1]}')
    print()

Topic: 0
Words: 0.019*"said" + 0.012*"peopl" + 0.010*"coronaviru" + 0.009*"govern" + 0.008*"turtl" + 0.008*"appli" + 0.008*"cerb" + 0.007*"applic" + 0.006*"claim" + 0.006*"work"

Topic: 1
Words: 0.040*"angela" + 0.040*"christofil" + 0.023*"street" + 0.012*"said" + 0.011*"lockdown" + 0.011*"camden" + 0.008*"shop" + 0.008*"high" + 0.008*"equip" + 0.007*"make"

Topic: 2
Words: 0.008*"cooki" + 0.004*"review" + 0.004*"javascript" + 0.004*"browser" + 0.004*"term" + 0.004*"inform" + 0.004*"load" + 0.004*"block" + 0.004*"pleas" + 0.004*"support"

Topic: 3
Words: 0.020*"ventil" + 0.019*"said" + 0.013*"need" + 0.010*"coronaviru" + 0.008*"trump" + 0.008*"week" + 0.008*"work" + 0.008*"brief" + 0.008*"hancock" + 0.007*"make"

Topic: 4
Words: 0.011*"peopl" + 0.010*"game" + 0.010*"china" + 0.010*"atalanta" + 0.010*"valencia" + 0.010*"bergamo" + 0.009*"match" + 0.008*"case" + 0.008*"posit" + 0.008*"johnson"

Topic: 5
Words: 0.010*"said" + 0.009*"time" + 0.009*"taiwan" + 0.009*"iraq" + 0.008*"network" 