# Topic Modelling for News

![](https://images.unsplash.com/photo-1495020689067-958852a7765e?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1050&q=80)

Photo by [Roman Kraft](https://unsplash.com/photos/_Zua2hyvTBk)

This exercise is about modelling the main topics of a database of News headlines.

Begin by importing the needed libraries:

In [1]:
# TODO: import needed libraries
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim as gs
import string

Load the data in the file `random_headlines.csv`

In [2]:
# TODO: load the dataset
df = pd.read_csv('random_headlines.csv')
df.head()

Unnamed: 0,publish_date,headline_text
0,20120305,ute driver hurt in intersection crash
1,20081128,6yo dies in cycling accident
2,20090325,bumper olive harvest expected
3,20100201,replica replaces northernmost sign
4,20080225,woods targets perfect season


This is always a good idea to perform some EDA (exploratory data analytics) on a dataset...

In [3]:
# TODO: Perform a short EDA
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   publish_date   20000 non-null  int64 
 1   headline_text  20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


Now perform all the needed preprocessing on those headlines: case lowering, tokenization, punctuation removal, stopwords removal, stemming/lemmatization.

In [4]:
# TODO: Preprocess the input data
pf = df
pf['headline_text'] = [x.lower() for x in pf['headline_text']]

tokens = []
for e,s in enumerate(pf['headline_text']):
    token = nltk.word_tokenize(s)
    s = s.translate(str.maketrans('', '', string.punctuation))
    token = nltk.word_tokenize(s)
    token = [x for x in token if x not in nltk.corpus.stopwords.words('english')]
    lemma = nltk.stem.WordNetLemmatizer()
    token = [lemma.lemmatize(x) for x in token]
    tokens.append(token)
pf['headline_text'] = tokens
df['headline_text'] = pf['headline_text']
df['headline_text'].head()

0    [ute, driver, hurt, intersection, crash]
1                [6yo, dy, cycling, accident]
2          [bumper, olive, harvest, expected]
3     [replica, replaces, northernmost, sign]
4             [wood, target, perfect, season]
Name: headline_text, dtype: object

Now use Gensim to compute a BOW

In [9]:
# TODO: Compute the BOW using Gensim
dictionary = gs.corpora.Dictionary()
BOW = [dictionary.doc2bow(x) for x in df['headline_text']]
BOW

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(5, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1), (14, 1), (15, 1), (16, 1)],
 [(17, 1), (18, 1), (19, 1), (20, 1)],
 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1), (28, 1), (29, 1), (30, 1)],
 [(31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1)],
 [(37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)],
 [(16, 1), (43, 1), (44, 1), (45, 1), (46, 1)],
 [(47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1)],
 [(53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1)],
 [(30, 1), (59, 1), (60, 1), (61, 1), (62, 1)],
 [(63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1)],
 [(69, 1), (70, 1), (71, 1), (72, 1)],
 [(73, 1), (74, 1), (75, 1), (76, 1), (77, 1)],
 [(78, 1), (79, 1), (80, 1), (81, 1), (82, 1)],
 [(83, 1), (84, 1), (85, 1)],
 [(86, 1), (87, 1), (88, 1), (89, 1)],
 [(90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1)],
 [(96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1)],
 [(102, 1

Compute the TF-IDF using Gensim

In [17]:
# TODO: Compute TF-IDF
#for doc in BOW:
   #print([[dictionary[id], freq] for id, freq in doc])
tfidf = gs.models.TfidfModel(BOW)
for doc in tfidf[BOW]:
   print([[dictionary[id], np.around(freq,decimals=2)] for id, freq in doc])

[['crash', 0.31], ['driver', 0.35], ['hurt', 0.43], ['intersection', 0.6], ['ute', 0.49]]
[['6yo', 0.69], ['accident', 0.4], ['cycling', 0.49], ['dy', 0.35]]
[['bumper', 0.59], ['expected', 0.4], ['harvest', 0.44], ['olive', 0.54]]
[['northernmost', 0.59], ['replaces', 0.49], ['replica', 0.55], ['sign', 0.33]]
[['perfect', 0.59], ['season', 0.43], ['target', 0.45], ['wood', 0.51]]
[['adelaide', 0.3], ['dramatic', 0.48], ['draw', 0.36], ['leckie', 0.57], ['salvage', 0.48]]
[['future', 0.39], ['gauge', 0.66], ['group', 0.34], ['rail', 0.4], ['service', 0.36]]
[['ahead', 0.34], ['anti', 0.37], ['going', 0.47], ['hunting', 0.5], ['rally', 0.4], ['still', 0.33]]
[['aid', 0.35], ['congo', 0.49], ['dr', 0.42], ['first', 0.31], ['receive', 0.45], ['refugee', 0.4]]
[['sign', 0.38], ['agreement', 0.43], ['muslim', 0.5], ['rebel', 0.44], ['thailand', 0.48]]
[['application', 0.42], ['centre', 0.3], ['dubbo', 0.43], ['lodged', 0.51], ['shopping', 0.43], ['west', 0.31]]
[['action', 0.36], ['industri

[['assault', 0.33], ['accused', 0.3], ['man', 0.22], ['abduction', 0.45], ['rape', 0.38], ['golden', 0.46], ['square', 0.45]]
[['cycling', 0.41], ['liberal', 0.34], ['law', 0.3], ['wa', 0.27], ['one', 0.31], ['support', 0.3], ['proposed', 0.39], ['metre', 0.46]]
[['ta', 0.32], ['govt', 0.23], ['two', 0.28], ['system', 0.35], ['reject', 0.29], ['uni', 0.37], ['education', 0.34], ['tiered', 0.55]]
[['council', 0.54], ['merger', 0.84]]
[['plan', 0.31], ['stand', 0.43], ['merger', 0.47], ['loom', 0.52], ['coalition', 0.48]]
[['pool', 0.51], ['win', 0.31], ['gold', 0.37], ['freney', 0.71]]
[['say', 0.2], ['dog', 0.29], ['steve', 0.39], ['laurie', 0.44], ['microchipping', 0.51], ['rspcas', 0.51]]
[['tour', 0.44], ['pull', 0.52], ['cancellara', 0.73]]
[['abc', 0.25], ['talk', 0.24], ['bos', 0.29], ['reporter', 0.33], ['antso', 0.48], ['switkowski', 0.48], ['ziggy', 0.48]]
[['win', 0.25], ['bull', 0.38], ['grind', 0.5], ['fifth', 0.46], ['successive', 0.58]]
[['service', 0.33], ['fly', 0.45], 

[['infrastructure', 0.43], ['critical', 0.42], ['desalination', 0.53], ['plant', 0.38], ['declared', 0.46]]
[['law', 0.41], ['call', 0.33], ['official', 0.49], ['marine', 0.54], ['safety', 0.43]]
[['street', 0.34], ['cake', 0.46], ['bath', 0.46], ['malanda', 0.51], ['mud', 0.46]]
[['may', 0.29], ['flight', 0.37], ['qantas', 0.37], ['fear', 0.3], ['cut', 0.29], ['jetstar', 0.46], ['arrival', 0.51]]
[['export', 0.52], ['cattle', 0.53], ['gulf', 0.67]]
[['fire', 0.26], ['plastic', 0.47], ['unveils', 0.45], ['proof', 0.53], ['csiro', 0.48]]
[['five', 0.38], ['test', 0.32], ['day', 0.3], ['live', 0.38], ['ash', 0.4], ['third', 0.38], ['blog', 0.47]]
[['team', 0.37], ['loses', 0.39], ['tour', 0.35], ['mate', 0.51], ['cadel', 0.57]]
[['police', 0.18], ['qld', 0.24], ['hay', 0.44], ['fraud', 0.36], ['head', 0.3], ['investigated', 0.39], ['squad', 0.35], ['brian', 0.47]]
[['fight', 0.34], ['better', 0.38], ['sea', 0.39], ['south', 0.31], ['recognition', 0.48], ['islander', 0.51]]
[['man', 0.21]

[['escape', 0.35], ['close', 0.31], ['reef', 0.37], ['kimberley', 0.4], ['coral', 0.45], ['bleaching', 0.53]]
[['pool', 0.41], ['honour', 0.36], ['oval', 0.46], ['change', 0.27], ['name', 0.35], ['paraburdoo', 0.54]]
[['say', 0.26], ['trial', 0.34], ['health', 0.32], ['minister', 0.33], ['sa', 0.33], ['hill', 0.38], ['john', 0.41], ['program', 0.43]]
[['u', 0.25], ['drum', 0.37], ['investment', 0.37], ['leaf', 0.36], ['bracks', 0.44], ['biotech', 0.58]]
[['driving', 0.32], ['prompt', 0.3], ['penalty', 0.33], ['rd', 0.39], ['tougher', 0.36], ['reckless', 0.42], ['kapunda', 0.49]]
[['bali', 0.39], ['bomber', 0.4], ['testifies', 0.55], ['patek', 0.62]]
[['nsw', 0.29], ['stoush', 0.5], ['called', 0.46], ['leadership', 0.42], ['ceasefire', 0.52]]
[['death', 0.32], ['road', 0.36], ['rule', 0.42], ['inquest', 0.44], ['roller', 0.64]]
[['abuse', 0.34], ['offer', 0.33], ['compensation', 0.39], ['ups', 0.47], ['church', 0.39], ['anglican', 0.49]]
[['charge', 0.36], ['girl', 0.43], ['murder', 0.3

[['home', 0.35], ['body', 0.41], ['saddam', 0.53], ['flown', 0.66]]
[['cyclone', 0.31], ['alert', 0.35], ['red', 0.3], ['approach', 0.4], ['olwyn', 0.52], ['onslow', 0.52]]
[['share', 0.38], ['brisbane', 0.4], ['roar', 0.5], ['point', 0.46], ['united', 0.48]]
[['bush', 0.31], ['told', 0.3], ['roxon', 0.43], ['needed', 0.35], ['medico', 0.47], ['14000', 0.53]]
[['man', 0.24], ['charged', 0.31], ['woman', 0.3], ['head', 0.37], ['hammer', 0.52], ['hitting', 0.59]]
[['future', 0.36], ['labor', 0.34], ['govt', 0.27], ['fund', 0.35], ['campaign', 0.39], ['running', 0.45], ['scare', 0.46]]
[['election', 0.23], ['september', 0.31], ['change', 0.21], ['name', 0.27], ['greater', 0.35], ['27on', 0.45], ['hold27', 0.45], ['taree', 0.45]]
[['concern', 0.32], ['plan', 0.25], ['raised', 0.45], ['grave', 0.51], ['yarraville', 0.61]]
[['budget', 0.41], ['business', 0.39], ['small', 0.55], ['joy', 0.61]]
[['interview', 0.28], ['nrl', 0.42], ['bruno', 0.63], ['cullen', 0.58]]
[['u', 0.32], ['boost', 0.36

Finally compute the **LSA** (also called LSI) using Gensim, for a given number of Topics that you choose yourself

In [21]:
# TODO: Compute LSA
LSI = gs.models.LsiModel(tfidf[BOW],id2word=dictionary, num_topics=5)
LSI.print_topics()

[(0,
  '0.472*"man" + 0.423*"police" + 0.214*"charged" + 0.162*"court" + 0.131*"murder" + 0.121*"face" + 0.120*"missing" + 0.119*"crash" + 0.116*"new" + 0.108*"woman"'),
 (1,
  '-0.503*"second" + -0.478*"90" + -0.371*"abc" + -0.327*"news" + -0.327*"weather" + -0.240*"business" + -0.211*"sport" + 0.127*"man" + 0.090*"police" + -0.067*"market"'),
 (2,
  '-0.480*"man" + -0.254*"charged" + 0.229*"council" + 0.218*"new" + 0.205*"govt" + 0.199*"plan" + 0.135*"say" + 0.129*"call" + 0.121*"water" + -0.106*"murder"'),
 (3,
  '0.767*"police" + -0.292*"man" + 0.160*"probe" + -0.144*"charged" + -0.141*"court" + -0.140*"council" + 0.128*"investigate" + -0.115*"new" + -0.107*"plan" + -0.099*"face"'),
 (4,
  '-0.733*"abc" + 0.428*"second" + 0.380*"90" + -0.172*"sport" + -0.127*"entertainment" + -0.118*"business" + -0.118*"market" + -0.094*"weather" + -0.080*"analysis" + -0.050*"news"')]

For each of the topic, show the most significant words.

In [34]:
# TODO: Print the 3 or 4 most significant words of each topic
for x in LSI.print_topics():
    print(LSI.show_topic(x[0],topn=3))

[('man', 0.47157834362114), ('police', 0.4227569534964949), ('charged', 0.21412090104543233)]
[('second', -0.502544916289077), ('90', -0.47831193497680313), ('abc', -0.37082619602933314)]
[('man', -0.4801926654443056), ('charged', -0.2535087274155703), ('council', 0.22907086344131763)]
[('police', 0.766591421113027), ('man', -0.2923649236532191), ('probe', 0.15974312046965392)]
[('abc', -0.7326781288283416), ('second', 0.4281414587975297), ('90', 0.3797066647622566)]


What do you think about those results?

Now let's try to use LDA instead of LSA using Gensim

In [36]:
# TODO: Compute LDA
LDA = gs.models.LdaModel(tfidf[BOW],id2word=dictionary, num_topics=5, passes = 10)
LDA.print_topics()

[(0,
  '0.006*"abc" + 0.005*"news" + 0.004*"rural" + 0.003*"fire" + 0.003*"interview" + 0.003*"sport" + 0.003*"police" + 0.003*"stand" + 0.003*"weather" + 0.002*"election"'),
 (1,
  '0.006*"man" + 0.004*"charged" + 0.004*"court" + 0.004*"murder" + 0.003*"police" + 0.003*"woman" + 0.003*"group" + 0.003*"teen" + 0.003*"missing" + 0.003*"face"'),
 (2,
  '0.005*"second" + 0.003*"90" + 0.003*"plan" + 0.003*"police" + 0.003*"council" + 0.002*"job" + 0.002*"government" + 0.002*"call" + 0.002*"aust" + 0.002*"new"'),
 (3,
  '0.003*"rise" + 0.003*"rate" + 0.003*"council" + 0.003*"sale" + 0.003*"mine" + 0.003*"new" + 0.003*"price" + 0.003*"market" + 0.002*"boost" + 0.002*"day"'),
 (4,
  '0.006*"crash" + 0.004*"man" + 0.003*"police" + 0.003*"assault" + 0.003*"interview" + 0.003*"guilty" + 0.003*"injured" + 0.003*"killed" + 0.003*"final" + 0.003*"time"')]

In [37]:
# TODO: print the most frequent words of each topic
for x in LDA.print_topics():
    print(LDA.show_topic(x[0],topn=3))

[('abc', 0.0055296086), ('news', 0.00472533), ('rural', 0.0037029174)]
[('man', 0.0056166234), ('charged', 0.0039644665), ('court', 0.0038078208)]
[('second', 0.0050244583), ('90', 0.0033741007), ('plan', 0.003218034)]
[('rise', 0.0031150908), ('rate', 0.003046965), ('council', 0.002942241)]
[('crash', 0.0061222254), ('man', 0.0043205745), ('police', 0.0032582765)]


Now, how does it work with LDA?

Let's make some visualization of the LDA results using pyLDAvis.

In [45]:
# TODO: show visualization results of the LDA
import pyLDAvis.gensim_models as genvis
#import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = genvis.prepare(topic_model=LDA, corpus=BOW, dictionary=dictionary)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


Depending on your results, you can try to fine tune the algorithm: number of topics, hyperparameters...
And check with others their results.