# Latent Dirichlet Allocation (LDA)

In [1]:
import os
import re
import json
import gensim
import pyLDAvis
import random

import numpy as np
import pandas as pd

from gensim.models.ldamulticore import LdaModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.model_selection import train_test_split
from gensim.models.coherencemodel import CoherenceModel

import lib.helper as helper

from nltk.stem.porter import *

# Define which stemmer to use in the pipeline later
stemmer = PorterStemmer()

import seaborn as sns
import pyLDAvis.gensim as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict
  from collections import Counter, Iterable


## 1.  Retrieve Corpus

The corpus is being scraped by the "run_news_scrapes.py" script (and windows task scheduler) every 12 hours, a bit past midday and a bit past midnight.

The "bing" corpus are news titles and text extracts gotten from the bing news search API, using a few Home Office - related keywords.

The "disaster" corpus works likewise, but with keywords relating to natural disasters

The "RSS" corpus is plugged directly into a number of RSS feeds for world news sites and local british news sites, with no filters for news story types or subjects applied.

In [14]:
# Should be same path for all my PC's, it's where each scrape goes as a separate json file.
storage_path = "D:/Dropbox/news_crow/scrape_results"

# "bing" is targeted news search corpus, "RSS" is from specific world and local news feeds.
corpus_type = "RSS"

# Load up
corpus = helper.load_clean_corpus(storage_path, corpus_type)

# Make sure after cleaning etc it's indexed from 0
corpus.reset_index(inplace=True)
corpus.index.name = "node"

# See how it turned out
print(corpus.shape)
corpus.head()

Total files: 459
9.8 percent of files read.
19.6 percent of files read.
29.4 percent of files read.
39.2 percent of files read.
49.0 percent of files read.
58.8 percent of files read.
68.6 percent of files read.
78.4 percent of files read.
88.2 percent of files read.
98.0 percent of files read.
(113120, 9)


Unnamed: 0_level_0,index,title,summary,date,link,source_url,retrieval_timestamp,origin,clean_text
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,Hurricane Dorian lashes US as Bahamas counts cost,"Life-threatening US storm surges are feared, a...","Thu, 05 Sep 2019 16:03:44 GMT",https://www.bbc.co.uk/news/world-us-canada-495...,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Hurricane Dorian lashes US as Bahamas counts c...
1,1,Kohistan video murders: Three guilty in 'honou...,They are relatives of a group of Pakistani wom...,"Thu, 05 Sep 2019 13:53:17 GMT",https://www.bbc.co.uk/news/world-asia-49592540,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Kohistan video murders: Three guilty in 'honou...
2,2,MH17 Ukraine plane crash: 'Key witness' released,A Ukrainian court releases a potentially key w...,"Thu, 05 Sep 2019 13:46:06 GMT",https://www.bbc.co.uk/news/world-europe-49591148,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,MH17 Ukraine plane crash: 'Key witness' releas...
3,3,Article 370: The weddings 'ruined' by Kashmir'...,Indian-administered Kashmir is under a securit...,"Thu, 05 Sep 2019 07:32:34 GMT",https://www.bbc.co.uk/news/world-asia-india-49...,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Article 70: The weddings 'ruined' by Kashmir's...
4,4,Syria war: Turkey warns Europe of new migrant ...,President Erdogan demands international help t...,"Thu, 05 Sep 2019 16:11:48 GMT",https://www.bbc.co.uk/news/world-europe-49599297,http://feeds.bbci.co.uk/news/world/rss.xml,2019-09-05 21:35:06.925873,rss_feed,Syria war: Turkey warns Europe of new migrant ...


## 2.  Additional preprocessing for LDA

### TODO:  Dump the stuff below into another "embedding model" in lib

In [15]:
# A quick utility function to pre-process the text
def preprocess_desc(description):
    return( [stemmer.stem(token) for token in simple_preprocess(str(description)) if token not in STOPWORDS] )

corpus["tokens"] = corpus["clean_text"].apply(preprocess_desc)

corpus.tail()

Unnamed: 0_level_0,index,title,summary,date,link,source_url,retrieval_timestamp,origin,clean_text,tokens
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
113115,1241258,Lincolnshire zoos told they can reopen - but i...,Zoos and safari parks in England can reopen fr...,"Wed, 10 Jun 2020 14:15:58 +0000",https://www.lincolnshirelive.co.uk/news/local-...,http://lincolnshirelive.co.uk/news/?service=rss,2020-06-10 21:33:35.747199,rss_feed,Lincolnshire zoos told they can reopen - but i...,"[lincolnshir, zoo, told, reopen, won, easi, zo..."
113116,1241260,What the shops are doing to get ready for reop...,It's a whole new world,"Wed, 10 Jun 2020 12:45:27 +0000",https://www.lincolnshirelive.co.uk/news/local-...,http://lincolnshirelive.co.uk/news/?service=rss,2020-06-10 21:33:35.747199,rss_feed,What the shops are doing to get ready for reop...,"[shop, readi, reopen, june, new, world]"
113117,1241261,Employees at firm with branches across Lincoln...,The business says it is 'in the process of mak...,"Wed, 10 Jun 2020 12:29:19 +0000",https://www.lincolnshirelive.co.uk/news/local-...,http://lincolnshirelive.co.uk/news/?service=rss,2020-06-10 21:33:35.747199,rss_feed,Employees at firm with branches across Lincoln...,"[employe, firm, branch, lincolnshir, face, red..."
113118,1241263,Woman heartbroken as brother with ‘huge heart’...,'He was on his own for two days before he died...,"Wed, 10 Jun 2020 11:56:09 +0000",https://www.lincolnshirelive.co.uk/news/local-...,http://lincolnshirelive.co.uk/news/?service=rss,2020-06-10 21:33:35.747199,rss_feed,Woman heartbroken as brother with ‘huge heart’...,"[woman, heartbroken, brother, huge, heart, die..."
113119,1241264,Boy left with life-threatening injuries after ...,Emergency services rushed to the scene.,"Wed, 10 Jun 2020 10:00:12 +0000",https://www.lincolnshirelive.co.uk/news/local-...,http://lincolnshirelive.co.uk/news/?service=rss,2020-06-10 21:33:35.747199,rss_feed,Boy left with life-threatening injuries after ...,"[boy, left, life, threaten, injuri, fall, roof..."


In [16]:
# Create the vocabulary record
dictionary = gensim.corpora.Dictionary(corpus['tokens'])

# Remove extreme values (words that are too rare, too common)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

# Create a BOW model
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus['tokens']]

# From that create the TF-IDF model
# THIS IS ANOTHER POINT THE CORPUS ORDERING COULD DETATCH FROM THE RAW DATA ORDERING
tfidf = gensim.models.TfidfModel(bow_corpus)
corpus['corpus_tfidf'] = tfidf[bow_corpus]

corpus['corpus_tfidf'].head()

node
0    [(0, 0.5844165488435726), (1, 0.18465724404048...
1    [(15, 0.26499582841098673), (16, 0.33914476848...
2    [(29, 0.18823052848058422), (30, 0.12822580194...
3    [(27, 0.2186338723464504), (42, 0.320511124676...
4    [(53, 0.21566407963033624), (54, 0.19632019679...
Name: corpus_tfidf, dtype: object

## 3. Testing a range of different-sized LDA models

In [17]:
random.seed(7)
trainset, testset = train_test_split(corpus, test_size=0.15)

In [None]:
# Loop through a number of different topic model sizes

results = pd.DataFrame()
for num_topics in range(3, 101):

    # Fit the lda model, with [num_topics] topics
    lda_model_tfidf = LdaModel(trainset['corpus_tfidf'],
                               num_topics=num_topics,
                               id2word=dictionary,
                               passes=2)
    
    # Get the perplexity
    perplexity = lda_model_tfidf.log_perplexity(testset['corpus_tfidf'])
    
    # Get the coherence
    cm = CoherenceModel(model=lda_model_tfidf, corpus=testset['corpus_tfidf'], coherence='u_mass')
    coherence = cm.get_coherence()
    
    # record
    results = results.append({"topics":num_topics, "perplexity":perplexity, "coherence":coherence}, ignore_index=True)
    
    # Report for my convenience
    print("tried {} topics".format(num_topics), "perplexity = {}".format(perplexity), "coherence = {}".format(coherence))

results.to_csv("working/disaster_lda_stats_clean.csv")

tried 3 topics perplexity = -9.717070055431488 coherence = -3.275418169279624
tried 4 topics perplexity = -9.98696780137105 coherence = -3.918239309327092
tried 5 topics perplexity = -10.31632201701593 coherence = -5.641808833341678
tried 6 topics perplexity = -10.523428508610781 coherence = -6.711458481706693
tried 7 topics perplexity = -10.741742861559784 coherence = -7.543547893846557
tried 8 topics perplexity = -10.934949187999004 coherence = -7.410943799692644
tried 9 topics perplexity = -11.213029324094038 coherence = -8.890806943827421
tried 10 topics perplexity = -11.521388104323158 coherence = -8.74814174162655
tried 11 topics perplexity = -11.978111746344286 coherence = -8.807782415232436
tried 12 topics perplexity = -12.450271408993284 coherence = -9.209376643957297
tried 13 topics perplexity = -12.844158813579533 coherence = -8.74743741182825
tried 14 topics perplexity = -13.091062029935214 coherence = -9.495897328157296
tried 15 topics perplexity = -13.367202813604624 cohe

In [None]:
# plot the results - PERPLEXITY
sns.scatterplot(x="topics", y="perplexity", color="blue", data=results)

In [None]:
# plot the results - COHERENCE
sns.scatterplot(x="topics", y="coherence", color="red", data=results)

## 4. Create the "best" model using all data

And parameters decided by test-time performance on perplexity and coherence.

In [9]:
random.seed(10)

# Fit the final lda model to all data
lda_model_tfidf = LdaModel(corpus['corpus_tfidf'],
                           num_topics=76,
                           id2word=dictionary,
                           passes=2)

# Get the perplexity, out of curiosity
perplexity = lda_model_tfidf.log_perplexity(corpus['corpus_tfidf'])
    
# Get the coherence, out of curiosity
cm = CoherenceModel(model=lda_model_tfidf, corpus=corpus['corpus_tfidf'], coherence='u_mass')
coherence = cm.get_coherence()

print("perplexity = {}".format(perplexity), "coherence = {}".format(coherence))

lda_model_tfidf.save("working/lda_model_tfidf.model")

perplexity = -47.24009145269341 coherence = -6.788528954290477


In [10]:
# Let's take a quick look at the topics picked out
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.119*"summer" + 0.097*"hope" + 0.073*"suffer" + 0.070*"leav" + 0.049*"soon" + 0.038*"return" + 0.038*"properti" + 0.032*"enjoy" + 0.030*"sunday" + 0.030*"usual"
Topic: 1 Word: 0.151*"measur" + 0.146*"staff" + 0.066*"launch" + 0.057*"taken" + 0.053*"spot" + 0.047*"spend" + 0.043*"critic" + 0.043*"review" + 0.032*"new" + 0.030*"ensur"
Topic: 2 Word: 0.066*"lead" + 0.048*"given" + 0.035*"read" + 0.032*"leed" + 0.028*"closur" + 0.024*"green" + 0.023*"urgent" + 0.023*"quot" + 0.022*"facebook" + 0.021*"difficult"
Topic: 3 Word: 0.075*"model" + 0.073*"greater" + 0.061*"previous" + 0.061*"king" + 0.053*"cambridg" + 0.048*"improv" + 0.046*"shown" + 0.042*"digit" + 0.042*"combin" + 0.031*"card"
Topic: 4 Word: 0.116*"wildfir" + 0.061*"firefight" + 0.058*"crew" + 0.056*"control" + 0.052*"firm" + 0.042*"save" + 0.036*"blaze" + 0.033*"servic" + 0.027*"thank" + 0.026*"energi"
Topic: 5 Word: 0.280*"arriv" + 0.143*"brit" + 0.066*"gener" + 0.054*"see" + 0.049*"uk" + 0.036*"come" + 0.032*

In [11]:
# Let's take a quick look at words the topics picked out
topic_word_dist = pd.DataFrame()

for idx, topic in lda_model_tfidf.print_topics(-1):
    
    # Record the topic index in a format R will like
    topic_index = "X" + str(idx)
    
    # Record the topic's key words as a single string
    # Split the words with a newline character while we're at it!
    topic_words = " ".join([x.split("*")[1].replace('"', '').strip() for x in topic.split("+")][0:5])
    
    # Dump them to the results dataframe
    topic_word_dist = topic_word_dist.append({"topic":topic_index, "keywords":topic_words}, ignore_index=True)
    
topic_word_dist

Unnamed: 0,keywords,topic
0,summer hope suffer leav soon,X0
1,measur staff launch taken spot,X1
2,lead given read leed closur,X2
3,model greater previous king cambridg,X3
4,wildfir firefight crew control firm,X4
...,...,...
71,suggest begin cross mid spell,X71
72,action requir alert huge agenc,X72
73,group chariti right mr natur,X73
74,turn dri februari region fall,X74


In [12]:
# Assign topic to document, also store the probability of that topic
doc_topics = []

count = 0
for index, row in corpus.iterrows():
    
    # Extract list of tuples of (topic, score) from the model for each doc
    topics = [x for x in lda_model_tfidf.get_document_topics(row['corpus_tfidf']) ]
    
    # Find the highest probability topic
    highest_scoring = sorted(topics, key=lambda x: x[1], reverse=True)[0]
    
    temp = {"node": int(index),
            "cluster": highest_scoring[0],
            "score": highest_scoring[1]}
    
    # Report processing progress!
    count = count + 1
    if count % 10000 == 0:
        print(count)
    
    doc_topics.append(temp)

doc_topic_dist = pd.DataFrame(doc_topics).set_index("node")

doc_topic_dist.to_csv("working/doc_topic_dist_backup.csv")

doc_topic_dist.head()

10000
20000


Unnamed: 0_level_0,cluster,score
node,Unnamed: 1_level_1,Unnamed: 2_level_1
0,22,0.320047
1,72,0.328762
2,74,0.344483
3,72,0.3709
4,22,0.235525


In [13]:
corpus.join(doc_topic_dist).to_csv("working/RSS_clustered_lda.csv", index=False)