In [2]:
%%capture
import pandas as pd
import numpy as np

import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaMulticore
from gensim.models.phrases import Phraser, Phrases

import spacy
spacy.load('en', disable=['parser', 'ner'])

!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words("english")
stop_words.extend(['say', 'would', 'also', 'use'])

from pprint import pprint

from google.colab import drive

In [3]:
drive.mount("/content/gdrive/")

Mounted at /content/gdrive/


In [4]:
# Load data
data = pd.read_csv(r'/content/gdrive/My Drive/Datasets/Text - News Dataset/articles1.csv')

In [5]:
data.head(3)

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."


In [6]:
# Sample dataset to reduce time it takes to build model
data = data.sample(n=4000)
data.shape

(4000, 10)

In [8]:
def preprocess_text(texts, allowed_pos_tags=['NOUN', 'VERB', 'ADV', 'ADJ']):
  preprocessed_texts = list()
  # Use simple preprocess to convert to lower case and remove punctuation then add to list if NOT in list of stop words
  texts = [[token for token in simple_preprocess(doc, deacc=True) if token not in stop_words] for doc in texts]
  bigrams = Phrases(texts, min_count=2)
  trigrams = Phrases(bigrams[texts], min_count=2)
  bigram_model = Phraser(bigrams)
  trigram_model = Phraser(trigrams)
  bi_gram_texts = [bigram_model[doc] for doc in texts]
  tri_gram_texts = [trigram_model[doc] for doc in bi_gram_texts]

  nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'] )
  for doc in tri_gram_texts:
    doc = nlp(' '.join(doc))
    # Use spacy to lemmatize and conform token's pos tag is in the allowed list
    preprocessed_texts.append([token.lemma_ for token in doc if token.pos_ in allowed_pos_tags])
  return preprocessed_texts

In [9]:
# Cleaned up text
texts = preprocess_text(data.content)



In [10]:
# Create dictionary
id2word = corpora.Dictionary(texts)

# Create corpus
corpus = [id2word.doc2bow(text) for text in texts]

**Build LDA Model**

In [144]:
%%capture
# Get the coherence score for different number of topics
history = dict()
for i in range(20, 31):
  # Build LDA model with i number of topcics
  lda_model = LdaMulticore(corpus=corpus, num_topics=i, id2word=id2word, random_state=1, passes=5)
  # Get topic coherence and add to history
  coherence_model = gensim.models.CoherenceModel(model=lda_model, texts=texts, corpus=corpus, dictionary=id2word)
  history[i] = coherence_model.get_coherence()

In [145]:
sorted(history.items(), key=lambda x: x[1], reverse=True)

[(27, 0.2696518121703623),
 (21, 0.26705845864612376),
 (29, 0.2663829739764625),
 (30, 0.2641949471828947),
 (20, 0.2634629738242359),
 (24, 0.26310530729395426),
 (26, 0.2624653069413419),
 (25, 0.260377415167788),
 (22, 0.2602195490230664),
 (23, 0.2593544715970842),
 (28, 0.2550881806008998)]

In [11]:
%%capture
# Build LDA model with number of topics that had the highest coherence score
lda_model = LdaMulticore(corpus=corpus, num_topics=27, id2word=id2word, random_state=1, passes=5)

In [12]:
pprint(lda_model.print_topics())

[(22,
  '0.009*"say" + 0.006*"virus" + 0.005*"report" + 0.005*"man" + '
  '0.004*"session" + 0.004*"case" + 0.004*"show" + 0.003*"official" + '
  '0.003*"country" + 0.003*"address"'),
 (25,
  '0.010*"say" + 0.005*"report" + 0.005*"call" + 0.004*"government" + '
  '0.004*"campus" + 0.004*"may" + 0.004*"student" + 0.003*"country" + '
  '0.003*"claim" + 0.003*"man"'),
 (24,
  '0.009*"abortion" + 0.006*"say" + 0.006*"auction" + 0.005*"baby" + '
  '0.005*"new" + 0.004*"salesforce" + 0.004*"art" + 0.004*"case" + '
  '0.004*"doctor" + 0.003*"come"'),
 (26,
  '0.017*"say" + 0.008*"company" + 0.005*"case" + 0.005*"may" + 0.004*"go" + '
  '0.004*"report" + 0.003*"government" + 0.003*"make" + 0.003*"include" + '
  '0.003*"could"'),
 (8,
  '0.018*"say" + 0.010*"trump" + 0.007*"vote" + 0.006*"support" + '
  '0.006*"people" + 0.005*"conservative" + 0.005*"may" + 0.004*"state" + '
  '0.004*"call" + 0.004*"include"'),
 (18,
  '0.009*"say" + 0.008*"show" + 0.006*"people" + 0.005*"make" + 0.005*"see" + 

In [149]:
# Visualize topics
pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
vis

In [48]:
# Iterate through corpus to get dominant topic in each document
dominate_topics = list()
for i, row in enumerate(lda_model[corpus]):
  # Select the topic number and percent contribution of the dominant topic
  row = sorted(row, key=lambda x: x[1], reverse=True)[0]
  # Use index to append the text of the document along with the topic number and percent contribution
  dominate_topics.append([' '.join(texts[i]), row[0], row[1]])

# Create a dataframe using with the list from above
topic_sentences_df = pd.DataFrame()
topic_sentences_df[['text', 'topic number', 'percent contribution']] = np.array(dominate_topics, dtype=object)

In [102]:
pd.options.display.max_rows = None
# Group by topic number and show the three documents with the highest contribution for each topic number
topic_sentences_df.groupby('topic number').apply(lambda df: df.sort_values('percent contribution', ascending=False)[:3].drop(columns='topic number', axis=1))

Unnamed: 0_level_0,Unnamed: 1_level_0,text,percent contribution
topic number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3582,throne season_six include speculation future event time year know annual appearance confuse explain excited know good_chance hype book resurrect murder bolton frey betray group...,0.997616
0,3592,question surprisingly fraught answer simple recently compile example man reddit twitter share photo woman find beautiful seemingly amazed enlighten taste woman celebrity believ...,0.996311
0,3798,beware late nasty virus sweep particularly elite citadel attack inexplicable fast light palsy stammering overbearing snobbery sometimes redness face include total delusionment\...,0.995701
1,3711,announce policy circle see asylum_seeker transfer first positive news asylum_seeker island circuit breaker receive support country however worth observing could consider good_d...,0.99739
1,3646,wake know drive early_hour monday_morning injure several people follow hold meet police_said_statement earlier_report give man age man search carry residential address area mee...,0.997009
1,3143,hundred law_enforcement_officer scour orlando flee trigger intense manhunt include search apartment complex place dozen school lockdown officer die search gunman die motorcycle...,0.996282
2,3967,annual migration wetland sure cuckoo go cuckoo tag sensor follow bird know great flier say live chinese capital project track bird migration incredibly perilous bird many peris...,0.997898
2,2371,soldier day american_soldier arrive help plan strategic city assault military increasingly find draw back regular combat situation year struggle offensive increase american\npa...,0.997425
2,3544,big key sign seem baffle deer stumble confusion howl pain find head hideously disfigure eat alive biologist discover turn even frightening reappearance parasite know new world\...,0.997362
3,3076,pull even ahead series recent unthinkable become least plausible elect must compete political map look forbid republican_primarie prove master nationalize political debate appe...,0.999082
