In [103]:
%%capture
import pandas as pd
import numpy as np

import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaMulticore
from gensim.models.phrases import Phraser, Phrases

import spacy
spacy.load('en', disable=['parser', 'ner'])

!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words("english")
stop_words.extend(['say', 'would', 'also', 'use'])

from pprint import pprint

from google.colab import drive

In [104]:
drive.mount("/content/gdrive/")

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [108]:
# Load data
data = pd.read_csv(r'/content/gdrive/My Drive/Datasets/Text - News Dataset/articles1.csv')

In [109]:
data.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,title,publication,author,date,year,month,url,content
0,23613,23617,42368,"Zuckerberg Gets Hacked, Turns Out to Have Really Lame Password - Breitbart",Breitbart,John Hayward,2016-06-06,2016.0,6.0,,"Over the weekend, Twitter and Pinterest accounts belonging to Facebook founder Mark Zuckerberg were hacked by a group called the “OurMine Team,” reportedly based in Saudi Arabi..."
1,39498,40983,59854,"Turkish bombing: 13 soldiers killed, 55 people wounded",CNN,Madison Park,2016-12-17,2016.0,12.0,,"(CNN) A car bomb exploded near a public bus in Turkey on Saturday, killing 13 soldiers and wounding 55 people, according to Turkish officials. Authorities later arrested 15 p..."
2,13203,13204,31940,"Fmr UFC Fighter Tim Kennedy Reenlists, Credits Trump: The Military ’Got Our Teeth Back’ - Breitbart",Breitbart,Trent Baker,2017-04-17,2017.0,4.0,,"Sunday on “Fox Friends Weekend,” former UFC Fighter Tim Kennedy said he decided to in the U. S. Army Special Forces after noticing the change in leadership under new Preside..."


In [110]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    4000 non-null   int64  
 1   Unnamed: 0.1  4000 non-null   int64  
 2   id            4000 non-null   int64  
 3   title         4000 non-null   object 
 4   publication   4000 non-null   object 
 5   author        3515 non-null   object 
 6   date          4000 non-null   object 
 7   year          4000 non-null   float64
 8   month         4000 non-null   float64
 9   url           0 non-null      float64
 10  content       4000 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 343.9+ KB


In [111]:
def preprocess_text(texts, allowed_pos_tags=['NOUN', 'VERB', 'ADV', 'ADJ']):
  preprocessed_texts = list()
  # Use simple preprocess to convert to lower case and remove punctuation then add to list if NOT in list of stop words
  texts = [[token for token in simple_preprocess(doc, deacc=True) if token not in stop_words] for doc in texts]
  bigrams = Phrases(texts, min_count=2)
  trigrams = Phrases(bigrams[texts], min_count=2)
  bigram_model = Phraser(bigrams)
  trigram_model = Phraser(trigrams)
  bi_gram_texts = [bigram_model[doc] for doc in texts]
  tri_gram_texts = [trigram_model[doc] for doc in bi_gram_texts]

  nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'] )
  for doc in tri_gram_texts:
    doc = nlp(' '.join(doc))
    # Use spacy to lemmatize and conform token's pos tag is in the allowed list
    preprocessed_texts.append([token.lemma_ for token in doc if token.pos_ in allowed_pos_tags])
  return preprocessed_texts

In [112]:
# Cleaned up text
texts = preprocess_text(data.content)



In [113]:
# Create dictionary
id2word = corpora.Dictionary(texts)

# Create corpus
corpus = [id2word.doc2bow(text) for text in texts]

**Build LDA Model**

In [114]:
%%capture
# Get the coherence score for different number of topics
history = dict()
for i in range(20, 31):
  # Build LDA model with i number of topcics
  lda_model = LdaMulticore(corpus=corpus, num_topics=i, id2word=id2word, random_state=1, passes=5)
  # Get topic coherence and add to history
  coherence_model = gensim.models.CoherenceModel(model=lda_model, texts=texts, corpus=corpus, dictionary=id2word)
  history[i] = coherence_model.get_coherence()

In [115]:
sorted(history.items(), key=lambda x: x[1], reverse=True)

[(20, 0.2830320464778939),
 (21, 0.27775344755493225),
 (23, 0.2764905216725714),
 (29, 0.27553904672756724),
 (22, 0.27291396023113595),
 (25, 0.2715243535102446),
 (28, 0.2661245151509894),
 (24, 0.26556283567460676),
 (30, 0.26510541240680496),
 (26, 0.264933749826053),
 (27, 0.26186433924385494)]

In [116]:
%%capture
# Build LDA model with number of topics that had the highest coherence score
lda_model = LdaMulticore(corpus=corpus, num_topics=20, id2word=id2word, random_state=1, passes=5)

In [117]:
pprint(lda_model.print_topics())

[(0,
  '0.015*"say" + 0.009*"company" + 0.007*"go" + 0.006*"make" + 0.005*"get" + '
  '0.005*"time" + 0.005*"people" + 0.005*"could" + 0.005*"work" + '
  '0.005*"year"'),
 (1,
  '0.011*"say" + 0.006*"report" + 0.006*"woman" + 0.005*"vote" + '
  '0.004*"company" + 0.004*"come" + 0.003*"sale" + 0.003*"call" + 0.003*"make" '
  '+ 0.003*"sell"'),
 (2,
  '0.011*"film" + 0.008*"show" + 0.007*"say" + 0.005*"make" + 0.004*"movie" + '
  '0.004*"play" + 0.004*"may" + 0.004*"star" + 0.004*"new" + 0.004*"report"'),
 (3,
  '0.009*"say" + 0.006*"customer" + 0.005*"people" + 0.004*"state" + '
  '0.004*"insurer" + 0.004*"make" + 0.004*"coverage" + 0.003*"pay" + '
  '0.003*"season" + 0.003*"drug"'),
 (4,
  '0.025*"say" + 0.008*"go" + 0.007*"people" + 0.006*"think" + 0.006*"know" + '
  '0.006*"make" + 0.004*"get" + 0.004*"police" + 0.004*"work" + 0.004*"take"'),
 (5,
  '0.009*"say" + 0.005*"go" + 0.005*"may" + 0.004*"state" + 0.004*"make" + '
  '0.004*"work" + 0.004*"people" + 0.004*"even" + 0.004*"trum

In [118]:
# Visualize topics
pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
vis

In [120]:
# Iterate through corpus to get dominant topic in each document
dominate_topics = list()
for i, row in enumerate(lda_model[corpus]):
  # Select the topic number and percent contribution of the dominant topic
  row = sorted(row, key=lambda x: x[1], reverse=True)[0]
  # Use index to append the text of the document along with the topic number and percent contribution
  dominate_topics.append([' '.join(texts[i]), row[0], row[1]])

# Create a dataframe using with the list from above
topic_sentences_df = pd.DataFrame()
topic_sentences_df[['text', 'topic number', 'percent contribution']] = np.array(dominate_topics, dtype=object)

In [121]:
pd.options.display.max_rows = None
# Group by topic number and show the three documents with the highest contribution for each topic number
topic_sentences_df.groupby('topic number').apply(lambda df: df.sort_values('percent contribution', ascending=False)[:3].drop(columns='topic number', axis=1))

Unnamed: 0_level_0,Unnamed: 1_level_0,text,percent contribution
topic number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2130,many american_citie transform millennial generation downtown turn bustling neighborhood full new apartment pricey coffee bar soon city may start run millennial number demograph...,0.997907
0,2675,last_fall apple chief design officer ask think draw make stuff friend have charlie_rose maybe ornament know installation design have have say responsible design apple include l...,0.997894
0,1794,big carmaker close huge global player come constantly build expensive year existence want tesla lately take exhort company remain lean mean painting little_guy constant_threat ...,0.997754
1,3264,travel remote colony eastern lowland southernmost reach vast offer agricultural giant soybean mennonite resident descendant european peasant carve settlement thick forest year ...,0.998907
1,3788,expert witness testify house_panel loot destruction priceless antique jihadist_groups across_middle_east identify collector europe_united_state top buyer ancient_artifact illeg...,0.997432
1,3003,dramatic scientist tick forward new set midnight indicate probability global catastrophe high say project physicist universally recognize indicator world vulnerability catastro...,0.996215
2,2679,film base jauntily cynical play may startle moment wisecrack silence patter raise mayor thinking write film humor frequently harsh probably version paper likely review hack kee...,0.998747
2,3173,officially say former male identity introduce quite literally way major hollywood figure could via cover preview connection mine muse sadly probably way ever agree put woman si...,0.995153
2,1439,freewheel comic career advertising garner worldwide acclaim satirical entertainment abound tv radio record die freberg die hospital confirm always hero carry brilliant legacy f...,0.99478
3,2609,tourist stroll yellowstone active geyser take selfie front thousand bubble boil mud pot hiss steam vent tread planet great time bomb enormous puzzle geophysicist decade use sei...,0.997376
