In [25]:
from texts2id_corpus import *
import pandas as pd
import numpy as np
import spacy
import gensim 
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# import pyLDAvis
# import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
from pprint import pprint
import time 
warnings.filterwarnings("ignore",category=DeprecationWarning)

### Import Dataset

In [26]:
# Load the news articles; later cells read the `text` and `title` columns of this frame.
df = pd.read_csv('news_clean_march_with_title.csv')

In [27]:
# Peek at the raw text of the first article.
df['text'].iloc[0]

"(RTTNews) - Indian shares opened notably higher on Monday amid bets the RBI will cut interest rates at its upcoming policy meeting on April 4. Firm cues from global markets also underpinned investor sentiment.\nOther Asian markets remained broadly higher amid optimism about U.S.-China trade talks and as data showed China's factory activity in March unexpectedly grew for the first time in fourth months.\nThe benchmark BSE Sensex was up 270 points or 0.70 percent at 38,943 while the broader Nifty index was up 71 points or 0.61 percent at 11,695.\nSBI rose over 1 percent on reports that it is charting a new course for the revival of Jet Airways.\nBank of Baroda rallied nearly 2 percent after it became the third largest lender in the country with the merger of Dena Bank and Vijaya Bank.\nCipla shed 0.7 percent after receiving eight good manufacturing practice observations from the U.S. FDA.\nTata Motors soared 5 percent after selling its shareholding in TAL Manufacturing Solutions to TASL

### Preprocess Data

We use each article's title plus its first paragraph containing at least 20 words as the input to our LDA topic models. To preprocess the data, we tokenize the text, remove stop words, build trigrams, and lemmatize. We also create the Dictionary and Corpus that serve as input to the model. All of this is done with the helper module texts2id_corpus.py that we defined for preprocessing.

In [28]:
def para_filter(x, min_words=20):
    """Return the first paragraph of ``x`` that has at least ``min_words`` words.

    Paragraphs are the pieces of ``x`` between newline characters. A
    paragraph's word count is the number of pieces obtained by splitting it on
    single spaces, so consecutive spaces inflate the count.

    Parameters
    ----------
    x : str
        Article text. Non-string values (e.g. the NaN pandas uses for a
        missing cell, or None) are treated as containing no paragraphs.
    min_words : int, default 20
        Minimum word count a paragraph needs in order to be returned.

    Returns
    -------
    str or None
        The first qualifying paragraph, or ``None`` when there is none.
    """
    # Missing cells arrive as float NaN / None; calling .split on them would raise.
    if not isinstance(x, str):
        return None
    for paragraph in x.split('\n'):
        if len(paragraph.split(' ')) >= min_words:
            return paragraph
    # Explicit "nothing qualified" result; callers drop these via dropna().
    return None

In [19]:
# Drop two rows by index label. The original note here was truncated ("the text has
# no paragraph length l..."); presumably these are rows whose text has no paragraph
# long enough for para_filter -- TODO confirm.
# errors='ignore' keeps the cell re-runnable: without it, a second execution raises
# KeyError because the labels are already gone.
df = df.drop([13338, 47335], errors='ignore')

In [20]:
# For every article, keep the first paragraph that para_filter accepts (None if there is none).
para_len = df.text.apply(para_filter)

In [21]:
# Model input per article: title, a space, then the selected paragraph.
# A row becomes NaN when either part is missing.
train_text = df.title+' '+para_len

In [22]:
# List any rows for which no input text could be built.
train_text[train_text.isna()]

Series([], dtype: object)

In [23]:
# Remove rows without input text, then confirm that no nulls remain.
# NOTE(review): if this ever drops rows, train_text no longer lines up row-for-row
# with df, which a later cell concatenates side by side with the topic table.
train_text = train_text.dropna()
train_text.isnull().value_counts()

False    51694
dtype: int64

In [24]:
# Preprocess the texts and print the elapsed wall-clock time in seconds.
start = time.time()
texts = train_text.tolist()
# text2corpus is not defined in this notebook; presumably it comes from the wildcard
# import of texts2id_corpus. Its three results are used later as the model's id2word
# mapping, the training corpus, and the tokenised texts for the coherence score.
id2text, corpus, words_lemmatized  = text2corpus(texts, num_gram=3)
end = time.time()
print(end-start)

338.6564128398895


In [32]:
# Sanity check: number of documents in the corpus.
len(corpus)

51694

### LDA Mallet Model with 25 topics (After hyperparameter tuning)

To use the LDA Mallet model, download Mallet from http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip and unzip it; `mallet_path` below must point to the extracted `bin/mallet` executable.

In [29]:
# Relative path to the Mallet launcher inside the unzipped distribution.
mallet_path = 'mallet-2.0.8/bin/mallet' 

In [33]:
from gensim.models.wrappers import LdaMallet

In [34]:
# Train the Mallet LDA model (25 topics, fixed random seed) and print the
# elapsed wall-clock time in seconds.
start = time.time()
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=25, id2word=id2text,random_seed=42)
end = time.time()
print(end-start)

121.29909992218018


* Here is the result of the 25 topics:

In [35]:
# Display all 25 topics as weighted keyword lists.
ldamallet.show_topics(num_topics=25)

[(0,
  '0.023*"campaign" + 0.022*"president" + 0.017*"political" + 0.015*"election" + 0.014*"party" + 0.013*"run" + 0.013*"trump" + 0.013*"vote" + 0.012*"candidate" + 0.010*"support"'),
 (1,
  '0.049*"country" + 0.032*"government" + 0.014*"official" + 0.013*"military" + 0.013*"force" + 0.012*"leader" + 0.011*"nation" + 0.011*"border" + 0.009*"chinese" + 0.009*"security"'),
 (2,
  '0.031*"technology" + 0.021*"service" + 0.019*"industry" + 0.013*"solution" + 0.012*"global" + 0.011*"development" + 0.011*"lead" + 0.010*"provide" + 0.010*"develop" + 0.009*"datum"'),
 (3,
  '0.056*"woman" + 0.033*"child" + 0.028*"school" + 0.026*"student" + 0.025*"family" + 0.019*"work" + 0.018*"parent" + 0.016*"young" + 0.014*"kid" + 0.013*"college"'),
 (4,
  '0.045*"state" + 0.024*"law" + 0.015*"rule" + 0.013*"public" + 0.012*"federal" + 0.011*"ban" + 0.011*"bill" + 0.011*"issue" + 0.010*"file" + 0.009*"decision"'),
 (5,
  '0.063*"market" + 0.032*"stock" + 0.028*"price" + 0.022*"share" + 0.020*"expect" + 0

### Compute Coherence Score

Coherence Score: The larger the coherence score, the better the result is

In [58]:
# Score the trained model with the 'c_v' coherence measure, using the tokenised
# texts and the dictionary produced during preprocessing.
coherence_model_lda = CoherenceModel(model=ldamallet, texts=words_lemmatized, dictionary=id2text, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('coherence score with 25 topics is {}'.format(coherence_lda))

coherence score with 25 topics is 0.5533738692702787


### Map the topic back to documents 

In [36]:
def format_topics_sentences(ldamodel, corpus=corpus, texts=train_text):
    """Build a per-document table of the dominant topic and its keywords.

    Parameters
    ----------
    ldamodel
        Topic model. ``ldamodel[corpus]`` must yield, for each document, a
        sequence of ``(topic_id, proportion)`` pairs, and
        ``ldamodel.show_topic(topic_id)`` must yield ``(word, weight)`` pairs.
    corpus
        Documents to score. The default is bound to the notebook-level
        ``corpus`` at the moment this cell is executed.
    texts
        Original text for each document, in the same order as ``corpus``.
        The default is bound to the notebook-level ``train_text`` at the
        moment this cell is executed.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic``,
        ``Perc_Contribution`` (rounded to 4 decimals) and ``Topic_Keywords``,
        followed by one column holding ``texts``. A document for which the
        model reports no topics gets missing values in the three topic
        columns, so every row still corresponds to the document at the same
        position.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Keywords depend only on the topic, so compute them once per topic
    # instead of once per document.
    keywords_by_topic = {}
    records = []

    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Placeholder keeps the row positions aligned with `texts`.
            records.append((float('nan'), float('nan'), None))
            continue

        # max() returns the first pair with the highest proportion, the same
        # element a stable descending sort would place first.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )

        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0,
    # and growing a frame row by row is quadratic in the number of documents.
    sent_topics_df = pd.DataFrame(records, columns=column_names)
    # Kept as float, as in the output displayed in this notebook (e.g. 5.0);
    # this also lets NaN mark documents without topics.
    sent_topics_df['Dominant_Topic'] = sent_topics_df['Dominant_Topic'].astype(float)

    # Add original text to the end of the output. Resetting the index makes
    # the join positional; otherwise a Series whose index labels have gaps
    # would be aligned by label and end up on the wrong rows.
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)


# Find the dominant topic of every training document.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=ldamallet,
    corpus=corpus,
    texts=train_text.values,
)

# reset_index() turns the row position into a leading column, which becomes
# Document_No once the columns are renamed.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

# Preview the first ten documents.
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,5.0,0.1683,"market, stock, price, share, expect, growth, e...",Sensex Jumps 270 Points In Early Trade; Nifty ...
1,1,15.0,0.1796,"star, series, movie, story, film, play, book, ...",April Fools' pranks: Some of greatest ever Com...
2,2,18.0,0.0829,"deal, plan, sign, week, leave, vote, agree, tr...",Brexiters cry betrayal but they're the ones to...
3,3,18.0,0.0904,"deal, plan, sign, week, leave, vote, agree, tr...",Brexiters cry betrayal but they're the ones to...
4,4,1.0,0.1575,"country, government, official, military, force...",Japan and Taiwan both scramble jets to confron...
5,5,1.0,0.1599,"country, government, official, military, force...",Japan and Taiwan both scramble jets to confron...
6,6,0.0,0.2797,"campaign, president, political, election, part...",Joe Biden tries to defuse first crisis - CNNPo...
7,7,0.0,0.2787,"campaign, president, political, election, part...",Joe Biden tries to defuse first crisis - CNNPo...
8,8,10.0,0.1193,"charge, case, man, arrest, accuse, police, cla...","Man killed after knocking on wrong door, Atlan..."
9,9,9.0,0.1127,"people, kill, attack, die, fire, family, death...","Man killed after knocking on wrong door, Atlan..."


In [42]:
# Make sure the two excluded rows are gone before lining df up with the topic table.
# errors='ignore' matters here: an earlier cell already drops these labels, so on a
# top-to-bottom run a plain drop() raises KeyError.
df = df.drop([13338, 47335], errors='ignore')
# Renumber rows 0..n-1 so they match the topic table row for row; the previous
# labels are kept in a new `index` column.
df = df.reset_index()

In [59]:
# Put the topic assignment columns next to the article metadata, row by row.
topic_columns = df_dominant_topic[['Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords']]
result = pd.concat([df, topic_columns], axis=1)

In [52]:
# Preview the combined table (head() shows five rows by default).
result.head()

Unnamed: 0,index,thread.uuid,author,external_links,published,text,thread.site_full,thread.site_categories,thread.site_section,thread.section_title,thread.main_image,thread.social.facebook.comments,thread.social.facebook.likes,thread.social.facebook.shares,title,url,Dominant_Topic,Topic_Perc_Contrib,Keywords
0,0,7a52f2edf84a687cfc42c0d371573ac06455ad09,RTTNews,[],2019-04-01T20:28:00.000+03:00,(RTTNews) - Indian shares opened notably highe...,markets.businessinsider.com,"['financial_news', 'finance', 'options']",https://markets.businessinsider.com/news,Stock Market News | Financial &amp; Business N...,,0.0,0.0,0.0,Sensex Jumps 270 Points In Early Trade; Nifty ...,https://markets.businessinsider.com/news/stock...,5.0,0.1683,"market, stock, price, share, expect, growth, e..."
1,1,96c3d083c027175254e8846137a414ce67802411,Todd Leopold,"['http://hoaxes.org/aprilfool/P90', 'http://ww...",2019-04-01T14:38:00.000+03:00,(CNN) April Fools' Day ain't what it used to b...,edition.cnn.com,['media'],http://rss.cnn.com/rss/cnn_latest.rss,CNN.com - RSS Channel,https://cdn.cnn.com/cnnnext/dam/assets/1903281...,0.0,0.0,0.0,April Fools' pranks: Some of greatest ever,https://edition.cnn.com/2019/04/01/us/best-apr...,15.0,0.1796,"star, series, movie, story, film, play, book, ..."
2,2,c896d41943bd6ebbe1485906aa77bdfd214bf5de,"Jane Merrick, for CNN","['https://get.adobe.com/flashplayer/', 'https:...",2019-04-01T13:50:00.000+03:00,London (CNN) Even with the Brexit countdown cl...,us.cnn.com,['media'],http://us.cnn.com/world,"World news – breaking news, videos and headlin...",https://cdn.cnn.com/cnnnext/dam/assets/1903291...,0.0,0.0,0.0,Brexiters cry betrayal but they're the ones to...,http://us.cnn.com/2019/04/01/uk/brexit-may-bet...,18.0,0.0829,"deal, plan, sign, week, leave, vote, agree, tr..."
3,3,04b0e6b739c64d0aedc1762949ba5bae7d339cd9,"Jane Merrick, for CNN",['https://fave.api.cnn.io/v1/fav/?video=world/...,2019-04-01T13:50:00.000+03:00,London (CNN) Even with the Brexit countdown cl...,edition.cnn.com,['media'],http://rss.cnn.com/rss/cnn_latest,CNN.com - RSS Channel,https://cdn.cnn.com/cnnnext/dam/assets/1903291...,0.0,0.0,0.0,Brexiters cry betrayal but they're the ones to...,https://edition.cnn.com/2019/04/01/uk/brexit-m...,18.0,0.0904,"deal, plan, sign, week, leave, vote, agree, tr..."
4,4,2866b68e92d101d4a7afcd1590db295f443caff0,Ben Westcott,['http://english.chinamil.com.cn/view/2019-03/...,2019-04-01T13:30:00.000+03:00,"Hong Kong (CNN) Taiwan has accused China of ""r...",edition.cnn.com,['media'],http://rss.cnn.com/rss/edition_asia.rss,CNN.com - RSS Channel - Regions - Asia,https://cdn.cnn.com/cnnnext/dam/assets/1904011...,0.0,0.0,0.0,Japan and Taiwan both scramble jets to confron...,https://edition.cnn.com/2019/04/01/asia/china-...,1.0,0.1575,"country, government, official, military, force..."


### Save and Load model 

In [53]:
# Write the articles with their topic assignments to CSV; index=False leaves out the row index.
result.to_csv('result_with_25_topics.csv', index=False)

In [61]:
# Save the trained model to disk so it can be reloaded without retraining.
ldamallet.save('lda_mallet.model')

In [62]:
# Reload the saved model under a separate name.
model =  gensim.models.wrappers.LdaMallet.load('lda_mallet.model')

In [63]:
# Display the reloaded model's 25 topics for comparison with the ones shown after training.
model.show_topics(num_topics=25)

[(0,
  '0.023*"campaign" + 0.022*"president" + 0.017*"political" + 0.015*"election" + 0.014*"party" + 0.013*"run" + 0.013*"trump" + 0.013*"vote" + 0.012*"candidate" + 0.010*"support"'),
 (1,
  '0.049*"country" + 0.032*"government" + 0.014*"official" + 0.013*"military" + 0.013*"force" + 0.012*"leader" + 0.011*"nation" + 0.011*"border" + 0.009*"chinese" + 0.009*"security"'),
 (2,
  '0.031*"technology" + 0.021*"service" + 0.019*"industry" + 0.013*"solution" + 0.012*"global" + 0.011*"development" + 0.011*"lead" + 0.010*"provide" + 0.010*"develop" + 0.009*"datum"'),
 (3,
  '0.056*"woman" + 0.033*"child" + 0.028*"school" + 0.026*"student" + 0.025*"family" + 0.019*"work" + 0.018*"parent" + 0.016*"young" + 0.014*"kid" + 0.013*"college"'),
 (4,
  '0.045*"state" + 0.024*"law" + 0.015*"rule" + 0.013*"public" + 0.012*"federal" + 0.011*"ban" + 0.011*"bill" + 0.011*"issue" + 0.010*"file" + 0.009*"decision"'),
 (5,
  '0.063*"market" + 0.032*"stock" + 0.028*"price" + 0.022*"share" + 0.020*"expect" + 0

### Predict New data

In [None]:
# Demo input: the training texts themselves stand in for "new" documents.
test_texts = train_text.tolist()
# NOTE(review): this produces a separate test_id2text mapping. If its token ids
# differ from id2text (the mapping the model was trained with), the model will not
# interpret test_corpus correctly -- TODO confirm how text2corpus assigns ids.
test_id2text, test_corpus, test_words_lemmatized  = text2corpus(test_texts, num_gram=3)

In [None]:
# test_texts is already a plain list (built with .tolist()), so pass it directly;
# a list has no `.values` attribute and the previous call raised AttributeError.
# (The variable name keeps its original spelling in case other cells refer to it.)
df_predict_topcis = format_topics_sentences(ldamodel=ldamallet, corpus=test_corpus, texts=test_texts)