# Dynamic Topic Modeling

In [1]:
# setting up our imports

from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus
import numpy
from gensim.matutils import hellinger
from gensim.models.wrappers.dtmmodel import DtmModel

# 1)-Loading data

In [2]:
import pandas as pd

# Read the raw articles and keep only the columns used in this notebook.
df_fake = pd.read_csv('fake.csv')[['title', 'text', 'language', 'published']]

# Keep English articles that have a title, a body and a publication timestamp.
has_required_fields = df_fake[['title', 'text', 'published']].notnull().all(axis=1)
is_english = df_fake['language'] == 'english'
df_fake = df_fake.loc[has_required_fields & is_english]
df_fake.head()

Unnamed: 0,title,text,language,published
0,Muslims BUSTED: They Stole Millions In Gov’t B...,Print They should pay all the back all the mon...,english,2016-10-26T21:41:00.000+03:00
1,Re: Why Did Attorney General Loretta Lynch Ple...,Why Did Attorney General Loretta Lynch Plead T...,english,2016-10-29T08:47:11.259+03:00
2,BREAKING: Weiner Cooperating With FBI On Hilla...,Red State : \nFox News Sunday reported this mo...,english,2016-10-31T01:41:49.479+02:00
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,english,2016-11-01T05:22:00.000+02:00
4,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,2016-11-01T21:56:00.000+02:00


# 2)-Preprocessing

In [3]:
import re
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation

import nltk
nltk.download('wordnet') # download wordnet to be used in lemmatization
from nltk.stem import WordNetLemmatizer

def preprocess(texts):
    """Turn raw documents into lists of cleaned, lemmatized tokens.

    Each document is lower-cased and split into word-character tokens, stop
    words and punctuation are removed with gensim's helpers, tokens of one or
    two characters and purely numeric tokens are dropped, and every remaining
    token is lemmatized as a verb with WordNet.

    Parameters
    ----------
    texts : iterable of str
        Raw document strings.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    processed = []
    for line in texts:
        # tokenization
        tokens = re.findall(r'\w+', line.lower())
        # remove stopwords
        tokens = remove_stopwords(' '.join(tokens)).split()
        # remove punctuation
        tokens = strip_punctuation(' '.join(tokens)).split()
        # Drop tokens that are only 1-2 characters long or purely numeric,
        # then lemmatize. The lemmatizer looks up a single word per call, so
        # it must be applied token by token: handing it the whole joined
        # document (as before) matches nothing and returns the text unchanged.
        processed.append([
            lemmatizer.lemmatize(token, pos='v')
            for token in tokens
            if len(token) > 2 and not token.isnumeric()
        ])

    return processed

# Pre-process every article body; the result is one token list per document.
processed_texts = preprocess(df_fake.text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\69785hsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from gensim.models.phrases import Phrases, Phraser

# Learn bigram collocations from the tokenized documents, scored with NPMI.
phrases = Phrases(
    processed_texts,
    min_count=1,
    threshold=0.8,
    scoring='npmi',
)
# Wrap the trained Phrases model in a Phraser for applying it to documents.
bigram = Phraser(phrases)
# Re-emit every document with the detected collocations merged into its tokens.
processed_texts = [document for document in bigram[processed_texts]]

In [5]:
from gensim import corpora

class DTMcorpus(corpora.textcorpus.TextCorpus):
    """TextCorpus whose input is used as-is as the collection of texts."""

    def __len__(self):
        """Return the number of documents held in ``self.input``."""
        return len(self.input)

    def get_texts(self):
        """Return ``self.input`` unchanged; no further tokenization is applied."""
        return self.input

# Build the corpus object directly from the preprocessed token lists.
corpus = DTMcorpus(processed_texts)
corpus

<__main__.DTMcorpus at 0x298bf0d63c8>

In [6]:
import warnings

# Parse the timestamps as UTC. The raw strings carry different UTC offsets
# (+02:00 and +03:00); without utc=True, current pandas versions return an
# object column for mixed offsets and the .dt accessor below fails.
df_fake["published"] = pd.to_datetime(df_fake["published"], utc=True)
published_month = df_fake["published"].dt.month

# Number of documents published in October (t1) and in November (t2).
t1 = int((published_month == 10).sum())
t2 = int((published_month == 11).sum())

# These counts are used further down as consecutive time slices, i.e. the
# first t1 documents are treated as October and the next t2 as November.
# Warn (rather than fail) when the data does not satisfy that assumption.
if t1 + t2 != len(df_fake):
    warnings.warn(
        "October + November counts (%d) do not cover all %d documents; "
        "the time slices will not match the corpus size." % (t1 + t2, len(df_fake))
    )
if not df_fake["published"].is_monotonic_increasing:
    warnings.warn(
        "Documents are not in chronological order; slicing the corpus by "
        "position will assign some documents to the wrong month."
    )

In [7]:
# Show the October (t1) and November (t2) document counts.
print(t1, t2)

5710 5967


In [8]:
# Document counts per time slice, in order: October first, then November.
time_slices = [t1, t2]

# 3)-Train

In [9]:
#dtm_path = "/Users/69785hsh/Downloads/Data Science/Learning Projects/fake_news/dtm-win64.exe"

In [10]:
#model = DtmModel(dtm_path, corpus, time_slices, num_topics=10, id2word=corpus.dictionary,initialize_lda=True)

In [11]:
#model.save('dtm_model')

In [12]:
import os

# Reuse the fitted model when a saved copy exists; otherwise fit it once and
# save it, so the notebook also runs top to bottom without a pre-existing file.
MODEL_FILE = 'dtm_model'

if os.path.exists(MODEL_FILE):
    model = DtmModel.load(MODEL_FILE)
else:
    # DtmModel takes the path of the DTM executable as its first argument.
    # That path is machine specific, so it is read from the environment
    # instead of being hard-coded in the notebook.
    dtm_path = os.environ.get('DTM_PATH')
    if not dtm_path:
        raise RuntimeError(
            "No saved model at %r and DTM_PATH is not set; set DTM_PATH to "
            "the DTM executable to train the model." % MODEL_FILE
        )
    model = DtmModel(dtm_path, corpus, time_slices, num_topics=10,
                     id2word=corpus.dictionary, initialize_lda=True)
    model.save(MODEL_FILE)

# 4)-Results

In [13]:
# Ten highest-weighted terms of topic 1 in the first time slice (time=0).
model.show_topic(topicid=1, time=0, topn=10)

[(0.039834287594018344, 'trump'),
 (0.01461527120528731, 'clinton'),
 (0.014525353329559152, 'election'),
 (0.0138158814047773, 'hillary'),
 (0.011232542212325548, 'donald'),
 (0.007367171195776696, 'said'),
 (0.006673959379252799, 'vote'),
 (0.006204276397775929, 'president'),
 (0.005779181800260542, 'campaign'),
 (0.004990313268932628, 'presidential')]

In [14]:
# Print the top-10 terms of the first few topics, once per time slice, to
# see how each topic changes between slices.
num_topics = 3
for topic_no in range(num_topics):
    print("\nTopic", topic_no)
    for slice_no, _ in enumerate(time_slices):
        print("Time slice", slice_no)
        top_terms = model.show_topic(topic_no, slice_no, topn=10)
        print(top_terms)


Topic 0
Time slice 0
[(0.008669098218929368, 'gold'), (0.006227503386173605, 'money'), (0.005947940757873672, 'year'), (0.005318989503281265, 'market'), (0.004934526867171783, 'bank'), (0.004473818736015696, 'new'), (0.004462629331330482, 'world'), (0.004329646335014066, 'said'), (0.004109784582221707, 'financial'), (0.0038180973909487936, 'percent')]
Time slice 1
[(0.008786834597120334, 'gold'), (0.006257792993489457, 'money'), (0.006019231762841592, 'year'), (0.005292442156430203, 'market'), (0.005114480241733794, 'bank'), (0.004539805903365135, 'new'), (0.004438509940075528, 'world'), (0.0043802479594249365, 'said'), (0.004140405222708584, 'financial'), (0.0038537340975632064, 'percent')]

Topic 1
Time slice 0
[(0.039834287594018344, 'trump'), (0.01461527120528731, 'clinton'), (0.014525353329559152, 'election'), (0.0138158814047773, 'hillary'), (0.011232542212325548, 'donald'), (0.007367171195776696, 'said'), (0.006673959379252799, 'vote'), (0.006204276397775929, 'president'), (0.0

# 5)-Distance between documents

In [15]:
# Inspect the gamma_ row of the first document; the output has one weight
# per topic (10 values).
doc = 0
model.gamma_[doc]

array([2.77008310e-04, 2.77008310e-04, 9.12585877e-01, 2.77008310e-04,
       2.77008310e-04, 2.77008310e-04, 8.51980563e-02, 2.77008310e-04,
       2.77008310e-04, 2.77008310e-04])

In [16]:
from gensim.matutils import hellinger

# Hellinger distance between the gamma_ rows (per-document topic weights)
# of documents 4 and 5.
doc1 = 4
doc2 = 5
hellinger(model.gamma_[doc1], model.gamma_[doc2])

0.9863725954985892

# 6)-Visualization

In [17]:
import pyLDAvis
import pyLDAvis.gensim
# Switch pyLDAvis to notebook mode before any visualization is built.
pyLDAvis.enable_notebook()

In [18]:
# Collect the pyLDAvis inputs from the model for the first time slice (time=0).
doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(
    time=0, corpus=corpus
)
# Build the prepared visualization data, then replace it with its display object.
vis_wrapper = pyLDAvis.prepare(
    topic_term_dists=topic_term,
    doc_topic_dists=doc_topic,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
    sort_topics=True,
)
vis_wrapper = pyLDAvis.display(vis_wrapper)

  nbits = re.compile('(\d+)bit').search(abits).group(1)
  "\s+stepping\s+(?P<STP>\d+)", re.IGNORECASE)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [19]:
# Show the visualization built for time slice 0.
vis_wrapper

In [21]:
# Collect the pyLDAvis inputs from the model for the second time slice (time=1).
doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(
    time=1, corpus=corpus
)
# Build the prepared visualization data, then replace it with its display object.
vis_wrapper2 = pyLDAvis.prepare(
    topic_term_dists=topic_term,
    doc_topic_dists=doc_topic,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
    sort_topics=True,
)
vis_wrapper2 = pyLDAvis.display(vis_wrapper2)

In [22]:
# Show the visualization built for time slice 1.
vis_wrapper2