**Simple Topic Modeling for DH**

Jo, Eun Seo for ESUDH2018

In this unit, we'll go through one example of LDA topic modeling on a corpus of news headlines with gensim.

In [None]:
import pandas

In [None]:
# Data set: ABC News headlines from Kaggle
# https://www.kaggle.com/therohk/million-headlines/version/6#
# NOTE: the path below is machine-specific (it expects the CSV in ~/Downloads);
# adjust it to wherever you saved the file.
# NOTE(review): the variable name `csv` is also the name of a standard-library
# module -- avoid `import csv` later in this notebook.
csv = pandas.read_csv('~/Downloads/abcnews-date-text.csv')

In [None]:
# Inspect the column names; later cells read "headline_text" and "publish_date".
csv.columns

In [None]:
# Pull the headline text column out as a plain Python list.
headlines = csv["headline_text"].tolist()
# Preview only the first few items: displaying the whole list would dump
# every headline in the data set into the cell output.
headlines[:10]

In [None]:
# Publication dates as a plain Python list (one entry per headline).
dates = csv.loc[:, "publish_date"].tolist()

In [None]:
# Report how many rows (headlines) were loaded.
print("We have", csv.shape[0], "items total")

In [None]:
# Keep only the year (to simplify this example).
# Integer floor division by 10000 drops the last four digits of each date,
# without a round-trip through float.
# assumes each date is an integer of the form YYYYMMDD -- TODO confirm
years = [date // 10000 for date in dates]


In [None]:
# Clean the data so that every headline is
#   1. lower-cased (ours already are, but normalizing explicitly is harmless)
#   2. tokenized
#   3. free of stop words

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NOTE: NLTK needs its stop-word and tokenizer data to be downloaded once
# (see nltk.download) before these calls work.
eng_stop = stopwords.words("english")
# Membership tests against a set are constant-time; the list form would be
# scanned once per token, which adds up over a large corpus.
eng_stop_set = frozenset(eng_stop)

cleaned_headlines = []
for headline in headlines:
    # str.lower() returns a new string, so the result must be kept.
    lowered = headline.lower()
    tokens = [token for token in word_tokenize(lowered) if token not in eng_stop_set]
    cleaned_headlines.append(tokens)



In [None]:
# Preview a slice of the cleaned data: each headline is now a list of
# tokens with the stop words removed.
cleaned_headlines[10:30]

In [None]:
#import gensim
import gensim

In [None]:
from gensim.corpora.dictionary import Dictionary

In [None]:
# Build the vocabulary: every distinct word in the headlines data set
# is assigned its own unique integer ID.
our_dictionary = Dictionary(documents=cleaned_headlines)

In [None]:
# Convert every tokenized headline to its bag-of-words form using the dictionary.
numbered_headlines = list(map(our_dictionary.doc2bow, cleaned_headlines))

In [None]:
# Preview a slice: each headline is a list of (word ID, frequency) tuples.
numbered_headlines[10:30] #tuples where (ID, Freq)

In [None]:
# Train the LDA model on the bag-of-words corpus.
#   num_topics   -- how many topics to fit (you choose this number)
#   id2word      -- the ID-to-word mapping, so the model can report words
#                   rather than raw integer IDs when showing topics
#   random_state -- fixed seed so that re-running the notebook reproduces
#                   the same topics
headlines_lda = gensim.models.ldamodel.LdaModel(
    numbered_headlines,
    num_topics=50,
    id2word=our_dictionary,
    random_state=0,
)

In [None]:
#"You've trained your LDA model!"

In [None]:
# Write the trained model to disk under the name 'lda_save' so it can be
# reloaded later without retraining.
headlines_lda.save('lda_save')

In [None]:
# Reload the model previously saved under 'lda_save'.
headlines_lda = gensim.models.ldamodel.LdaModel.load('lda_save')

In [None]:

# NOTE(review): there are no parentheses here, so this cell only displays the
# bound method object; it does not call get_topic_terms.
headlines_lda.get_topic_terms

In [None]:
# Fetch the topic-by-term matrix from the trained model.
topic_matrix = headlines_lda.get_topics()

In [None]:
# Expected shape: (number of topics, vocabulary size).
topic_matrix.shape #no. topics x vocab size

In [None]:
# Display the matrix itself (one row per topic).
topic_matrix

In [None]:
# numpy is imported in this cell so that it runs on a fresh kernel
# (Restart & Run All) instead of raising NameError on `np`.
import numpy as np

# Sum across the whole vocabulary for each topic (one value per topic).
np.sum(topic_matrix, axis=1)

In [None]:
import numpy as np

# Total weight of the first topic's row, summed over the whole vocabulary.
first_topic_row = topic_matrix[0, :]
first_topic_row.sum()

In [None]:
# The 20 highest-weighted words of topic no. 7, as returned by the model.
top20 = headlines_lda.get_topic_terms(topicid=7, topn=20)

In [None]:
# Look each word ID up in the dictionary and print the word next to its weight.
for word_id, weight in top20:
    word = our_dictionary[word_id]
    print(word, weight)

In [None]:
# The two highest-weighted terms of topic no. 5.
headlines_lda.show_topic(topicid=5, topn=2)

In [None]:
from collections import Counter

In [None]:
# Accumulate the total corpus frequency of every word ID.
counter = Counter()
for headline in numbered_headlines:
    # Each headline is a list of (word ID, frequency) pairs.
    # Counter.update adds the counts in place; `counter += ...` would
    # additionally rescan the whole counter on every iteration.
    # (The per-headline debug print was removed: it emitted one line per
    # headline and flooded the cell output.)
    counter.update(dict(headline))

In [None]:
# The 20 highest-weighted words of topic no. 5 (this reassigns `top20`).
top20 = headlines_lda.get_topic_terms(topicid=5, topn=20)