In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer, SnowballStemmer

data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [2]:
print(len(documents))
print(documents[:5])

1103663
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


# How to do the preprocess
- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- Lemmatized: words in third person are changed to first person and verbs in past and future tenses are changed into present.
- Stemmed: words are reduced to their root form.

In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wang_cheng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos = 'v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [10]:
doc_sample = documents[documents['index'] == 4310].values[0][0]

print("original documents: ")
words = []
for word in doc_sample.split(" "):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original documents: 
['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


In [None]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

## Find out the frequency of how many times a certain word has occured

In [13]:
dictionary = gensim.corpora.Dictionary(processed_docs)
# How to convert genism data frame to dict
print(type(dictionary))

count = 0
for k,v in dictionary.iteritems():
    print(k,v)
    count += 1
    if count > 10:
        break

<class 'gensim.corpora.dictionary.Dictionary'>
0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


In [15]:
dictionary.filter_extremes(no_below = 15, no_above = 0.5,keep_n = 100000)

In [16]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]
# Result: (9,2) represents that the word with index 9 occurs 2 times in the documens

[(76, 1), (112, 1), (483, 1), (4014, 1)]

In [19]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} times.".format(bow_doc_4310[i][0],
                                                     dictionary[bow_doc_4310[i][0]],
                                                               bow_doc_4310[i][1]))

Word 76 ("bushfir") appears 1 times.
Word 112 ("help") appears 1 times.
Word 483 ("rain") appears 1 times.
Word 4014 ("dampen") appears 1 times.


In [24]:
from gensim import corpora,models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5892908867507543),
 (1, 0.38929654337861147),
 (2, 0.4964985175717023),
 (3, 0.5046520327464028)]


## Running LDA using Bag of Words


In [30]:
lda_model = gensim.models.LdaMulticore(bow_corpus,num_topics = 10,
                                      id2word = dictionary,
                                      passes = 2,
                                      workers = 2)

In [32]:
for idx,topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx,topic))

Topic: 0 
Words: 0.044*"polic" + 0.026*"charg" + 0.025*"court" + 0.019*"murder" + 0.017*"death" + 0.014*"alleg" + 0.014*"interview" + 0.013*"jail" + 0.013*"miss" + 0.013*"woman"
Topic: 1 
Words: 0.038*"elect" + 0.018*"break" + 0.015*"guilti" + 0.013*"john" + 0.012*"rule" + 0.011*"hill" + 0.011*"marriag" + 0.011*"river" + 0.011*"rise" + 0.010*"speak"
Topic: 2 
Words: 0.026*"south" + 0.025*"australian" + 0.020*"countri" + 0.017*"donald" + 0.016*"world" + 0.012*"say" + 0.012*"show" + 0.012*"forc" + 0.011*"attack" + 0.010*"korea"
Topic: 3 
Words: 0.023*"kill" + 0.019*"test" + 0.019*"die" + 0.019*"women" + 0.016*"hour" + 0.016*"dead" + 0.015*"high" + 0.015*"turnbul" + 0.014*"price" + 0.014*"crash"
Topic: 4 
Words: 0.027*"adelaid" + 0.022*"market" + 0.018*"tasmania" + 0.017*"power" + 0.016*"time" + 0.016*"share" + 0.013*"busi" + 0.013*"week" + 0.012*"farm" + 0.012*"game"
Topic: 5 
Words: 0.058*"australia" + 0.029*"queensland" + 0.026*"say" + 0.020*"warn" + 0.013*"deal" + 0.013*"china" + 0.01

## Running LDA using TF-IDF

In [43]:
# lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics = 10,
#                                             id2word = dictionary, 
#                                              passes = 2, 
#                                              workers = 4)
# for idx, topic in lda_model_tfidf.print_topics(-1):
#     print("Topic: {} Word: {}".format(idx, topic))
    
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.015*"rural" + 0.013*"news" + 0.012*"elect" + 0.009*"nation" + 0.009*"turnbul" + 0.008*"labor" + 0.007*"polit" + 0.007*"govern" + 0.006*"busi" + 0.006*"john"
Topic: 1 Word: 0.011*"world" + 0.011*"australia" + 0.007*"win" + 0.007*"korea" + 0.006*"australian" + 0.006*"south" + 0.006*"august" + 0.006*"cricket" + 0.005*"smith" + 0.005*"intern"
Topic: 2 Word: 0.020*"trump" + 0.013*"kill" + 0.012*"crash" + 0.010*"dead" + 0.007*"die" + 0.007*"juli" + 0.006*"marriag" + 0.006*"attack" + 0.005*"syria" + 0.005*"islam"
Topic: 3 Word: 0.008*"rugbi" + 0.008*"octob" + 0.008*"asylum" + 0.007*"leagu" + 0.007*"seeker" + 0.006*"scott" + 0.006*"wrap" + 0.006*"obama" + 0.006*"histori" + 0.005*"round"
Topic: 4 Word: 0.011*"hill" + 0.010*"grandstand" + 0.008*"celebr" + 0.007*"final" + 0.006*"zealand" + 0.006*"april" + 0.006*"grand" + 0.006*"capit" + 0.006*"tree" + 0.006*"great"
Topic: 5 Word: 0.023*"countri" + 0.021*"hour" + 0.011*"podcast" + 0.010*"govern" + 0.008*"royal" + 0.008*"sport" + 0

In [35]:
processed_docs[4310]

['rain', 'help', 'dampen', 'bushfir']

In [36]:
for index,score in sorted(lda_model[bow_corpus[4310]], key = lambda tup: -1*tup[1]):
    print('\nScore: {}\t \nTopic: {}'.format(score, lda_model_tfidf.print_topic(index,10)))



Score: 0.42036205530166626	 
Topic: 0.007*"flood" + 0.007*"bushfir" + 0.007*"octob" + 0.006*"wednesday" + 0.006*"central" + 0.006*"health" + 0.005*"cyclon" + 0.005*"mental" + 0.005*"victoria" + 0.005*"care"

Score: 0.2201773226261139	 
Topic: 0.011*"turnbul" + 0.010*"govern" + 0.008*"live" + 0.007*"light" + 0.006*"pacif" + 0.006*"thursday" + 0.006*"malcolm" + 0.006*"decemb" + 0.005*"islam" + 0.005*"state"

Score: 0.219331756234169	 
Topic: 0.007*"say" + 0.007*"abus" + 0.007*"marriag" + 0.007*"commiss" + 0.006*"royal" + 0.006*"juli" + 0.006*"mother" + 0.006*"novemb" + 0.006*"korea" + 0.006*"sexual"

Score: 0.02001936174929142	 
Topic: 0.013*"drum" + 0.007*"tuesday" + 0.007*"august" + 0.007*"shark" + 0.007*"abbott" + 0.006*"kill" + 0.006*"updat" + 0.005*"suicid" + 0.005*"footag" + 0.005*"rememb"

Score: 0.02001914568245411	 
Topic: 0.011*"final" + 0.010*"interview" + 0.009*"world" + 0.008*"australia" + 0.007*"leagu" + 0.007*"open" + 0.006*"hobart" + 0.006*"beat" + 0.006*"friday" + 0.006

In [39]:
print(type(bow_corpus[4310]))
print(bow_corpus[4310])
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

<class 'list'>
[(76, 1), (112, 1), (483, 1), (4014, 1)]

Score: 0.6064276099205017	 
Topic: 0.007*"flood" + 0.007*"bushfir" + 0.007*"octob" + 0.006*"wednesday" + 0.006*"central" + 0.006*"health" + 0.005*"cyclon" + 0.005*"mental" + 0.005*"victoria" + 0.005*"care"

Score: 0.2334822118282318	 
Topic: 0.007*"say" + 0.007*"abus" + 0.007*"marriag" + 0.007*"commiss" + 0.006*"royal" + 0.006*"juli" + 0.006*"mother" + 0.006*"novemb" + 0.006*"korea" + 0.006*"sexual"

Score: 0.020012089982628822	 
Topic: 0.020*"rural" + 0.019*"countri" + 0.018*"hour" + 0.012*"news" + 0.011*"market" + 0.009*"podcast" + 0.008*"share" + 0.007*"busi" + 0.006*"nation" + 0.006*"street"

Score: 0.020011555403470993	 
Topic: 0.011*"turnbul" + 0.010*"govern" + 0.008*"live" + 0.007*"light" + 0.006*"pacif" + 0.006*"thursday" + 0.006*"malcolm" + 0.006*"decemb" + 0.005*"islam" + 0.005*"state"

Score: 0.020011531189084053	 
Topic: 0.009*"violenc" + 0.008*"farm" + 0.007*"climat" + 0.007*"monday" + 0.006*"dairi" + 0.006*"domest" 

In [41]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, socre in sorted(lda_model[bow_vector], key = lambda tup: -1*tup[1]):
    # 
    print("Score: {}\t Topic: {}".format(score,lda_model.print_topic(index,5)))

Score: 0.020010629668831825	 Topic: 0.058*"australia" + 0.029*"queensland" + 0.026*"say" + 0.020*"warn" + 0.013*"deal"
Score: 0.020010629668831825	 Topic: 0.026*"south" + 0.025*"australian" + 0.020*"countri" + 0.017*"donald" + 0.016*"world"
Score: 0.020010629668831825	 Topic: 0.027*"adelaid" + 0.022*"market" + 0.018*"tasmania" + 0.017*"power" + 0.016*"time"
Score: 0.020010629668831825	 Topic: 0.023*"kill" + 0.019*"test" + 0.019*"die" + 0.019*"women" + 0.016*"hour"
Score: 0.020010629668831825	 Topic: 0.025*"nation" + 0.023*"canberra" + 0.019*"rural" + 0.014*"farmer" + 0.013*"flood"
Score: 0.020010629668831825	 Topic: 0.025*"coast" + 0.025*"school" + 0.017*"water" + 0.017*"state" + 0.016*"gold"
Score: 0.020010629668831825	 Topic: 0.044*"polic" + 0.026*"charg" + 0.025*"court" + 0.019*"murder" + 0.017*"death"
Score: 0.020010629668831825	 Topic: 0.038*"elect" + 0.018*"break" + 0.015*"guilti" + 0.013*"john" + 0.012*"rule"
Score: 0.020010629668831825	 Topic: 0.028*"govern" + 0.021*"plan" + 0.