# A Million News Headlines Dataset - Kaggle

In [1]:
import pandas as pd



In [2]:
# Load the headlines CSV, skipping malformed rows instead of aborting the load.
# `on_bad_lines='skip'` is the replacement for `error_bad_lines=False`, which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv('abcnews-date-text.csv', on_bad_lines='skip')
# Take an explicit copy so that adding a column below does not write into a
# selection of `data` (which raises SettingWithCopyWarning).
data_text = data[['headline_text']].copy()
# Keep the row label as an ordinary column so a document can be looked up by it.
data_text['index'] = data_text.index
documents = data_text


In [3]:
# Number of rows (headlines) loaded.
documents.shape[0]

1186018

In [4]:
# Preview the first five rows.
documents.head(5)

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


## Data Preprocessing

In [14]:
import nltk
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

### Functions to perform the lemmatization and stemming preprocessing steps on the data set.

In [15]:
# Single Porter stemmer instance shared by `lemmatize_stemming` below.
stemmer = PorterStemmer()

In [16]:
# Build the lemmatizer once: constructing a new WordNetLemmatizer for every
# token is wasted work when this function is mapped over the whole corpus.
_lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Parameters
    ----------
    text : str
        One token (word) to normalise.

    Returns
    -------
    str
        The token after WordNet lemmatization with ``pos='v'`` followed by
        stemming with the module-level ``stemmer``.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))


In [17]:
def preprocess(text):
    """Tokenize a headline and normalise every token worth keeping.

    The text is split into lowercase tokens with gensim's
    ``simple_preprocess``. Tokens that are gensim stopwords, or that have
    three characters or fewer, are dropped; each remaining token is passed
    through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        Raw headline text.

    Returns
    -------
    list
        The normalised tokens, in their original order.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

`gensim.utils.simple_preprocess`

- Converts a document into a list of lowercase tokens, ignoring tokens that are too short or too long.

In [18]:
# select an index from the documents to check that the preprocessing works


In [19]:
# Fetch the headline whose 'index' value is 2, selecting the text column by
# name rather than by its position in the underlying array.
doc_sample = documents.loc[documents['index'] == 2, 'headline_text'].iloc[0]

# The raw headline split on single spaces, for comparison with the cleaned tokens.
words = doc_sample.split(' ')
print('original document: ')
print(words)

print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['a', 'g', 'calls', 'for', 'infrastructure', 'protection', 'summit']


 tokenized and lemmatized document: 
['call', 'infrastructur', 'protect', 'summit']


In [20]:
### pass the headline_text of documents to the preprocess function

In [21]:
# Run `preprocess` on every headline; the result is a Series holding one
# token list per document.
processed_docs = documents['headline_text'].map(preprocess)

In [24]:
# Preview the token lists of the first five headlines.
processed_docs.head(5)

0        [decid, commun, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
3               [staff, aust, strike, rise]
4      [strike, affect, australian, travel]
Name: headline_text, dtype: object

# Bag of Words on the Data set

Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [28]:
from gensim import corpora

In [29]:
# Build the token <-> integer id mapping from the preprocessed headlines.
dictionary = corpora.Dictionary(processed_docs)

In [30]:
# Display the Dictionary object (confirms it was built; its entries are
# inspected in the following cell).
dictionary


<gensim.corpora.dictionary.Dictionary at 0x26594392e48>

In [32]:
# Print the first 11 (id, token) pairs of the dictionary as a sanity check.
for seen, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if seen > 10:
        break

0 broadcast
1 commun
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit


 ## Gensim filter_extremes
 
 
 Filter out tokens that appear in
 
 
less than 10 documents (absolute number) or


more than 0.5 of the documents (a fraction of the total corpus size, not an absolute number).


After the above two steps, keep only the 100000 most frequent tokens.

In [33]:
# Prune the vocabulary: the return value is not used, so `dictionary` itself
# is modified by this call. Re-running this cell therefore operates on the
# already filtered dictionary.
# no_below=10   -> drop tokens found in fewer than 10 documents
# no_above=0.5  -> drop tokens found in more than half of all documents
# keep_n=100000 -> then keep at most the 100000 most frequent tokens
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)

## Gensim doc2bow

Convert document into the bag-of-words (BoW) format

In [34]:
# Convert every token list into its bag-of-words form: a list of
# (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
# Show the BoW vector of the sample document inspected earlier.
bow_corpus[2]

[(7, 1), (8, 1), (9, 1), (10, 1)]

In [35]:

bow_doc_2 = bow_corpus[2]

# Each BoW entry is a (token_id, count) pair; look the token text up in the
# dictionary to make the vector readable.
for token_id, frequency in bow_doc_2:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     frequency))

Word 7 ("call") appears 1 time.
Word 8 ("infrastructur") appears 1 time.
Word 9 ("protect") appears 1 time.
Word 10 ("summit") appears 1 time.


# TF-IDF Model

In [36]:
from gensim import corpora, models

In [37]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)

In [38]:
# Apply the fitted TF-IDF model to the whole BoW corpus by indexing the model
# with it.
corpus_tfidf = tfidf[bow_corpus]

In [39]:
from pprint import pprint
# Preview only the first TF-IDF weighted document: the loop breaks after one
# iteration, so the rest of the corpus is not iterated.
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5860586362613176),
 (1, 0.3854657616151764),
 (2, 0.5006618583937537),
 (3, 0.5072367544211179)]


# Running LDA using Bag of Words


Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

`print_topics` - alias for `show_topics()` that returns (and logs) the `num_words` most probable words for each of `num_topics` topics. Set `num_topics=-1` to print all topics.


For each topic, we will explore the words occurring in that topic and their relative weights.

In [40]:
# Train a 10-topic LDA model on the raw bag-of-words counts.
# `random_state` is pinned so initialisation no longer depends on how much of
# the global NumPy RNG was consumed by earlier cells (or by re-running cells
# out of order).
# NOTE(review): with more than one worker, results may still differ slightly
# between runs; confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=2018,
)

In [41]:
# Each entry is a (topic id, weighted-terms string) pair; print one per topic.
for topic_entry in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(*topic_entry))

Topic: 0 
Words: 0.033*"queensland" + 0.022*"perth" + 0.018*"island" + 0.017*"speak" + 0.016*"interview" + 0.014*"releas" + 0.013*"assault" + 0.013*"prison" + 0.013*"stori" + 0.012*"scott"
Topic: 1 
Words: 0.048*"australia" + 0.034*"trump" + 0.023*"world" + 0.017*"donald" + 0.014*"test" + 0.013*"final" + 0.012*"miss" + 0.012*"women" + 0.011*"win" + 0.010*"guilti"
Topic: 2 
Words: 0.026*"kill" + 0.023*"china" + 0.022*"south" + 0.021*"news" + 0.020*"north" + 0.018*"tasmania" + 0.016*"australian" + 0.012*"west" + 0.012*"talk" + 0.011*"say"
Topic: 3 
Words: 0.023*"market" + 0.016*"water" + 0.014*"rise" + 0.014*"bank" + 0.013*"price" + 0.012*"victoria" + 0.012*"fall" + 0.012*"australian" + 0.011*"drum" + 0.011*"million"
Topic: 4 
Words: 0.033*"year" + 0.027*"court" + 0.018*"murder" + 0.018*"face" + 0.016*"jail" + 0.015*"accus" + 0.014*"brisban" + 0.014*"famili" + 0.014*"peopl" + 0.014*"alleg"
Topic: 5 
Words: 0.034*"elect" + 0.024*"crash" + 0.021*"canberra" + 0.020*"die" + 0.015*"bushfir" +

# Running LDA using TF-IDF

In [42]:
# Train a 10-topic LDA model on the TF-IDF weighted corpus.
# `random_state` is pinned so the result does not depend on whether the
# bag-of-words LDA cell (or anything else that draws from the global NumPy
# RNG) was run beforehand.
# NOTE(review): with more than one worker, results may still differ slightly
# between runs; confirm if exact reproducibility is required.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=2018,
)

In [43]:
# Each entry is a (topic id, weighted-terms string) pair; print one per topic.
for topic_entry in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(*topic_entry))

Topic: 0 Word: 0.032*"trump" + 0.009*"miss" + 0.009*"christma" + 0.008*"search" + 0.007*"septemb" + 0.006*"plane" + 0.005*"adelaid" + 0.005*"beach" + 0.005*"island" + 0.005*"johnson"
Topic: 1 Word: 0.019*"market" + 0.011*"royal" + 0.009*"street" + 0.009*"share" + 0.008*"turnbul" + 0.008*"john" + 0.008*"wednesday" + 0.008*"busi" + 0.007*"china" + 0.007*"commiss"
Topic: 2 Word: 0.018*"charg" + 0.016*"murder" + 0.013*"polic" + 0.013*"court" + 0.011*"interview" + 0.010*"stori" + 0.010*"jail" + 0.010*"alleg" + 0.009*"woman" + 0.009*"guilti"
Topic: 3 Word: 0.008*"bushfir" + 0.007*"financ" + 0.006*"million" + 0.006*"ash" + 0.006*"april" + 0.006*"award" + 0.005*"western" + 0.005*"onlin" + 0.005*"coast" + 0.005*"firefight"
Topic: 4 Word: 0.013*"donald" + 0.008*"farm" + 0.007*"drought" + 0.006*"grandstand" + 0.006*"peter" + 0.006*"water" + 0.005*"farmer" + 0.005*"plan" + 0.005*"fund" + 0.005*"novemb"
Topic: 5 Word: 0.012*"drum" + 0.009*"tuesday" + 0.009*"monday" + 0.009*"flood" + 0.008*"violenc"