In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/million-headlines/abcnews-date-text.csv


# First, load and examine the data

In [2]:
import pandas as pd

data = pd.read_csv('/kaggle/input/million-headlines/abcnews-date-text.csv', error_bad_lines=False)
data['publish_date'] = pd.to_datetime(data.publish_date, format='%Y%m%d')
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [8]:
data.head()

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [15]:
data.describe()

  """Entry point for launching an IPython kernel.


Unnamed: 0,publish_date,headline_text
count,1226258,1226258
unique,6517,1195191
top,2012-08-24 00:00:00,national rural news
freq,384,983
first,2003-02-19 00:00:00,
last,2020-12-31 00:00:00,


In [9]:
documents.head()

Unnamed: 0,headline_text,index
0,aba decides against community broadcasting lic...,0
1,act fire witnesses must be aware of defamation,1
2,a g calls for infrastructure protection summit,2
3,air nz staff in aust strike for pay rise,3
4,air nz strike to affect australian travellers,4


# Data Preprossing 
* Tokenization: Split the headline text into sentences and sentences into words. Lowercase the words and remove punctuation
* Words that have fewer than 3 characters are removed
* All stopwords are removed
* Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present
* Words are stemmed — words are reduced to their root form (plural to single) 

In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# An example of lemmatisation 
print(WordNetLemmatizer().lemmatize('gone', pos='v'))

go


In [4]:
# An example of stemming 
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [5]:
# Create a function that puts together lemmatisation and stemming 
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

In [6]:
# Create a function to split sentences into words, remove stopwords and any words that have less than 3 characters 
# Pass the result into the lemmatize_stemming function defined above to retrieve its root form
STOPWORDS = gensim.parsing.preprocessing.STOPWORDS
def preprocess(headline):
    result = []
    for token in gensim.utils.simple_preprocess(headline):
        if token not in STOPWORDS and len(token) >= 3:
            result.append(lemmatize_stemming(token))
    return result

In [37]:
# Test whether the function works or not 
doc_sample = documents[documents['index'] == 4300].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['poultry', 'firm', 'drops', 'building', 'buy', 'plan']


 tokenized and lemmatized document: 
['poultri', 'firm', 'drop', 'build', 'buy', 'plan']


In [7]:
preprocess_docs = documents.headline_text.map(preprocess)
pd.DataFrame(data = {'Original headline': documents.headline_text[:5], 'Processed headline': preprocess_docs[:5]})

Unnamed: 0,Original headline,Processed headline
0,aba decides against community broadcasting lic...,"[aba, decid, communiti, broadcast, licenc]"
1,act fire witnesses must be aware of defamation,"[act, wit, awar, defam]"
2,a g calls for infrastructure protection summit,"[call, infrastructur, protect, summit]"
3,air nz staff in aust strike for pay rise,"[air, staff, aust, strike, pay, rise]"
4,air nz strike to affect australian travellers,"[air, strike, affect, australian, travel]"


# Create Bag of Words 
* Create a word dictionary where unqiue words across all documents are represented by unique indexes 
* Since there are lots of unique words, we filter out words that are either too common or too rare 
* Count the number of times a word appears for each headline 
  * Example: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)] for the first headline 

In [23]:
# create a word dictionary where unique words in all headlines are represented by unique indexes 
word_dict = gensim.corpora.Dictionary(preprocess_docs)
# print the first 5 words in this word dictionary 
count = 0 
for index, word in word_dict.iteritems():
    print(index, word)
    count+=1
    if count > 5:
        break

0 aba
1 broadcast
2 communiti
3 decid
4 licenc
5 act


In [25]:
# Filter out tokens that appear in
# less than 15 documents (absolute number) or
# more than 0.5 documents (fraction of total corpus size, not absolute number).
# after the above two steps, keep only the first 100000 most frequent tokens
print("length of word dictionary before filtering is {}".format(len(word_dict)))
word_dict.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
print("length of word dictionary after filtering is {}".format(len(word_dict)))

length of word dictionary before filtering is 71559
length of word dictionary after filtering is 15952


In [29]:
# Count word frequency 
bow_corpus = [word_dict.doc2bow(doc) for doc in preprocess_docs]
# Take document 4310 as an example 
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time in document 4310.".format(bow_doc_4310[i][0], 
                                                     word_dict[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 173 ("govt") appears 1 time in document 4310.
Word 259 ("group") appears 1 time in document 4310.
Word 311 ("vote") appears 1 time in document 4310.
Word 633 ("local") appears 1 time in document 4310.
Word 895 ("want") appears 1 time in document 4310.
Word 3742 ("compulsori") appears 1 time in document 4310.
Word 3743 ("ratepay") appears 1 time in document 4310.


In [30]:
# Examine word counts on the first 5 headlines 
bow_corpus[:5]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(5, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)],
 [(13, 1), (18, 1), (19, 1), (20, 1), (21, 1)]]

# TF-IDF Transformation

* Use TF-IDF to assign more weights on words that are only unique to a headline and put less weights on words that are common across all headlines 

In [37]:
# Run ti-idf on bag of words 
from gensim import models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
# Examine word frequency on first 5 headlines 
count = 0
for doc in corpus_tfidf:
    pprint(doc)
    count+=1
    if count >= 5:
        break

[(0, 0.5961221629567331),
 (1, 0.4691066873890528),
 (2, 0.31151361288073415),
 (3, 0.4021230894767581),
 (4, 0.4072266845116059)]
[(5, 0.3782290788329865),
 (6, 0.5556888380919022),
 (7, 0.5701091938749195),
 (8, 0.47236446331674)]
[(9, 0.38317854322330597),
 (10, 0.5642748994658622),
 (11, 0.4730298749725829),
 (12, 0.5576834041187516)]
[(13, 0.41285477758966027),
 (14, 0.44167870536827114),
 (15, 0.3851962507305321),
 (16, 0.35945495237039954),
 (17, 0.44280111873520456),
 (18, 0.4010162200360306)]
[(13, 0.4355460552411905),
 (18, 0.4230568282245323),
 (19, 0.5127865974620833),
 (20, 0.3446105674086784),
 (21, 0.49961586859302676)]


# Run LDA on TF-IDF

* Classify all headlines into 10 topics (10 is an arbitrary number) 
* Extract words and associated probabilities that defined the topics 

In [39]:
# id2word: mapping from words to ids 
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=word_dict, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.010*"royal" + 0.010*"elect" + 0.008*"commiss" + 0.007*"financ" + 0.007*"scott" + 0.007*"lockdown" + 0.007*"david" + 0.006*"explain" + 0.006*"liber" + 0.006*"feder"
Topic: 1 Word: 0.021*"man" + 0.018*"polic" + 0.015*"charg" + 0.013*"murder" + 0.012*"woman" + 0.011*"crash" + 0.010*"court" + 0.009*"alleg" + 0.009*"death" + 0.009*"car"
Topic: 2 Word: 0.016*"coast" + 0.016*"north" + 0.016*"south" + 0.015*"interview" + 0.011*"gold" + 0.010*"west" + 0.008*"queensland" + 0.007*"korea" + 0.007*"australia" + 0.006*"daniel"
Topic: 3 Word: 0.016*"donald" + 0.014*"market" + 0.008*"coronavirus" + 0.008*"friday" + 0.008*"rise" + 0.008*"monday" + 0.008*"price" + 0.007*"share" + 0.007*"christma" + 0.007*"australian"
Topic: 4 Word: 0.009*"afl" + 0.009*"australia" + 0.008*"final" + 0.008*"win" + 0.007*"nrl" + 0.007*"day" + 0.006*"live" + 0.006*"beat" + 0.006*"open" + 0.006*"zealand"
Topic: 5 Word: 0.011*"border" + 0.009*"morrison" + 0.008*"stori" + 0.008*"updat" + 0.007*"coronavirus" + 0

# Test the LDA model on an unseen headline 

* Print the probability of this unseen headline belonging to this 10 topics and 5 words that defined those topics

In [41]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = word_dict.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Score: 0.45529210567474365	 Topic: 0.016*"donald" + 0.014*"market" + 0.008*"coronavirus" + 0.008*"friday" + 0.008*"rise"
Score: 0.24213731288909912	 Topic: 0.023*"news" + 0.014*"abc" + 0.012*"rural" + 0.009*"climat" + 0.009*"thursday"
Score: 0.1858055591583252	 Topic: 0.021*"man" + 0.018*"polic" + 0.015*"charg" + 0.013*"murder" + 0.012*"woman"
Score: 0.01668430119752884	 Topic: 0.025*"coronavirus" + 0.015*"covid" + 0.009*"govern" + 0.009*"health" + 0.008*"countri"
Score: 0.016681842505931854	 Topic: 0.011*"border" + 0.009*"morrison" + 0.008*"stori" + 0.008*"updat" + 0.007*"coronavirus"
Score: 0.01668153516948223	 Topic: 0.010*"royal" + 0.010*"elect" + 0.008*"commiss" + 0.007*"financ" + 0.007*"scott"
Score: 0.016680486500263214	 Topic: 0.035*"trump" + 0.016*"drum" + 0.010*"tuesday" + 0.009*"michael" + 0.007*"coal"
Score: 0.01667928136885166	 Topic: 0.016*"coast" + 0.016*"north" + 0.016*"south" + 0.015*"interview" + 0.011*"gold"
Score: 0.01667889393866062	 Topic: 0.010*"bushfir" + 0.009*