In [0]:
import pandas as pd
import numpy as np

In [0]:
# Load the ABC news headlines. Malformed rows are skipped rather than raising.
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv("abcnews-date-text.csv", on_bad_lines="skip")

In [0]:
# Report the frame's shape, then preview its first rows.
data_shape = data.shape
print("Shape of the Data", data_shape)
data.head()

Here we are going to apply LDA to a set of documents and split them into topics. Let’s get started!

In [0]:
# Keep only the headline column. The explicit .copy() makes data_text an independent
# frame, so adding a column below does not trigger pandas' SettingWithCopyWarning
# and cannot depend on whether the selection was a view of `data`.
data_text = data[['headline_text']].copy()
# Store each row's index label as a regular column so a document can be looked up by it.
data_text['index'] = data_text.index
documents = data_text


**Now preprocessing:**

   Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
  Words that have 3 or fewer characters are removed (only tokens longer than 3 characters are kept).
    All stopwords are removed.
    Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
    Words are stemmed — words are reduced to their root form.



In [0]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
import numpy as np
# Seed NumPy's global random number generator with a fixed value (2018).
np.random.seed(2018)

import nltk
# Fetch the WordNet corpus; the WordNetLemmatizer used in lemmatize_stemming needs it.
nltk.download('wordnet')
# Single English Snowball stemmer instance, referenced by lemmatize_stemming.
stemmer = SnowballStemmer('english')

In [0]:
# Functions for the lemmatization and stemming steps

def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Args:
        text: The token to normalise.

    Returns:
        The output of the module-level ``stemmer`` applied to the verb lemma
        of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn a raw text into a list of normalised tokens.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that are
    gensim stopwords or that have 3 or fewer characters are dropped, and each
    remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: The raw document text.

    Returns:
        A list of lemmatized and stemmed tokens, in document order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if tok not in stopwords and len(tok) > 3
    ]

In [0]:
# Pull the headline text of the row whose stored index is 79: take the first
# matching row and the first value in that row.
matching_rows = documents[documents['index'] == 79]
sample_doc = matching_rows.values[0][0]

# Raw whitespace-split words, shown for comparison with the preprocessed tokens.
words = sample_doc.split(" ")

print(words)
print("Words after preprocessing")
print(preprocess(sample_doc))

In [0]:
# Apply preprocess to every headline; the result is a Series whose values are token lists.
processed_data=documents["headline_text"].map(preprocess)
# Preview the first nine processed headlines.
processed_data[:9]

**Bag of words on the Dataset**

Create a dictionary from ‘processed_data’ that assigns an integer id to each unique word and records how often words appear in the training set.

In [0]:
# Build the vocabulary from the tokenized headlines.
dictionary = gensim.corpora.Dictionary(processed_data)

# Peek at the first 11 entries of the dictionary.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break
  

Gensim filter_extremes

Filter out tokens that appear in

    fewer than 15 documents (an absolute number) or
    more than 50% of the documents (no_above=0.5 is a fraction of the total corpus size, not an absolute number).
    After the above two steps, keep only the 100000 most frequent tokens.

In [0]:
# Prune the vocabulary (the call's return value is unused; `dictionary` itself is
# what later cells rely on): drop tokens that appear in fewer than 15 documents or in
# more than 50% of documents, then keep at most the 100000 most frequent of the rest.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

Gensim doc2bow

For each document we create a bag-of-words representation: a list of (word id, count) pairs reporting which
dictionary words appear in it and how many times each one appears. Save this to ‘bow_corpus’, then check the document we selected earlier.

In [0]:
# Convert every tokenized headline into its bag-of-words vector.
bow_corpus = list(map(dictionary.doc2bow, processed_data))
# Show the vector of the sample document (position 79).
bow_corpus[79]

In [0]:
bow_doc_79 = bow_corpus[79]

# Each entry pairs a dictionary id with its count in the document;
# look the id up in the dictionary to print the word itself.
for token_id, occurrences in bow_doc_79:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                               dictionary[token_id],
                                               occurrences))

TF-IDF

Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [0]:
from pprint import pprint

from gensim import corpora, models

# Fit the TF-IDF model on the bag-of-words corpus, then apply it to the whole corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Preview the TF-IDF scores of the first document only.
for first_doc in corpus_tfidf:
    pprint(first_doc)
    break

Running LDA using Bag of Words

Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [0]:
# Train a 10-topic LDA model on the raw bag-of-words counts.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
)

For each topic, we will explore the words occurring in that topic and their relative weights.

In [0]:
# Print each topic id with its formatted word/weight description.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

Can you distinguish different topics using the words in each topic and their corresponding weights?

Running LDA using TF-IDF

In [0]:
# Train a second 10-topic LDA model, this time on the TF-IDF weighted corpus.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
)

# Print each topic id with its formatted word/weight description.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))

Performance evaluation by classifying sample document using LDA Bag of Words model

We will check where our test document would be classified.

In [0]:
# Display the preprocessed tokens of the sample document (label 79) used as the test document.
processed_data[79]

In [0]:
# Rank the topics assigned to the sample document from highest to lowest score.
ranked_topics = sorted(lda_model[bow_corpus[79]], key=lambda pair: pair[1], reverse=True)
for topic_id, topic_score in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(topic_score, lda_model.print_topic(topic_id, 10)))

Performance evaluation by classifying sample document using LDA TF-IDF model.

In [0]:
# lda_model_tfidf was trained on TF-IDF weighted vectors (corpus_tfidf), so the query
# document must go through the same `tfidf` transformation before inference.
# Feeding it raw bag-of-words counts would score the document on a different
# scale from the one the model was fitted on.
for index, score in sorted(lda_model_tfidf[tfidf[bow_corpus[79]]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

In [0]:
# Classify a headline that was not part of the training data.
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
unseen_tokens = preprocess(unseen_document)
bow_vector = dictionary.doc2bow(unseen_tokens)

# Rank the inferred topics from highest to lowest score and show the top 5 words of each.
ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))