## Topic Modeling with Gensim
Topic modeling is a technique for extracting the hidden topics from large volumes of text. Latent Dirichlet Allocation (LDA) is a popular topic-modeling algorithm with an excellent implementation in Python's Gensim package.

ref: https://www.machinelearningplus.com/nlp/ 

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.corpus import wordnet


import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np
from pprint import pprint

# Seed NumPy's global random number generator so NumPy-based randomness is
# repeatable from run to run.
np.random.seed(2018)

In [15]:
# Define helper functions for stopword removal, POS-tag mapping, lemmatization and tokenization
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is passed through ``str()`` and ``simple_preprocess`` before
    filtering, and tokens found in the module-level ``stop_words`` collection
    are discarded.

    Args:
        texts: Iterable of documents.

    Returns:
        A list containing one list of kept tokens per input document.
    """
    # Build the lookup set once per call: testing membership against the
    # ``stop_words`` list directly is a linear scan for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set]
            for doc in texts]

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the upper-cased
    first character of the tag picks the constant: J -> adjective, V -> verb,
    R -> adverb. N, and any other tag, yields the noun constant.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag fall back to noun.
    return wordnet.NOUN

def lemmatization(texts):
    """Lemmatize every token of every document in ``texts``.

    A single ``WordNetLemmatizer`` is created per call, and each token is
    lemmatized with the part of speech reported by ``get_wordnet_pos``.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list of lemma lists, in the same order as the input documents.
    """
    wnl = WordNetLemmatizer()

    def lemmatize_tokens(tokens):
        # One lemma per token, looked up with that token's own POS.
        return [wnl.lemmatize(tok, get_wordnet_pos(tok)) for tok in tokens]

    return [lemmatize_tokens(sent) for sent in texts]

def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Every item is converted with ``str()`` and tokenized by
    ``gensim.utils.simple_preprocess``.
    """
    # deacc=True strips accent marks from the tokens.
    tokenized = (gensim.utils.simple_preprocess(str(text), deacc=True) for text in sentences)
    yield from tokenized

In [16]:
# Fetch the NLTK resources this notebook relies on: the perceptron POS tagger,
# the WordNet corpus and the stop-word lists.
# The return value of the last call is what the cell echoes as its output.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# NLTK's English stop words plus a few extra tokens to filter out.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [18]:
# Load the headlines CSV, skipping malformed rows (with a warning) rather than
# failing. `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement. Older pandas versions reject the new
# keyword with a TypeError, in which case the legacy keyword is used instead.
try:
    headers = pd.read_csv('abcnews-2017.csv', on_bad_lines='warn')
except TypeError:
    headers = pd.read_csv('abcnews-2017.csv', error_bad_lines=False)

In [19]:
# Preview the first five rows of the loaded frame.
headers.head()

Unnamed: 0,headline_text,date
0,1986 queensland cabinet documents released,20170101
1,actor william cristopher mash chaplain dead at 84,20170101
2,adelaide hip hop artist dreams becoming influe...,20170101
3,adelaide's new year's fireworks,20170101
4,adelaides new years fireworks,20170101


### Data Preprocessing

#### Lemmatize example

In [20]:
# Pull the headline column out of the frame as a plain Python list.
data = headers['headline_text'].values.tolist()
pprint(data[0:1])

['1986 queensland cabinet documents released']


## Tokenize words

In [21]:
# Tokenize every headline; the generator is materialized so it can be sliced and reused.
data_words = [tokens for tokens in sent_to_words(data)]
pprint(data_words[0:2])

[['queensland', 'cabinet', 'documents', 'released'],
 ['actor', 'william', 'cristopher', 'mash', 'chaplain', 'dead', 'at']]


## Remove Stopwords

In [22]:
# Filter the stop words out of the tokenized headlines.
data_words_nostops = remove_stopwords(texts=data_words)
pprint(data_words_nostops[0:2])

[['queensland', 'cabinet', 'documents', 'released'],
 ['actor', 'william', 'cristopher', 'mash', 'chaplain', 'dead']]


In [23]:
# Replace every remaining token with its lemma.
data_lemmatized = lemmatization(texts=data_words_nostops)
pprint(data_lemmatized[0:2])

[['queensland', 'cabinet', 'document', 'release'],
 ['actor', 'william', 'cristopher', 'mash', 'chaplain', 'dead']]


## Create the Dictionary and Corpus for Topic Modeling

In [24]:
# Dictionary assigning an integer id to every distinct lemma.
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts being modeled.
texts = data_lemmatized

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document.
pprint(corpus[0:1])

[[(0, 1), (1, 1), (2, 1), (3, 1)]]


In [25]:
# Indexing the dictionary with an id returns the corresponding word. The value
# is not displayed here because a cell only echoes its last expression.
id2word[0]

# Human-readable view of the first document: (word, count) instead of (id, count).
[[(id2word[token_id], count) for token_id, count in doc] for doc in corpus[:1]]

[[('cabinet', 1), ('document', 1), ('queensland', 1), ('release', 1)]]

## Building the Topic Model

In [26]:
# Train a 10-topic LDA model on the bag-of-words corpus.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,             # full passes over the corpus
    chunksize=100,         # documents per training chunk
    update_every=1,
    alpha='auto',          # let the model learn the document-topic prior
    per_word_topics=True,
    random_state=100,      # fixed seed for the model itself
)

## List the topics in LDA model

In [27]:
# Apply the trained model to the whole corpus.
doc_lda = lda_model[corpus]

# Show the weighted keywords that make up each of the 10 topics.
pprint(lda_model.print_topics())

[(0,
  '0.058*"australia" + 0.034*"charge" + 0.029*"nsw" + 0.020*"report" + '
  '0.018*"murder" + 0.016*"south" + 0.015*"bank" + 0.015*"test" + '
  '0.013*"scorecentre" + 0.013*"canberra"'),
 (1,
  '0.050*"say" + 0.050*"police" + 0.030*"man" + 0.022*"take" + 0.019*"cup" + '
  '0.017*"queensland" + 0.017*"show" + 0.014*"rise" + 0.013*"car" + '
  '0.013*"two"'),
 (2,
  '0.034*"new" + 0.025*"woman" + 0.019*"year" + 0.017*"world" + 0.016*"plan" + '
  '0.016*"state" + 0.013*"first" + 0.011*"house" + 0.011*"dy" + '
  '0.010*"victoria"'),
 (3,
  '0.027*"fire" + 0.022*"u" + 0.018*"may" + 0.016*"afl" + 0.016*"found" + '
  '0.015*"help" + 0.013*"high" + 0.013*"melbourne" + 0.013*"find" + '
  '0.012*"uk"'),
 (4,
  '0.043*"sydney" + 0.039*"attack" + 0.034*"could" + 0.033*"court" + '
  '0.023*"change" + 0.021*"hospital" + 0.021*"go" + 0.018*"dog" + '
  '0.014*"record" + 0.012*"country"'),
 (5,
  '0.025*"fear" + 0.025*"news" + 0.025*"end" + 0.019*"put" + 0.019*"concern" + '
  '0.018*"war" + 0.017*"c

## Coherence Score

In [30]:
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself, so a higher value (closer to zero) indicates a better fit.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Topic coherence (c_v measure), evaluated against the lemmatized texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -10.096813106869256

Coherence Score:  0.4989343331617513


## Visualize the topics-keywords

In [33]:
# Enable inline rendering, build the pyLDAvis view of the model and display it.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
