# Topic Modeling

In [1]:
from requests import get
import re

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import preprocessing

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

import string
import time
from IPython.display import clear_output
import random
import pyLDAvis.lda_model

Importing some utility functions:

In [2]:
import lda_functions

Re-doing the pre-processing steps from the prior class:

In [3]:
# Load the news sample straight from the course repository.
articles = pd.read_csv('https://github.com/Neilblund/APAN/raw/main/news_sample.csv')

# One HTML anchor per article, built from the url and the (not yet stripped) headline.
articles['hyperlink'] = [
    f'<a href={url}>{headline}</a>'
    for url, headline in zip(articles['url'], articles['headline'])
]

# Trim leading/trailing whitespace from the headlines, then preview the frame.
articles['headline'] = articles['headline'].str.strip()
articles.head()

Unnamed: 0,url,headline,text,date,source,year(date),hyperlink
0,https://www.cnn.com/2020/06/11/politics/donald...,Trump is getting his reopening even as the vir...,\n President Donald Trump is getting what...,2020-06-11,CNN,2020,<a href=https://www.cnn.com/2020/06/11/politic...
1,https://www.cnn.com/2020/05/07/politics/state-...,Supply shortages remain a top concern as state...,"\n Since mid-March, Washington state, one...",2020-05-07,CNN,2020,<a href=https://www.cnn.com/2020/05/07/politic...
2,https://www.cnn.com/2020/03/19/politics/mcconn...,Exclusive: McConnell defends crafting $1 trill...,\n Senate Majority Leader Mitch McConnell...,2020-03-19,CNN,2020,<a href=https://www.cnn.com/2020/03/19/politic...
3,https://www.cnn.com/2020/06/18/politics/new-yo...,New York City passes rent freeze for stabilize...,\n A rent freeze for rent-stabilized apar...,2020-06-18,CNN,2020,<a href=https://www.cnn.com/2020/06/18/politic...
4,https://www.cnn.com/2020/06/11/politics/senate...,GOP-led panel moves to remove Confederate name...,\n A Senate plan to remove names of Confe...,2020-06-11,CNN,2020,<a href=https://www.cnn.com/2020/06/11/politic...


In [4]:

# lower-cased article text on a fresh 0..n-1 index
text = articles.text.str.lower().reset_index(drop=True)
# tokenizer that splits text into runs of word characters
tokenizer = RegexpTokenizer(r'\w+')
# word stemming
stemmer = SnowballStemmer("english")
# English stop words, passed through the same tokenizer as the documents and
# reduced to their first piece (e.g. "don't" -> "don"). They are tokenized,
# NOT stemmed, because `tokenize` filters tokens before stemming them.
eng_stopwords = [tokenizer.tokenize(s)[0] for s in  stopwords.words('english')]
# Set copy for constant-time membership tests inside `tokenize`.
# `eng_stopwords` itself stays a list because it is also handed to
# CountVectorizer(stop_words=...) further down.
_stopword_set = frozenset(eng_stopwords)

def tokenize(text):
    """Split `text` into word tokens, drop stop words, and stem what is left.

    Parameters
    ----------
    text : str
        A single (already lower-cased) document.

    Returns
    -------
    list of str
        Stemmed tokens in document order, with stop words removed.
    """
    tokens = tokenizer.tokenize(text)
    return [stemmer.stem(token) for token in tokens if token not in _stopword_set]

vectorizer = CountVectorizer(analyzer="word",        # features are built from word tokens
                             tokenizer=tokenize,     # custom tokenizer: split, drop stop words, stem
                             token_pattern=None,     # not used when a tokenizer is supplied
                             ngram_range=(1, 1),     # unigrams only for now (n-gram lengths start at 1)
                             strip_accents='unicode',
                             max_df=0.1,             # float => proportion: drop terms found in more than 10% of documents
                             min_df=.0025            # float => proportion: drop terms found in fewer than 0.25% of documents
                            )

# document-term count matrix and the term belonging to each column
bag_of_words = vectorizer.fit_transform(text)
features = vectorizer.get_feature_names_out()



## Fine Tuning

Interpreting a topic model can feel disconcertingly subjective: we chose a number of topics arbitrarily, and then we interpreted and labeled the output based on an equally subjective interpretation of our topic-word distributions. To some extent, we just have to live with some uncertainty, but we should at least be aware of how we can make adjustments to a model and what we can do to improve it if needed.



### Adjusting the number of topics

What constitutes the "correct" number of topics for LDA is often a matter of interpretation. In general, setting a higher number of topics (50-200) may allow you to capture more nuance, while setting a lower number (15-40) can make the output easier to interpret. If you find that you have a lot of topics that don't seem to hold together, it may be a sign that you need to adjust the number and see how things look.

The one thing you can be certain about is that a model with more topics will be noticeably slower to fit than a model with just a few.

In [None]:
# A deliberately small model: only three topics.
few_topics = LatentDirichletAllocation(random_state=1231, n_components=3)
result = few_topics.fit_transform(bag_of_words)

# Most heavily weighted terms for each of the three topics.
topterms = lda_functions.get_top_words(few_topics, features)

### Adjusting our priors

You might notice that `LatentDirichletAllocation` also accepts `topic_word_prior` and `doc_topic_prior` arguments. What do these do? 

- the `doc_topic_prior` parameter controls the "smoothness" of our document-topic distributions. A higher number will cause the document-topic distributions to be more evenly distributed across all of our topics. 
- the `topic_word_prior` parameter will cause the topic-word distributions to have a more even distribution across each term.

Anecdotally: the topic word priors are usually fine with their defaults, but you might find that adjusting the `doc_topic_prior` improves your results. 

Here's an example using a very low value for the `doc_topic_prior` (note that I'm only using 200 documents here to make things run quickly)


In [None]:
# Ten topics with a near-zero document-topic prior.
k = 10
lda_low_doc_topic_prior = LatentDirichletAllocation(
    doc_topic_prior=1e-6,
    n_components=k,
    random_state=123,
)

# Train on the first 200 rows only, to keep the example quick.
doctopic_low = lda_low_doc_topic_prior.fit_transform(bag_of_words[0:200])

In [None]:
# Topic weights for the first five documents under the low doc_topic_prior model.
pd.DataFrame(doctopic_low).head().style

Compare the output to this model with a very high value for the `doc_topic_prior`:

In [None]:
# Same setup as the low-prior model, but with doc_topic_prior raised to 1.
k = 10
lda_high_doc_topic_prior = LatentDirichletAllocation(
    doc_topic_prior=1,
    n_components=k,
    random_state=123,
)

# Again restricted to the first 200 rows.
doctopic_high = lda_high_doc_topic_prior.fit_transform(bag_of_words[0:200])

In [None]:
# Topic weights for the first five documents under the high doc_topic_prior model.
pd.DataFrame(doctopic_high).head().style

In a sense, the values set on the hyper-parameters reflect some pre-existing assumptions about the data: if you think that the documents in your corpus are going to focus on just one or two topics, then you should set `doc_topic_prior` to a low number. If you think each document will cover a lot of issues, then you might want to set this higher.




### How do we know if we need to make adjustments?

There are some "objective" metrics that we can use (see the grid search example at the end of this notebook). 
One method that has been used to assess topic quality is called a "word intrusion" test. This works by selecting the top key words from each topic, then adding a randomly selected word to each list and asking some humans if they can find the term that doesn't belong. If a model is generating coherent topics, you would expect people to be able to spot intruders fairly easily. 

You can try out a toy version of this with the function below:


In [None]:
def topic_intruder(lda, features, n_terms=5):
    """Run an interactive word-intrusion quiz, one round per topic.

    Each round shows the `n_terms` highest-weighted terms of a topic plus one
    randomly drawn "intruder" term, in shuffled order, and reads the number of
    the term the user believes does not belong.

    Parameters
    ----------
    lda : fitted topic model
        Must expose `components_`, with one row of term weights per topic.
    features : sequence of str
        Terms, indexed like the columns of `lda.components_`.
    n_terms : int, default 5
        How many top terms to show per topic (the intruder is shown in addition).

    Returns
    -------
    list of int
        One entry per answered round: 1 if the intruder was picked, 0 otherwise.
        Rounds with an unusable answer are skipped; entering "q" stops the quiz
        and returns the scores collected so far.
    """
    score = []
    for topic_weights in lda.components_:
        # Indices of the highest-weighted terms, strongest first.
        top_idx = np.argsort(topic_weights)[::-1][:n_terms]
        keywords = [features[i] for i in top_idx]

        # Draw the intruder from terms that are NOT already on display, so the
        # round always has exactly one wrong word. Fall back to any term if the
        # top terms already cover the whole vocabulary.
        shown = {int(i) for i in top_idx}
        outsiders = [i for i in range(len(features)) if i not in shown]
        intruder_idx = random.choice(outsiders or range(len(features)))

        # The intruder is appended last; remember that position for scoring
        # instead of assuming it is always 5.
        intruder_pos = len(keywords)
        keywords.append(features[intruder_idx])

        # order[slot] is the keyword shown at display position `slot`.
        order = random.sample(range(len(keywords)), len(keywords))
        keys = '\n'.join(str(slot) + '.' + keywords[pos] for slot, pos in enumerate(order))
        print('Identify the word that does not belong\n' + keys, end='')

        a = input()
        if a == 'q':
            break
        try:
            pick = int(a)
        except ValueError:
            pick = None
        clear_output(wait=False)
        # Only the displayed numbers are valid answers; negative numbers would
        # otherwise be accepted silently through negative indexing.
        if pick is None or not 0 <= pick < len(order):
            print('Answer not recognised - skipping this topic.')
            continue
        score.append(int(order[pick] == intruder_pos))
    return score



Here's an example when applying it to a model with bad parameters: not enough topics, unreasonably high values on the priors, and using only 200 documents for training: 

In [None]:
# Intentionally poor settings: three topics and both priors set to 1.
k = 3
bad_model = LatentDirichletAllocation(
    topic_word_prior=1,
    doc_topic_prior=1,
    n_components=k,
    random_state=123,
)

# Train on the first 200 rows of bag_of_words only.
bad_fitted = bad_model.fit_transform(bag_of_words[0:200])

In [None]:
# Run the intruder quiz on the deliberately bad model. Enter "q" to quit early.
guesses=topic_intruder(bad_model, features)
# share of answered rounds in which the intruder was identified
# (np.mean of an empty list is nan, e.g. if you quit before answering anything)
np.mean(guesses)

## Improving the pre-processing

One final area where we can potentially make improvements is by changing our pre-processing steps. We'll explore a couple of options below.

### N-grams - Adding context by creating N-grams
Obviously, reducing a document to a bag of words means losing much of its meaning - we put words in certain orders, and group words together in phrases and sentences, precisely to give them more meaning. If you follow the processing steps we've gone through so far, splitting your document into individual words, you'll end up with terms like "north" and "carolina" being handled as totally separate terms when they probably should be counted together. 

One way to address this is to break down each document similarly, but rather than treating each word as an individual unit, treat each group of 2 words, or 3 words, or n words, as a unit. We call this a "bag of n-grams," where n is the number of words in each chunk. Then you can analyze which groups of words commonly occur together (in a fixed order).

We can make this change by adjusting the `ngram_range` argument for the `CountVectorizer`

In [None]:
vectorizer = CountVectorizer(analyzer="word",       # features are built from word tokens
                            tokenizer=tokenize,     # function to create tokens
                            token_pattern=None,     # not used when a tokenizer is supplied
                            ngram_range=(1, 2),     # keep single words AND add two-word phrases (bigrams)
                            strip_accents='unicode',
                            max_df=0.1,             # float => proportion: drop terms found in more than 10% of documents
                            min_df=.0025            # float => proportion: drop terms found in fewer than 0.25% of documents
                            )

# Creating the bag of n-grams (unigram and bigram counts per document)
bag_of_ngrams = vectorizer.fit_transform(text)
ngram_features = vectorizer.get_feature_names_out()




In [None]:
# notice this is much larger!
# (the second number is the feature count; compare it with bag_of_words.shape)

bag_of_ngrams.shape

In [None]:
# Train a 15-topic LDA on the n-gram counts and keep the document-topic weights.
bigram_lda = LatentDirichletAllocation(random_state=999, n_components=15)
doctopic = bigram_lda.fit_transform(bag_of_ngrams)

<b style="color:red;"> Question 1: use the `get_top_words` function to find the top terms from the `bigram_lda` model. See if you spot any of the bi-grams in the list of most likely terms </b>

Or we can use the LDA visualization tools:

In [None]:
# Build the interactive pyLDAvis panel for the bigram model.
panel = pyLDAvis.lda_model.prepare(
    bigram_lda,
    bag_of_ngrams,
    vectorizer,
    mds='tsne',
    sort_topics=False,
    n_jobs=-1,
)
word_info = panel.topic_info

# Write the panel to a standalone HTML file.
pyLDAvis.save_html(panel, 'bigram_lda.html')

### Lemmatization

Another possibility is to use lemmatization instead of stemming to trim our terms. Recall that lemmatization, unlike word-stemming, attempts to identify the actual dictionary-based root word of a term rather than haphazardly lopping off the word endings. It's much slower, but it can improve both accuracy and readability of text models. We could simply replace our original tokenizer function with a function that splits each word and lemmatizes it. However, since this is slow, I've run the code ahead of time and stored the results in `processed_articles.csv`, but you could run the code below to replicate the steps.

In [None]:
# functions for lemmatization with part of speech tagging. 
# A single WordNet lemmatizer instance, used by lemmatize_sentence below.
lemmatizer = WordNetLemmatizer()

# Translate an NLTK part-of-speech tag into the tag format used by WordNet.
def nltk_pos_tagger(nltk_tag):
    """Return the WordNet POS constant matching `nltk_tag`, or None.

    The mapping is decided by the tag's leading letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag gives None.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return pos
    return None

def lemmatize_sentence(sentence):
    """Lemmatize every token of `sentence` and join the results with spaces.

    The text is word-tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet part of speech (via `nltk_pos_tagger`) are lemmatized with
    that part of speech; all other tokens are passed through unchanged.
    """
    def _lemma(token, treebank_tag):
        pos = nltk_pos_tagger(treebank_tag)
        return token if pos is None else lemmatizer.lemmatize(token, pos)

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    return ' '.join(_lemma(token, tag) for token, tag in tagged_tokens)


You would uncomment the code in these lines to actually do the lemmatization. 

In [None]:
# then just run this to add to the existing column:
# articles['lemmatized'] =  [lemmatize_sentence(i) for i in articles.text]
# and then save your results: 
# articles.to_csv('processed_articles.csv')


In [None]:
# Load the pre-computed lemmatized text.
lemmatized_articles = pd.read_csv("processed_articles.csv")

# Attach only the `lemmatized` column, matched on url. Merging the whole file
# would give every column that exists in both frames an _x/_y suffix (the code
# that writes this file saves the full articles frame), which would break later
# references such as `articles.hyperlink`. Dropping any existing `lemmatized`
# column first keeps this cell safe to re-run.
articles = (
    articles
    .drop(columns='lemmatized', errors='ignore')
    .merge(lemmatized_articles[['url', 'lemmatized']], on='url')
)

In [None]:
# take a look at the lemmatized results:
# the first 300 characters of the lemmatized text for the row labelled 1
articles.lemmatized[1][:300]

Now we'll apply the `CountVectorizer` function again and train a model using unigrams and bigrams on the lemmatized data:


In [None]:
vectorizer = CountVectorizer(analyzer="word",       # features are built from word tokens
                            ngram_range=(1, 2),     # unigrams and bigrams (0 is not a meaningful n-gram length)
                            strip_accents='unicode',
                            stop_words=eng_stopwords,
                            max_df=0.1,             # float => proportion: drop terms found in more than 10% of documents
                            min_df=.0025            # float => proportion: drop terms found in fewer than 0.25% of documents
                            )

# Creating bag of words.
# Fit on the `lemmatized` column of `articles` (the merged frame) so that the
# rows of bag_of_lemma line up with `articles.hyperlink`, which is used to label
# documents further down.
bag_of_lemma = vectorizer.fit_transform(articles.lemmatized)
lemma_features = vectorizer.get_feature_names_out()

In [None]:
# dimensions of the lemmatized document-term matrix
bag_of_lemma.shape

In [None]:
# Train a 15-topic LDA on the lemmatized counts and keep the document-topic weights.
lemma_lda = LatentDirichletAllocation(random_state=999, n_components=15)
doctopic = lemma_lda.fit_transform(bag_of_lemma)

In [None]:
# top terms per topic for the lemmatized model
top_lemma = lda_functions.get_top_words(lemma_lda, lemma_features)

In [None]:
# two top documents per topic, labelled with the article hyperlinks
# NOTE(review): assumes the rows of `doctopic` are in the same order as `articles` - confirm
lda_functions.get_top_docs(doctopic, n_docs=2, docnames=articles.hyperlink).style

In [None]:
# Build the interactive pyLDAvis panel for the lemmatized model.
panel = pyLDAvis.lda_model.prepare(
    lemma_lda,
    bag_of_lemma,
    vectorizer,
    mds='tsne',
    sort_topics=False,
    n_jobs=-1,
)
word_info = panel.topic_info

# Write the panel to a standalone HTML file.
pyLDAvis.save_html(panel, 'lemma_lda.html')

### TF-IDF - Weighting terms based on frequency

One additional step we can add in cleaning and processing our text data is **Term Frequency-Inverse Document Frequency (TF-IDF)**. TF-IDF is based on the idea that the words (or terms) that are most related to a certain topic will occur frequently in documents on that topic, and infrequently in unrelated documents.  TF-IDF re-weights words so that we emphasize words that are unique to a document and suppress words that are common throughout the corpus by inversely weighting terms based on their frequency within the document and across the corpus.

Recall that our data might look something like this:

|document ID|about|america|author|ask|...|
|-|-|-|-|-|-|
|1|0|0|0|0|...|
|2|0|1|0|0|...|
|3|0|0|3|0|...|
|4|1|0|0|0|...|
|5|0|0|0|2|...|
|...|...|...|...|...|...|

The values that are in the cells are the term frequencies. TF-IDF takes those values and re-weights them by the inverse of how often they occur in other documents. So, for example, if the term occurs in many other documents, the term frequency is multiplied by a weight close to 1 (since the fraction of documents the term occurs in is close to 1). However, if the term occurs only in a smaller fraction of documents (such as 1/10th of documents), then the term frequency is multiplied by a much larger number (since we use the inverse document frequency).

Let's look at how to use TF-IDF:


In [None]:
# Use TfidfTransformer to re-weight bag of words 
transformer = TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=True)
tfidf = transformer.fit_transform(bag_of_lemma)

# Fitting LDA model.
# random_state is fixed, as for the other models in this notebook, so the
# topics are the same on every run.
tf_lda = LatentDirichletAllocation(n_components=15,
                                   learning_method='online',
                                   random_state=999)
doctopic = tf_lda.fit_transform(tfidf)

<b style="color:red;"> Question 2: use the `get_top_words` function to find the top terms from the `tf_lda` model. What differences, if any, do you notice? </b>

# Example of a grid search

As I mentioned above: there's really no generally agreed-upon objective method for assessing the quality of a topic model, but there are some commonly used metrics. 

If you're interested in trying to optimize a model (and you have an hour or so to burn) you can try using a grid-search to run multiple models and compare them in terms of perplexity (which measures how well the model predicts words) and coherence (which measures how often words in the same topic appear together in documents). 

Scikit-learn doesn't have great support for either of these metrics, but you can compute them with the `tmtoolkit` module.


In [None]:
# install if you don't already have it
# (-U also upgrades an existing install; no version is pinned here)
%pip install -U "tmtoolkit[recommended]"


In [None]:
from tmtoolkit.topicmod import tm_sklearn
from tmtoolkit.topicmod.visualize import plot_eval_results
from tmtoolkit.topicmod.evaluate import results_by_parameter


# Settings shared by every model in the search (a fixed seed keeps fits reproducible).
const_params = dict(random_state=123)

# Settings that differ from model to model. Every extra combination adds
# training time. Topic counts run from 5 to 50 in steps of 5, and each model's
# doc_topic_prior is set to 1 / (number of topics).
var_params = list(
    map(
        lambda n_topics: {'n_components': n_topics, 'doc_topic_prior': 1 / n_topics},
        range(5, 55, 5),
    )
)
var_params


In [None]:
# train multiple models: one per entry of var_params, each combined with const_params.
# return_models=True keeps every fitted model in the results so one can be
# pulled out and reused afterwards.
out = tm_sklearn.evaluate_topic_models(bag_of_words, 
                                       varying_parameters=var_params, 
                                       constant_parameters=const_params,
                                       return_models=True)

In [None]:
# view results, organised by the number of topics (n_components)
eval_results_by_topics = results_by_parameter(out, 'n_components')

eval_results_by_topics

In [None]:
# plot the evaluation results for each number of topics.
# Look for lower values of perplexity and higher values of coherence. 
plot_eval_results(eval_results_by_topics)


In [None]:
# The k = 30 model (the sixth setting in var_params) looks like the best
# performer, so take it out of the results and use it for other analyses.
# Each entry of `out` is a (parameters, results) pair, so select by the number
# of topics rather than by position in the list.
# NOTE(review): update the 30 below if your own plot points to a different k.
best_model = next(results['model']
                  for params, results in out
                  if params['n_components'] == 30)

In [None]:
# Top terms per topic for the selected model. The second argument is the array
# of feature names, as in the other get_top_words calls. best_model was trained
# on bag_of_words, so the matching names are `features`.
lda_functions.get_top_words(best_model, features)

In [None]:
# Document-topic weights for the selected model, computed on the matrix it was
# trained on.
best_doctopic = best_model.transform(bag_of_words)

# Five top documents per topic, labelled with the article hyperlinks.
# NOTE(review): bag_of_words was built before `articles` was merged with the
# lemmatized data - confirm its rows still line up with `articles`.
lda_functions.get_top_docs(best_doctopic, n_docs=5, docnames=list(articles.hyperlink)).style