# Topic Modeling

In [None]:
# Versions used:
#
# python       3.9.7
# fasttext     0.9.2
# matplotlib   3.5.0
# numpy        1.20.3
# pandas       1.3.5
# bertopic     0.9.4
# gensim       4.1.2
# hdbscan      0.8.27
# scikit-learn 1.0.2
# tqdm         4.62.3
# umap-learn   0.5.2

import datetime as dt
import re
import warnings
from types import MethodType
from typing import List, Union

import fasttext.util
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests as req
from bertopic import BERTopic
from bertopic._utils import check_documents_type, check_is_fitted
from gensim.corpora import Dictionary
from gensim.models.fasttext import load_facebook_vectors
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from tqdm import tqdm
from umap import UMAP

from topic_model_diversity.diversity_metrics import irbo, word_embedding_irbo

## Dependencies

In [None]:
# Get stopwords list from Stanford CoreNLP
url = 'https://raw.githubusercontent.com/stanfordnlp/CoreNLP/main/data/edu/stanford/nlp/patterns/surface/stopwords.txt'
res = req.get(url, timeout=60)
# Fail loudly on an HTTP error so that an error page is never saved as the stopwords list
res.raise_for_status()

# Save stopwords list to a local TXT file
# (the context manager closes the file even if the write fails)
with open('./stopwords.txt', 'w', encoding='utf-8') as file:
    file.write(res.text)

# Download fastText model
# NOTE: this requires a lot of disk space (~12GB)
fasttext.util.download_model('en', if_exists='ignore')

## Loading the Data

In [None]:
# Read CSV file containing timestamps, docs and upvotes
posts_df = pd.read_csv('./rcollege_clean_20200101-20220101_praw.csv')

# For posts_df, save timestamps, docs and upvotes to separate lists
timestamps = posts_df.created_utc.to_list()
docs = posts_df.document.to_list()
upvotes = posts_df.score.to_list()

# Order rows by date
# The index is parsed to datetimes (the created_utc column itself is left untouched) so that
# the label slices below select whole calendar days. With a plain string index, a value such
# as '2020-12-31 10:00:00' sorts after the label '2020-12-31' and would be left out of 2020.
posts_df = posts_df.set_index(pd.to_datetime(posts_df['created_utc']))
posts_df = posts_df.sort_index()

# Create a DataFrame for each year
# Where y1_df corresponds to 2020, and y2_df to 2021
y1_df = posts_df.loc['2020-01-01':'2020-12-31']
y2_df = posts_df.loc['2021-01-01':]

# For y1_df, save timestamps, docs and upvotes to separate lists
y1_timestamps = y1_df.created_utc.to_list()
y1_docs = y1_df.document.to_list()
y1_upvotes = y1_df.score.to_list()

# For y2_df, save timestamps, docs and upvotes to separate lists
y2_timestamps = y2_df.created_utc.to_list()
y2_docs = y2_df.document.to_list()
y2_upvotes = y2_df.score.to_list()

## Defining stop_words

In [None]:
# Define a stop_words list and populate it with initial values
# (word stems left over when contractions such as "aren't" are split at the apostrophe)
stop_words = ["aren", "couldn", "didn", "doesn",
              "don", "hadn", "hasn", "haven",
              "isn", "mustn", "shan", "shouldn",
              "wasn", "weren", "won", "wouldn"]

# Append the words from the previously saved TXT file to the stop_words list defined above.
# The context manager closes the file even on error; blank lines are skipped so that no
# empty string ends up in the stop word list.
with open('./stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words.extend(word for word in (line.strip() for line in file) if word)

## Defining UMAP, HDBSCAN and CountVectorizer

https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html

https://umap-learn.readthedocs.io/en/latest/parameters.html  
https://umap-learn.readthedocs.io/en/latest/reproducibility.html

https://hdbscan.readthedocs.io/en/latest/parameter_selection.html

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [None]:
# UMAP settings (library defaults for reference: n_neighbors=15, n_components=5)
umap_params = {
    'n_neighbors': 28,
    'n_components': 12,
    'metric': 'cosine',
    'min_dist': 0.0,
    'random_state': 42,
}
umap_model = UMAP(**umap_params)

# HDBSCAN settings; min_cluster_size plays the role of BERTopic's min_topic_size (default: 10)
hdbscan_params = {
    'min_cluster_size': 17,
    'metric': 'euclidean',
    'cluster_selection_method': 'eom',
    'prediction_data': True,
}
hdbscan_model = HDBSCAN(**hdbscan_params)

# CountVectorizer settings: default tokenizer, custom stop words, unigrams only
vectorizer_params = {
    'tokenizer': None,
    'stop_words': stop_words,
    'ngram_range': (1, 1),
}
vectorizer_model = CountVectorizer(**vectorizer_params)

## Topic Modeling with BERTopic

https://maartengr.github.io/BERTopic/api/bertopic.html

### Training the model

In [None]:
# Configure BERTopic with the UMAP, HDBSCAN and CountVectorizer models defined earlier
bertopic_params = {
    'language': 'english',
    'top_n_words': 10,
    'min_topic_size': 17,  # Default: 10
    'nr_topics': None,
    'verbose': True,
    'embedding_model': 'all-MiniLM-L6-v2',  # Default: all-MiniLM-L6-v2
    'umap_model': umap_model,
    'hdbscan_model': hdbscan_model,
    'vectorizer_model': vectorizer_model,
}
topic_model = BERTopic(**bertopic_params)

# Fit the model on the full set of documents; the fitted model is what topic_model refers to
topic_model = topic_model.fit(docs)

In [None]:
# Transform for docs: run the fitted model over all documents and keep
# the two values it returns (topic assignments and their probabilities)
topics, probs = topic_model.transform(docs)

### Extracting topics

In [None]:
# Transform for y1_docs: same fitted model, applied to the 2020 documents only
y1_topics, y1_probs = topic_model.transform(y1_docs)

In [None]:
# Transform for y2_docs: same fitted model, applied to the documents from 2021 onwards
y2_topics, y2_probs = topic_model.transform(y2_docs)

## Topic Analysis

In [None]:
# Display the topic summary table of the fitted model
topic_model.get_topic_info()

In [None]:
# Keep the topic summary table and write it to CSV
# (all columns and a header row are written; the DataFrame index is not)
freq = topic_model.get_topic_info()
freq.to_csv('./vis_freq_all.csv', index=False)

In [None]:
# Attach the representative documents of every topic to the topic summary table.
# NOTE: freq_reps is the same object as freq (no copy is made), so the new column
# is visible through freq as well.
freq_reps = freq

# Look the documents up by the topic id stored in each row, rather than assuming that the
# table holds exactly 161 topics listed in id order after a leading outlier row.
# The outlier topic (-1) keeps the -1 placeholder.
list_reps = [-1 if topic == -1 else topic_model.get_representative_docs(topic=topic)
             for topic in freq_reps["Topic"]]

freq_reps["Representative Docs"] = list_reps

# All columns and the header row are written; the DataFrame index is not
freq_reps.to_csv('./vis_freq_reps_all.csv', index=False)

In [None]:
# Reload the table saved with the representative documents and add each topic's words to it
freq_tops = pd.read_csv('./vis_freq_reps_all.csv')

# Look the words up by the topic id stored in each row, rather than assuming that the
# table holds exactly 161 topics listed in id order after a leading outlier row.
# The outlier topic (-1) keeps the -1 placeholder.
list_tops = [-1 if topic == -1 else topic_model.get_topic(topic)
             for topic in freq_tops["Topic"]]

freq_tops["Words"] = list_tops

# All columns and the header row are written; the DataFrame index is not
freq_tops.to_csv('./vis_freq_tops_all.csv', index=False)

In [None]:
# One row per document: assigned topic, timestamp, text and upvote count
topic_doc_rows = list(zip(topics, timestamps, docs, upvotes))
topic_docs_df = pd.DataFrame(topic_doc_rows, columns=["Topic", "Date", "Document", "Upvotes"])

# Persist the table (all columns, header row, no index column)
topic_docs_df.to_csv('./vis_topic_docs_all.csv', index=False)

In [None]:
# Index the per-document table by date and sort it chronologically
topic_docs_df = topic_docs_df.set_index(topic_docs_df["Date"])
topic_docs_df = topic_docs_df.sort_index()

# Topics that are exported to their own CSV file
exported_topic_ids = [37, 64, 100, 106, 123, 127, 139, 141, 156, 159, 160]

# Export the documents of each selected topic with a single loop instead of one
# copy-pasted stanza per topic; every file holds all columns, without the index
topic_docs_by_topic = {}
for topic_id in exported_topic_ids:
    topic_subset_df = topic_docs_df.loc[topic_docs_df["Topic"] == topic_id]
    topic_subset_df.to_csv(f'./vis_topic_docs_{topic_id}.csv', index=False)
    topic_docs_by_topic[topic_id] = topic_subset_df

# Keep the original per-topic variable names (topic_docs_37_df, topic_docs_64_df, ...)
# defined, in case other cells refer to them
globals().update({f"topic_docs_{topic_id}_df": topic_frame
                  for topic_id, topic_frame in topic_docs_by_topic.items()})

In [None]:
# Preview the first 30 rows of the topic summary table
freq.head(30)

In [None]:
# Inspect what the model stores for topic 0
topic_model.get_topic(0)

In [None]:
# Inspect the representative documents of topic 0
topic_model.get_representative_docs(topic=0)

In [None]:
# Search the fitted model for the 5 topics that best match the term "posture"
similar_topics, similarity = topic_model.find_topics("posture", top_n=5)

# Inspect the first topic returned by the search
topic_model.get_topic(similar_topics[0])

## Topic Evaluation

### Normalized Pointwise Mutual Information (NPMI)

In [None]:
# Pre-process documents before topic evaluation:
# concatenate all documents assigned to the same topic into one long document per topic
documents = pd.DataFrame({'Document': docs,
                          'ID': range(len(docs)),
                          'Topic': topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for topic coherence evaluation
# (the vocabulary lookup through the deprecated get_feature_names() was never used and is gone)
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Evaluate exactly the topics that were assigned to documents, outlier topic (-1) excluded.
# Counting the distinct topics and subtracting one silently drops the last topic when no
# document is an outlier, and picks wrong ids when the assigned ids are not contiguous.
evaluated_topics = sorted(set(topics) - {-1})
topic_words = [[word for word, _ in topic_model.get_topic(topic)]
               for topic in evaluated_topics]

# Define CoherenceModel parameters
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_npmi')

# Calculate topic coherence
coherence = coherence_model.get_coherence()

coherence

### Word Embedding-based Inverted Rank-Biased Overlap (WE-IRBO)

https://radimrehurek.com/gensim/models/fasttext.html

In [None]:
# Ids of all topics of the fitted model, outlier topic (-1) excluded.
# This has to be a list of ids: iterating over the topic *count* (an int) raises a TypeError.
topic_list = sorted(topic for topic in topic_model.get_topic_info()["Topic"] if topic != -1)

# Top-10 words of every topic, used as input for the diversity metric
word_list = [[word for word, _ in topic_model.get_topic(topic)][:10]
             for topic in topic_list]

In [None]:
# Load fastText embeddings from the local 'cc.en.300.bin' file
# NOTE: this can take a while and use a lot of memory
wv = load_facebook_vectors('cc.en.300.bin')

In [None]:
# Calculate topic diversity with WE-IRBO over the per-topic word lists,
# using the loaded fastText vectors and the top 10 words of each topic
word_embedding_irbo(word_list, wv, weight=0.9, topk=10)

## Defining an Alternative Frequency for topics_over_time

In [None]:
# Define alternative topics_over_time where frequency is calculated based on upvotes
def topics_over_time_upvotes(self,
                             docs: List[str],
                             topics: List[int],
                             timestamps: Union[List[str],
                                               List[int]],
                             upvotes: List[int],
                             nr_bins: int = None,
                             datetime_format: str = None,
                             evolution_tuning: bool = True,
                             global_tuning: bool = True) -> pd.DataFrame:
    """ Create topics over time, with the frequency of a topic measured in upvotes

        Variant of BERTopic's `topics_over_time` in which the "Frequency" of a topic at
        timestamp t is the sum of the upvotes of its documents at t instead of the number
        of documents.

        To create the topics over time, BERTopic needs to be already fitted once.
        From the fitted models, the c-TF-IDF representations are calculated at
        each timestamp t. Then, the c-TF-IDF representations at timestamp t are
        averaged with the global c-TF-IDF representations in order to fine-tune the
        local representations.

    NOTE:
        Make sure to use a limited number of unique timestamps (<100) as the
        c-TF-IDF representation will be calculated at each single unique timestamp.
        Having a large number of unique timestamps can take some time to be calculated.
        Moreover, there aren't many use cases where you would like to see the difference
        in topic representations over more than 100 different timestamps.

    Arguments:
        docs: The documents you used when calling either `fit` or `fit_transform`
        topics: The topics that were returned when calling either `fit` or `fit_transform`
        timestamps: The timestamp of each document. This can be either a list of strings or ints.
                    If it is a list of strings, then the datetime format will be automatically
                    inferred. If it is a list of ints, then the documents will be ordered by
                    ascending order.
        upvotes: The number of upvotes of each document, in the same order as `docs`.
                 They are summed per topic and timestamp to obtain the frequency.
        nr_bins: The number of bins you want to create for the timestamps. The left interval will
                 be chosen as the timestamp. An additional column will be created with the
                 entire interval.
        datetime_format: The datetime format of the timestamps if they are strings, eg “%d/%m/%Y”.
                         Set this to None if you want to have it automatically detect the format.
                         See strftime documentation for more information on choices:
                         https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
        evolution_tuning: Fine-tune each topic representation at timestamp t by averaging its
                          c-TF-IDF matrix with the c-TF-IDF matrix at timestamp t-1. This creates
                          evolutionary topic representations.
        global_tuning: Fine-tune each topic representation at timestamp t by averaging its c-TF-IDF matrix
                   with the global c-TF-IDF matrix. Turn this off if you want to prevent words in
                   topic representations that could not be found in the documents at timestamp t.

    Returns:
        topics_over_time: A dataframe with the columns "Topic", "Words", "Frequency" and
                          "Timestamp", where "Frequency" is the summed upvotes of the topic
                          at timestamp t.

    Usage:

    The timestamps variable represent the timestamp of each document. If you have over
    100 unique timestamps, it is advised to bin the timestamps as shown below:

    ```python
    from types import MethodType
    from bertopic import BERTopic
    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(docs)
    topic_model.topics_over_time_upvotes = MethodType(topics_over_time_upvotes, topic_model)
    topics_over_time = topic_model.topics_over_time_upvotes(docs, topics, timestamps, upvotes, nr_bins=20)
    ```
    """
    check_is_fitted(self)
    check_documents_type(docs)
    documents = pd.DataFrame({"Document": docs, "Topic": topics, "Timestamps": timestamps, "Upvotes": upvotes})
    # copy=False asks scikit-learn to normalise without copying the model's c-TF-IDF matrix
    global_c_tf_idf = normalize(self.c_tf_idf, axis=1, norm='l1', copy=False)

    all_topics = sorted(documents.Topic.unique())
    all_topics_indices = {topic: index for index, topic in enumerate(all_topics)}

    if isinstance(timestamps[0], str):
        # Only let pandas infer the format when the caller did not provide one
        infer_datetime_format = not datetime_format
        documents["Timestamps"] = pd.to_datetime(documents["Timestamps"],
                                                 infer_datetime_format=infer_datetime_format,
                                                 format=datetime_format)

    if nr_bins:
        # Replace every timestamp by the left edge of the bin it falls into
        documents["Bins"] = pd.cut(documents.Timestamps, bins=nr_bins)
        documents["Timestamps"] = documents.apply(lambda row: row.Bins.left, 1)

    # Sort documents in chronological order
    documents = documents.sort_values("Timestamps")
    timestamps = documents.Timestamps.unique()
    if len(timestamps) > 100:
        warnings.warn(f"There are more than 100 unique timestamps (i.e., {len(timestamps)}) "
                      "which significantly slows down the application. Consider setting `nr_bins` "
                      "to a value lower than 100 to speed up calculation. ")

    # For each unique timestamp, create topic representations
    topics_over_time = []
    for index, timestamp in tqdm(enumerate(timestamps), disable=not self.verbose):

        # Calculate c-TF-IDF representation for a specific timestamp;
        # upvotes are summed per topic and later reported as the frequency
        selection = documents.loc[documents.Timestamps == timestamp, :]
        documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join,
                                                                                "Timestamps": "count",
                                                                                "Upvotes": np.sum})
        c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)

        if global_tuning or evolution_tuning:
            c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False)

        # Fine-tune the c-TF-IDF matrix at timestamp t by averaging it with the c-TF-IDF
        # matrix at timestamp t-1
        if evolution_tuning and index != 0:
            current_topics = sorted(documents_per_topic.Topic.values)
            overlapping_topics = sorted(set(previous_topics).intersection(current_topics))

            current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics]
            previous_overlap_idx = [previous_topics.index(topic) for topic in overlapping_topics]

            if overlapping_topics:
                averaged_rows = (c_tf_idf[current_overlap_idx] +
                                 previous_c_tf_idf[previous_overlap_idx]) / 2.0

                # Write the averaged rows back into the matrix that is used below.
                # Assigning into `c_tf_idf.tolil()[...]` directly only changes a temporary
                # LIL copy, so the averaged rows would be thrown away.
                c_tf_idf = c_tf_idf.tolil()
                c_tf_idf[current_overlap_idx] = averaged_rows.tolil()
                c_tf_idf = c_tf_idf.tocsr()

        # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation
        # by simply taking the average of the two
        if global_tuning:
            selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]
            c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0

        # Extract the words per topic
        labels = sorted(documents_per_topic.Topic.unique())
        words_per_topic = self._extract_words_per_topic(words, c_tf_idf, labels)
        # Frequency of a topic at this timestamp = summed upvotes of its documents
        topic_frequency = pd.Series(documents_per_topic.Upvotes.values,
                                    index=documents_per_topic.Topic).to_dict()

        # Fill dataframe with results (the first 6 words of each topic are kept)
        topics_at_timestamp = [(topic,
                                ", ".join([words[0] for words in values][:6]),
                                topic_frequency[topic],
                                timestamp) for topic, values in words_per_topic.items()]
        topics_over_time.extend(topics_at_timestamp)

        if evolution_tuning:
            # Remember this timestamp's representation for the averaging at the next one
            previous_topics = sorted(documents_per_topic.Topic.values)
            previous_c_tf_idf = c_tf_idf.copy()

    return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"])

# Bind the function to the fitted model instance so that it can be called like a regular
# method (topic_model.topics_over_time_upvotes(...)) and receives the model as `self`
topic_model.topics_over_time_upvotes = MethodType(topics_over_time_upvotes, topic_model)

## Visualization

### Topics from 2020-2021

In [None]:
# Extract embeddings for each document
embeddings = topic_model._extract_embeddings(docs, method='document')

# Dimensionality reduction to 2D
# (seeded like the main UMAP model so that the plot is reproducible between runs)
umap_model_2d = UMAP(n_neighbors=28, n_components=2, min_dist=0.0, metric='cosine',
                     random_state=42).fit(embeddings)

# Combine the data
plot_df = pd.DataFrame(umap_model_2d.embedding_, columns=['x', 'y'])
plot_df['topic'] = topics

# Select the data: outliers (-1) and topics 0 to top_n-1
top_n = 20
plot_df = plot_df.loc[plot_df.topic < top_n]

is_outlier = plot_df['topic'] == -1
outliers_df = plot_df.loc[is_outlier]
inliers_df = plot_df.loc[~is_outlier]

# Plot 2D embeddings
fig, ax = plt.subplots(figsize=(21,15))

# Plot outliers (-1)
ax.scatter(
    outliers_df['x'],
    outliers_df['y'],
    c='whitesmoke',
    alpha=0.5
)

# Plot inliers (0 to top_n-1)
scatter = ax.scatter(
    inliers_df['x'],
    inliers_df['y'],
    c=inliers_df['topic'],
    alpha=0.5,
    cmap='tab20b'
)

# Generate colored legend
# With num=None there is one legend handle per distinct plotted topic id, in ascending
# order. The labels are therefore looked up by topic id for exactly those ids, instead of
# being sliced by position from the topic table, which misaligns handles and names as soon
# as a topic is missing from the plot or the table is not ordered by id.
topic_info = topic_model.get_topic_info()
name_by_topic = dict(zip(topic_info['Topic'], topic_info['Name']))
plotted_topics = sorted(inliers_df['topic'].unique())
topic_labels = [name_by_topic.get(topic, str(topic)) for topic in plotted_topics]
handles, _ = scatter.legend_elements(num=None, alpha=1)
legend1 = ax.legend(handles, topic_labels, ncol=1, loc='upper left', title="Tópicos")

plt.show()

In [None]:
# Generate intertopic distance map for all topics
# vis_distmap_all = topic_model.visualize_topics(topics=topics)

In [None]:
# vis_distmap_all.write_html('./vis_distmap_all.html')

In [None]:
# Generate dendrogram for all topics and keep the resulting figure for export
vis_hierarchy_all = topic_model.visualize_hierarchy()

In [None]:
# Save the dendrogram as an HTML file
vis_hierarchy_all.write_html('./vis_hierarchy_all.html')

In [None]:
# Generate barchart for observed pandemic-related topics
# (topic ids are listed explicitly; 9 words are requested per topic)
vis_barchart = topic_model.visualize_barchart(topics=[29, 37, 64, 90, 100, 106, 123, 127, 139, 141, 156, 159, 160], n_words=9)

In [None]:
# Save the barchart as an HTML file
vis_barchart.write_html('./vis_barchart.html')

### Topics from 2020

In [None]:
# Generate intertopic distance map for 2020 topics
# vis_distmap_y1 = topic_model.visualize_topics(topics=y1_topics)

In [None]:
# vis_distmap_y1.write_html('./vis_distmap_y1.html')

In [None]:
# Generate dendrogram for 2020 topics
# vis_hierarchy_y1 = topic_model.visualize_hierarchy(topics=y1_topics)

In [None]:
# vis_hierarchy_y1.write_html('./vis_hierarchy_y1.html')

In [None]:
# Upvote-based topics over time for the 2020 documents, with the timestamps grouped
# into 12 bins; only evolution tuning is enabled (global tuning is switched off)
y1_topics_over_time_upvotes = topic_model.topics_over_time_upvotes(docs=y1_docs,
                                                                   topics=y1_topics,
                                                                   timestamps=y1_timestamps,
                                                                   upvotes=y1_upvotes,
                                                                   global_tuning=False,
                                                                   evolution_tuning=True,
                                                                   nr_bins=12)

# Build the topics-over-time figure from that table (top_n_topics=None: no top-n limit is set)
topics_over_time_upvotes_y1 = topic_model.visualize_topics_over_time(y1_topics_over_time_upvotes, top_n_topics=None)

In [None]:
# Save the 2020 topics-over-time figure as an HTML file
topics_over_time_upvotes_y1.write_html('./vis_topics_over_time_upvotes_y1.html')

In [None]:
# Inspect the representative documents of topic 5
topic_model.get_representative_docs(topic=5)

### Topics from 2021

In [None]:
# Generate intertopic distance map for 2021 topics
# vis_distmap_y2 = topic_model.visualize_topics(topics=y2_topics)

In [None]:
# vis_distmap_y2.write_html('./vis_distmap_y2.html')

In [None]:
# Generate dendrogram for 2021 topics
# vis_hierarchy_y2 = topic_model.visualize_hierarchy(topics=y2_topics)

In [None]:
# vis_hierarchy_y2.write_html('./vis_hierarchy_y2.html')

In [None]:
# Upvote-based topics over time for the documents from 2021 onwards, with the timestamps
# grouped into 12 bins; only evolution tuning is enabled (global tuning is switched off)
y2_topics_over_time_upvotes = topic_model.topics_over_time_upvotes(docs=y2_docs,
                                                                   topics=y2_topics,
                                                                   timestamps=y2_timestamps,
                                                                   upvotes=y2_upvotes,
                                                                   global_tuning=False,
                                                                   evolution_tuning=True,
                                                                   nr_bins=12)

# Build the topics-over-time figure from that table (top_n_topics=None: no top-n limit is set)
topics_over_time_upvotes_y2 = topic_model.visualize_topics_over_time(y2_topics_over_time_upvotes, top_n_topics=None)

In [None]:
# Save the 2021 topics-over-time figure as an HTML file
topics_over_time_upvotes_y2.write_html('./vis_topics_over_time_upvotes_y2.html')

In [None]:
# Inspect the representative documents of topic 65
topic_model.get_representative_docs(topic=65)