# Benchmark

In [None]:
# Versions used:
#
# python       3.9.7
# fasttext     0.9.2
# matplotlib   3.5.0
# numpy        1.20.3
# pandas       1.3.5
# bertopic     0.9.4
# gensim       4.1.2
# hdbscan      0.8.27
# nltk         3.6.5
# scikit-learn 1.0.1
# tqdm         4.62.3
# umap-learn   0.5.2

import datetime as dt
import re
import requests as req
from types import MethodType
from typing import List, Union

import fasttext.util
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bertopic import BERTopic
from bertopic._utils import check_documents_type, check_is_fitted
from gensim.corpora import Dictionary
from gensim.models.fasttext import load_facebook_vectors
from gensim.models.coherencemodel import CoherenceModel
from hdbscan import HDBSCAN
from nltk import download as nltk_download
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from tqdm import tqdm
from umap import UMAP

from topic_model_diversity.diversity_metrics import irbo, word_embedding_irbo

## Dependencies

In [None]:
# Get stopwords list from Stanford CoreNLP
url = 'https://raw.githubusercontent.com/stanfordnlp/CoreNLP/main/data/edu/stanford/nlp/patterns/surface/stopwords.txt'
# The timeout keeps the cell from hanging forever on a stalled connection
res = req.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as stopwords
res.raise_for_status()

# Save stopwords list to a local TXT file
# (the context manager closes the file even if the write fails)
with open('./stopwords.txt', 'w', encoding='utf-8') as file:
    file.write(res.text)

# Download NLTK modules
# PS: only needed for manual testing of the tokenizer (uncomment to run)
# nltk_download('averaged_perceptron_tagger')
# nltk_download('punkt')
# nltk_download('wordnet')

# Download fastText model
# PS: note that this will require a lot of disk space (~12GB)
fasttext.util.download_model('en', if_exists='ignore')

## Loading the Data

In [None]:
# Load the CSV file that holds the timestamp, document and upvote columns
posts_df = pd.read_csv('./rcollege_clean_20200101-20220101_praw.csv')

# Pull the three columns of interest out of posts_df as plain Python lists
timestamps, docs, upvotes = (
    posts_df[column].to_list()
    for column in ('created_utc', 'document', 'score')
)

## Defining stop_words

In [None]:
# Define a stop_words list and populate it with initial values
stop_words = ["aren", "couldn", "didn", "doesn",
              "don", "hadn", "hasn", "haven",
              "isn", "mustn", "shan", "shouldn",
              "wasn", "weren", "won", "wouldn"]

# Append the words from the TXT file saved in the "Dependencies" step.
# The context manager closes the file even if reading fails, and blank
# lines are skipped so that no empty string ends up in the stop-word list.
with open('./stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words.extend(word
                      for word in (line.strip() for line in file)
                      if word)

## Iterative Testing of Parameters

In [None]:
# Load the fastText vectors from the local 'cc.en.300.bin' file
# PS: note that this can take a while and use a lot of memory
wv = load_facebook_vectors('cc.en.300.bin')

# CountVectorizer shared by the benchmark runs: unigrams only, the custom
# stop-word list, and scikit-learn's built-in tokenization (tokenizer=None)
vectorizer_model = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=stop_words,
    tokenizer=None,
)

In [None]:
# Benchmark of UMAP's n_neighbors (15 to 35): for every value a BERTopic model
# is fitted and its coherence (NPMI), diversity (IRBO, WE-IRBO) and number of
# outlier documents are recorded, one list entry per iteration.
n_neighbors_npmi = []
n_neighbors_irbo = []
n_neighbors_weirbo = []
n_neighbors_outliers = []


for i in range(21):
    print(f"Starting iteration i = {i}")

    # Define UMAP parameters
    umap_model = UMAP(n_neighbors=15+i,  # Default: 15
                      n_components=5,  # Default: 5
                      metric='cosine',
                      min_dist=0.0,
                      random_state=42)

    # Define HDBSCAN parameters
    hdbscan_model = HDBSCAN(min_cluster_size=10,  # Equal to min_topic_size (Default: 10)
                            metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)

    # Define BERTopic parameters
    topic_model = BERTopic(language='english',
                           top_n_words=10,
                           min_topic_size=10,  # Default: 10
                           nr_topics=None,
                           verbose=True,
                           embedding_model='all-MiniLM-L6-v2',  # Default: all-MiniLM-L6-v2
                           umap_model=umap_model,
                           hdbscan_model=hdbscan_model,
                           vectorizer_model=vectorizer_model)

    # Fit/transform the model
    topics, _ = topic_model.fit_transform(docs)

    # Get DataFrame with topics and number of documents
    freq = topic_model.get_topic_info()

    # Get number of outliers (documents assigned to topic -1).
    # The row is selected by topic id rather than by position, so the value
    # is 0 (and not the size of a real topic) when the run has no outliers
    # or when the outlier topic is not the first row.
    outliers_ = int(freq.loc[freq['Topic'] == -1, 'Count'].sum())

    # Get the ids of the real topics (everything except the outlier topic).
    # Taking them from the DataFrame avoids assuming that an outlier row
    # always exists, which would silently drop the last topic otherwise.
    topic_ids = sorted(freq.loc[freq['Topic'] != -1, 'Topic'].tolist())

    # Top-10 words of every topic, used by the diversity metrics
    word_list = [[word for word, _ in topic_model.get_topic(topic)][:10]
                 for topic in topic_ids]

    # Calculate topic diversity with IRBO
    irbo_ = irbo(word_list, weight=0.9, topk=10)

    # Calculate topic diversity with WE-IRBO
    weirbo_ = word_embedding_irbo(word_list, wv, weight=0.9, topk=10)

    # Pre-process documents before topic coherence evaluation:
    # all documents of a topic are joined into one long document
    documents = pd.DataFrame({'Document': docs,
                              'ID': range(len(docs)),
                              'Topic': topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for topic coherence evaluation
    # (the unused, deprecated vectorizer.get_feature_names() call was dropped)
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[word for word, _ in topic_model.get_topic(topic)]
                   for topic in topic_ids]

    # Calculate topic coherence
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_npmi')

    npmi_ = coherence_model.get_coherence()

    n_neighbors_npmi.append(npmi_)
    n_neighbors_irbo.append(irbo_)
    n_neighbors_weirbo.append(weirbo_)
    n_neighbors_outliers.append(outliers_)

    print(f"Finished iteration i = {i}", "\n")
    print("##########################", "\n")


# One row per iteration; the row index i corresponds to n_neighbors = 15 + i
nn_df = pd.DataFrame(zip(n_neighbors_npmi,
                         n_neighbors_irbo,
                         n_neighbors_weirbo,
                         n_neighbors_outliers),
                     columns=["npmi", "irbo", "we-irbo", "outliers"])

nn_df.to_csv('benchmark_n-neighbors.csv', header=True, index=True)

In [None]:
# Display the DataFrame with the n_neighbors benchmark results
nn_df

In [None]:
# Benchmark of UMAP's n_components (5 to 25): for every value a BERTopic model
# is fitted and its coherence (NPMI), diversity (IRBO, WE-IRBO) and number of
# outlier documents are recorded, one list entry per iteration.
n_components_npmi = []
n_components_irbo = []
n_components_weirbo = []
n_components_outliers = []


for i in range(21):
    print(f"Starting iteration i = {i}")

    # Define UMAP parameters
    umap_model = UMAP(n_neighbors=28,  # Default: 15
                      n_components=5+i,  # Default: 5
                      metric='cosine',
                      min_dist=0.0,
                      random_state=42)

    # Define HDBSCAN parameters
    hdbscan_model = HDBSCAN(min_cluster_size=10,  # Equal to min_topic_size (Default: 10)
                            metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)

    # Define BERTopic parameters
    topic_model = BERTopic(language='english',
                           top_n_words=10,
                           min_topic_size=10,  # Default: 10
                           nr_topics=None,
                           verbose=True,
                           embedding_model='all-MiniLM-L6-v2',  # Default: all-MiniLM-L6-v2
                           umap_model=umap_model,
                           hdbscan_model=hdbscan_model,
                           vectorizer_model=vectorizer_model)

    # Fit/transform the model
    topics, _ = topic_model.fit_transform(docs)

    # Get DataFrame with topics and number of documents
    freq = topic_model.get_topic_info()

    # Get number of outliers (documents assigned to topic -1).
    # The row is selected by topic id rather than by position, so the value
    # is 0 (and not the size of a real topic) when the run has no outliers
    # or when the outlier topic is not the first row.
    outliers_ = int(freq.loc[freq['Topic'] == -1, 'Count'].sum())

    # Get the ids of the real topics (everything except the outlier topic).
    # Taking them from the DataFrame avoids assuming that an outlier row
    # always exists, which would silently drop the last topic otherwise.
    topic_ids = sorted(freq.loc[freq['Topic'] != -1, 'Topic'].tolist())

    # Top-10 words of every topic, used by the diversity metrics
    word_list = [[word for word, _ in topic_model.get_topic(topic)][:10]
                 for topic in topic_ids]

    # Calculate topic diversity with IRBO
    irbo_ = irbo(word_list, weight=0.9, topk=10)

    # Calculate topic diversity with WE-IRBO
    weirbo_ = word_embedding_irbo(word_list, wv, weight=0.9, topk=10)

    # Pre-process documents before topic coherence evaluation:
    # all documents of a topic are joined into one long document
    documents = pd.DataFrame({'Document': docs,
                              'ID': range(len(docs)),
                              'Topic': topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for topic coherence evaluation
    # (the unused, deprecated vectorizer.get_feature_names() call was dropped)
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[word for word, _ in topic_model.get_topic(topic)]
                   for topic in topic_ids]

    # Calculate topic coherence
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_npmi')

    npmi_ = coherence_model.get_coherence()

    n_components_npmi.append(npmi_)
    n_components_irbo.append(irbo_)
    n_components_weirbo.append(weirbo_)
    n_components_outliers.append(outliers_)

    print(f"Finished iteration i = {i}", "\n")
    print("##########################", "\n")


# One row per iteration; the row index i corresponds to n_components = 5 + i
nc_df = pd.DataFrame(zip(n_components_npmi,
                         n_components_irbo,
                         n_components_weirbo,
                         n_components_outliers),
                     columns=["npmi", "irbo", "we-irbo", "outliers"])

nc_df.to_csv('benchmark_n-components.csv', header=True, index=True)

In [None]:
# Display the DataFrame with the n_components benchmark results
nc_df

In [None]:
# Benchmark of the minimum cluster/topic size (10 to 30): for every value a
# BERTopic model is fitted and its coherence (NPMI), diversity (IRBO, WE-IRBO)
# and number of outlier documents are recorded, one list entry per iteration.
min_cluster_npmi = []
min_cluster_irbo = []
min_cluster_weirbo = []
min_cluster_outliers = []


for i in range(21):
    print(f"Starting iteration i = {i}")

    # Define UMAP parameters
    umap_model = UMAP(n_neighbors=28,  # Default: 15
                      n_components=12,  # Default: 5
                      metric='cosine',
                      min_dist=0.0,
                      random_state=42)

    # Define HDBSCAN parameters
    hdbscan_model = HDBSCAN(min_cluster_size=10+i,  # Equal to min_topic_size (Default: 10)
                            metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)

    # Define BERTopic parameters
    topic_model = BERTopic(language='english',
                           top_n_words=10,
                           min_topic_size=10+i,  # Default: 10
                           nr_topics=None,
                           verbose=True,
                           embedding_model='all-MiniLM-L6-v2',  # Default: all-MiniLM-L6-v2
                           umap_model=umap_model,
                           hdbscan_model=hdbscan_model,
                           vectorizer_model=vectorizer_model)

    # Fit/transform the model
    topics, _ = topic_model.fit_transform(docs)

    # Get DataFrame with topics and number of documents
    freq = topic_model.get_topic_info()

    # Get number of outliers (documents assigned to topic -1).
    # The row is selected by topic id rather than by position, so the value
    # is 0 (and not the size of a real topic) when the run has no outliers
    # or when the outlier topic is not the first row.
    outliers_ = int(freq.loc[freq['Topic'] == -1, 'Count'].sum())

    # Get the ids of the real topics (everything except the outlier topic).
    # Taking them from the DataFrame avoids assuming that an outlier row
    # always exists, which would silently drop the last topic otherwise.
    topic_ids = sorted(freq.loc[freq['Topic'] != -1, 'Topic'].tolist())

    # Top-10 words of every topic, used by the diversity metrics
    word_list = [[word for word, _ in topic_model.get_topic(topic)][:10]
                 for topic in topic_ids]

    # Calculate topic diversity with IRBO
    irbo_ = irbo(word_list, weight=0.9, topk=10)

    # Calculate topic diversity with WE-IRBO
    weirbo_ = word_embedding_irbo(word_list, wv, weight=0.9, topk=10)

    # Pre-process documents before topic coherence evaluation:
    # all documents of a topic are joined into one long document
    documents = pd.DataFrame({'Document': docs,
                              'ID': range(len(docs)),
                              'Topic': topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for topic coherence evaluation
    # (the unused, deprecated vectorizer.get_feature_names() call was dropped)
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[word for word, _ in topic_model.get_topic(topic)]
                   for topic in topic_ids]

    # Calculate topic coherence
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_npmi')

    npmi_ = coherence_model.get_coherence()

    min_cluster_npmi.append(npmi_)
    min_cluster_irbo.append(irbo_)
    min_cluster_weirbo.append(weirbo_)
    min_cluster_outliers.append(outliers_)

    print(f"Finished iteration i = {i}", "\n")
    print("##########################", "\n")


# One row per iteration; the row index i corresponds to a minimum size of 10 + i
mc_df = pd.DataFrame(zip(min_cluster_npmi,
                         min_cluster_irbo,
                         min_cluster_weirbo,
                         min_cluster_outliers),
                     columns=["npmi", "irbo", "we-irbo", "outliers"])

mc_df.to_csv('benchmark_min-cluster.csv', header=True, index=True)

In [None]:
# Display the DataFrame with the minimum cluster size benchmark results
mc_df

## Manual Testing of Parameters

### Defining UMAP, HDBSCAN and CountVectorizer

https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html

https://umap-learn.readthedocs.io/en/latest/parameters.html  
https://umap-learn.readthedocs.io/en/latest/reproducibility.html

https://hdbscan.readthedocs.io/en/latest/parameter_selection.html

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [None]:
# UMAP configuration for the manual run
umap_model = UMAP(
    n_neighbors=28,   # Default: 15
    n_components=12,  # Default: 5
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# HDBSCAN configuration for the manual run
hdbscan_model = HDBSCAN(
    min_cluster_size=17,  # Equal to min_topic_size (Default: 10)
    cluster_selection_method='eom',
    metric='euclidean',
    prediction_data=True,
)

# Tokenizer with a lemmatization step (for testing)
# It uses the tag list from Penn Treebank Project:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# Maps the first two characters of a Penn Treebank tag to a WordNet POS tag.
# Built once at import time instead of on every call.
_PENN_TO_MORPHY = {'NN': 'n',  # Noun
                   'JJ': 'a',  # Adjective
                   'VB': 'v',  # Verb
                   'RB': 'r'}  # Adverb


def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag into a WordNet ("morphy") POS tag.

    Only the first two characters of ``penntag`` are considered, so e.g.
    'NNS' and 'NNP' are both treated as nouns.

    Args:
        penntag: Penn Treebank tag, e.g. 'NN', 'VBD' or 'JJR'.

    Returns:
        'n', 'a', 'v' or 'r'. Any tag without a mapping, as well as any
        value that cannot be used as a tag (e.g. None), falls back to 'n'.
    """
    try:
        return _PENN_TO_MORPHY[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: unmapped tag; TypeError: value is not sliceable/hashable.
        # Narrower than a bare except, so KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        return 'n'

class LemmaTokenizer:
    """Callable tokenizer that POS-tags a document and lemmatizes its tokens.

    Instances are meant to be passed as the ``tokenizer`` argument of
    CountVectorizer: calling one with a document string returns a list of
    lemmas.
    """

    # A token is kept when it begins with a run of at least two word
    # characters (re.match only anchors at the start of the token).
    # Compiled once for the class instead of on every call.
    _token_pattern = re.compile(r'(?u)\b\w{2,}\b')

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmas of the tokens in ``doc`` that pass the filter.

        Each token is lemmatized with the WordNet POS derived from its
        Penn Treebank tag via ``penn2morphy``.
        """
        tagged_tokens = pos_tag(word_tokenize(doc))
        return [self.wnl.lemmatize(token, pos=penn2morphy(tag))
                for token, tag in tagged_tokens
                if self._token_pattern.match(token)]

# CountVectorizer for the manual run: unigrams only, the custom stop-word
# list, and the lemmatizing tokenizer in place of the default (None)
vectorizer_model = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=stop_words,
    tokenizer=LemmaTokenizer(),  # Default: None
)

### Training the model

https://maartengr.github.io/BERTopic/api/bertopic.html

In [None]:
# Assemble the BERTopic model from the UMAP, HDBSCAN and CountVectorizer
# objects configured above
topic_model = BERTopic(
    language='english',
    embedding_model='all-MiniLM-L6-v2',  # Default: all-MiniLM-L6-v2
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    min_topic_size=17,  # Default: 10
    nr_topics=None,
    verbose=True,
)

# Fit the model on the documents; keep the topic assigned to each document
# and discard the second return value
topics, _ = topic_model.fit_transform(docs)