# Helper Functions

## Imports and Setup

In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Customize BERTopic

In [64]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

from bertopic import BERTopic

def run_custom_BERTopic(docs=None,
                        embedding_model=None,
                        vectorizer_model=None,
                        min_cluster_size=10, min_samples=10,
                        n_neighbors=3, n_components=2, min_dist=0.1, random_state=42,
                       ):
    """Build an unfitted BERTopic model with custom UMAP, HDBSCAN and vectorizer settings.

    The defaults are resolved at call time rather than at definition time, so
    defining this function no longer loads a sentence-transformer model, and
    every call that relies on the default vectorizer gets its own fresh
    ``CountVectorizer`` instead of sharing one instance between models.

    Parameters
    ----------
    docs : list of str, optional
        Currently unused; kept so existing calls keep working.
    embedding_model : optional
        Sentence-embedding model passed to BERTopic. When ``None``, a cached
        ``SentenceTransformer("all-MiniLM-L6-v2")`` is used.
    vectorizer_model : CountVectorizer, optional
        Vectorizer passed to BERTopic. When ``None``, a new
        ``CountVectorizer`` with unigrams + bigrams and NLTK English stop
        words is created.
    min_cluster_size : int
        HDBSCAN ``min_cluster_size``: smallest group of points reported as a cluster.
    min_samples : int
        HDBSCAN ``min_samples``: larger values make the clustering more
        conservative (more points end up labelled as outliers).
    n_neighbors, n_components, min_dist, random_state
        Forwarded unchanged to ``UMAP``. The defaults reproduce the values
        this notebook previously hard-coded.

    Returns
    -------
    BERTopic
        A configured model; nothing is fitted here.
    """
    if embedding_model is None:
        embedding_model = _default_embedding_model()
    if vectorizer_model is None:
        vectorizer_model = CountVectorizer(ngram_range=(1, 2),
                                           stop_words=list(stopwords.words('english')))

    # n_neighbors: size of the local neighbourhood UMAP looks at (UMAP's own default is 15).
    # n_components: dimensionality of the reduced space the embeddings are projected into.
    # min_dist: minimum distance allowed between points in the reduced space.
    # Fixing random_state makes the UMAP projection reproducible between runs
    # (https://umap-learn.readthedocs.io/en/latest/reproducibility.html).
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components,
                      min_dist=min_dist, random_state=random_state)

    # Summary: min_cluster_size filters out clusters that contain too few points,
    # while min_samples controls how dense a region must be before it counts as a cluster.
    # allow_single_cluster=True lets HDBSCAN report one dominant cluster if that is what the data shows.
    # gen_min_span_tree and prediction_data are kept from the original setup, which relies on them
    # for the BERTopic integration and for visualizing clusters later on.
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples,
                            allow_single_cluster=True,
                            gen_min_span_tree=True,
                            prediction_data=True)

    model = BERTopic(
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        top_n_words=5,
        language='english',
        calculate_probabilities=True,
        # verbose=True , uncomment to see progress as the model runs
    )

    return model


# Cache of default sentence-embedding models, keyed by model name, so the
# default encoder is loaded at most once per kernel session.
_DEFAULT_EMBEDDING_MODELS = {}


def _default_embedding_model(name="all-MiniLM-L6-v2"):
    """Return the cached default SentenceTransformer, loading it on first use."""
    if name not in _DEFAULT_EMBEDDING_MODELS:
        _DEFAULT_EMBEDDING_MODELS[name] = SentenceTransformer(name)
    return _DEFAULT_EMBEDDING_MODELS[name]

## Evaluation Metrics

Gathers the following 5 evaluation metrics:

(1) **Coherence (Normalized Pointwise Mutual Information - NPMI)**: (how semantically similar the top words of a topic are)

The coherence score of a topic is the average NPMI score over all pairs of its top words. The pointwise mutual information (PMI) of a pair of words is the log of the ratio between the joint probability that the two words appear together in the documents and the product of the individual probabilities that each word appears in the documents. NPMI normalizes the PMI by dividing it by the negative log of the joint probability.

$$PMI(w_{i}, w_{j}) = \log\frac{P(w_{i}, w_{j})}{P(w_{i})P(w_{j})}$$
$$NPMI(w_{i}, w_{j}) = \frac{PMI(w_{i}, w_{j})}{-\log P(w_{i}, w_{j})}$$

Interpretation:

The normalization adjusts this measurement to give a score between -1 and 1. Higher scores mean the words are more likely to appear together, indicating a more coherent topic.

In [82]:
# Coherence - NPMI
# A coherence score < 0 is very low: a topic's top words co-occur less often than chance would predict. This may indicate that the words are semantically similar across topics (which would make sense, since all documents come from the same survey?)

import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

from bertopic import BERTopic
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def coherence_score(docs, model, vectorizer_model):
    """Compute the NPMI topic coherence (gensim ``c_npmi``) of a fitted BERTopic model.

    Parameters
    ----------
    docs : list of str
        Documents to estimate word co-occurrence from.
    model : BERTopic
        Fitted topic model; its ``get_topics()`` and ``_preprocess_text()`` are used.
    vectorizer_model : CountVectorizer
        Vectorizer whose analyzer tokenizes ``docs``. It should be the one the
        model was built with, so tokens and topic words share a vocabulary.

    Returns
    -------
    float
        Coherence as returned by gensim's ``CoherenceModel.get_coherence()``,
        or ``nan`` when the model has no non-outlier topic with scorable words.
    """
    # Collect the top words of every real topic. Topics are selected by id
    # (everything except the outlier topic -1) rather than by
    # range(n_topics - 1), which silently dropped the last topic whenever the
    # model had no outlier topic. Empty strings (padding for topics with fewer
    # words than top_n_words) are removed.
    candidate_words = []
    for topic_id, word_scores in sorted(model.get_topics().items()):
        if topic_id == -1:
            continue
        words = [word for word, _ in word_scores if word != ""]
        if words:
            candidate_words.append(words)

    if not candidate_words:
        return float("nan")

    # Tokenize the documents the same way the vectorizer does.
    cleaned_docs = model._preprocess_text(docs)
    analyzer = vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Keep only words that occur in the per-document dictionary; a topic word
    # that never appears in any single tokenized document cannot be scored.
    known = dictionary.token2id
    topic_words = [
        kept
        for kept in ([word for word in words if word in known] for words in candidate_words)
        if kept
    ]
    if not topic_words:
        return float("nan")

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_npmi')

    return coherence_model.get_coherence()

(2) **Silhouette Score**:

The silhouette score measures how similar a document is to its own topic compared to other topics. Given the sentence embeddings for each document in a BERT model, you can calculate:

$$a(i) \text{: the avg. distance from one document to all other documents in its assigned topic}$$
$$b(i) \text{: the avg. distance from one document to all documents in the nearest other topic}$$

$$score(i) = \frac{b(i) - a(i)}{\max(a(i), b(i))}$$

Interpretation: 

Scores range from -1 to 1. A score close to 1 means the object is well-matched to its own cluster and poorly matched to neighboring clusters. If most documents have high silhouette scores (close to 1), it means they are well-grouped into distinct topics. If scores are close to -1, the document is misclassified.

In [72]:
# Silhouette Score
# Measures how similar an object is to its own cluster compared to other clusters.
# Ranges from -1 to 1, where higher values indicate better-defined clusters.
# A score of 1 indicates that the object is well-matched to its own cluster and poorly matched to neighboring clusters.
from sklearn.metrics import silhouette_score


def silhouette_metric(docs, embedding_model, model, topics):
    """Silhouette score of the non-outlier documents in UMAP-reduced embedding space.

    Parameters
    ----------
    docs : list of str
        Documents, in the same order as ``topics``.
    embedding_model
        Object with an ``encode(docs, show_progress_bar=False)`` method that
        returns one embedding per document.
    model : BERTopic
        Fitted topic model; its ``umap_model.transform`` reduces the embeddings.
    topics : sequence of int
        Topic assigned to each document; ``-1`` marks outliers.

    Returns
    -------
    float
        The silhouette score, or ``nan`` when it is undefined: fewer than two
        distinct non-outlier topics, or as many topics as non-outlier documents.
    """
    topic_ids = np.asarray(topics)

    # Outliers (-1) are not a real cluster, so they are excluded. A boolean
    # mask also works when nothing is kept, unlike indexing with an empty
    # float array.
    keep = topic_ids != -1
    labels = topic_ids[keep]

    # silhouette_score needs 2 <= n_clusters <= n_samples - 1. Check before the
    # expensive encode/transform so degenerate clusterings return quickly.
    n_clusters = np.unique(labels).size
    if n_clusters < 2 or n_clusters >= labels.size:
        return float("nan")

    embeddings = embedding_model.encode(docs, show_progress_bar=False)
    umap_embeddings = np.asarray(model.umap_model.transform(embeddings))
    X = umap_embeddings[keep]

    # Calculate silhouette score
    return silhouette_score(X, labels)

(3) **Adjusted Rand Index (ARI)**: (similarity of clusterings between true and predicted topics)


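Given the set of true_labels (e.g. determined by factor analysis) and predicted_labels (determined by BERTopic), we can calculate how similar the topic/factor assignments are. First, we derive a confusion matrix over all pairs of documents: a pair is a true positive (TP) if both labelings put the two documents in the same group, a true negative (TN) if both put them in different groups, and a false positive (FP) or false negative (FN) if the labelings disagree. Then we run the following: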

$$RI = \frac{TP + TN}{TP + TN + FP + FN}$$ 
Adjusted Rand Index further corrects for chance.

Interpretation: 

A score of 1 indicates perfect agreement between the two labelings, while a score of 0 indicates random labeling, and a score less than 0 indicates disagreement.

(4) **Purity**: (quality: how well each topic contains documents from a single true topic)

Purity measures the extent to which the documents are exclusively assigned to a single topic.
Given the set of true_labels (e.g. determined by factor analysis) and predicted_labels (determined by BERTopic), we can calculate for each predicted topic the proportion of the topic's documents belonging to the most frequent true label (topic) in that cluster.

For each topic with n documents:
$$ purity = \frac{\text{number of documents that belong to most frequent true label topic}}{n}  $$

Interpretation:

A purity score of 1 indicates that all documents in the cluster belong to the same true class, making the cluster perfectly homogeneous.
Lower purity scores indicate that the cluster contains a mix of different true classes, reducing its homogeneity.

(5) **Normalized Mutual Information (NMI)**: (interdependence between true and predicted labels)

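NMI measures how much information the true labels and the predicted topics share. The mutual information $I(Y; C)$ between the true labels $Y$ and the predicted topics $C$ is normalized by their entropies $H(Y)$ and $H(C)$ (scikit-learn uses their arithmetic mean by default):

$$NMI(Y, C) = \frac{2 \, I(Y; C)}{H(Y) + H(C)}$$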

Interpretation:

A normalized mutual information of 1 indicates perfect agreement between the clusterings, while a score of 0 indicates that the two labelings share no information. Note that, unlike ARI, NMI is not adjusted for chance.

In [80]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Purity
def purity_score(y_true, y_pred):
    """Overall purity of a predicted clustering against true labels.

    For each predicted cluster, count the documents that belong to its most
    frequent true label; purity is the sum of those counts divided by the
    total number of documents.

    Labels may be any values ``np.unique`` can sort (ints, including negative
    ones such as BERTopic's ``-1`` outlier topic, or strings). They are
    remapped to consecutive indices first, so a negative label can no longer
    wrap around and be counted in another cluster's column.

    Parameters
    ----------
    y_true : array-like
        True label of each document.
    y_pred : array-like
        Predicted cluster of each document, same length as ``y_true``.

    Returns
    -------
    float
        Purity in (0, 1]; 1.0 means every cluster contains a single true label.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()

    if true_arr.size != pred_arr.size:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {true_arr.size} and {pred_arr.size}"
        )
    if true_arr.size == 0:
        raise ValueError("purity_score requires at least one labelled document")

    # Map arbitrary label values to 0..k-1 row / column indices.
    true_classes, true_idx = np.unique(true_arr, return_inverse=True)
    pred_clusters, pred_idx = np.unique(pred_arr, return_inverse=True)

    # contingency[i, j] = number of documents with true class i in predicted cluster j.
    contingency = np.zeros((true_classes.size, pred_clusters.size), dtype=int)
    np.add.at(contingency, (true_idx, pred_idx), 1)

    return float(contingency.max(axis=0).sum() / contingency.sum())


# Pass in two lists: true_factors and predicted_factors
def evaluation_metrics(true_values, predicted_values):
    """Score predicted factors against true factors.

    Both inputs are lists of categorical values. Each list is label-encoded
    independently into integer ids before the metrics are computed.

    Returns
    -------
    tuple
        ``(ari, purity, nmi)``: Adjusted Rand Index, purity, and
        Normalized Mutual Information, in that order.
    """
    # Each labelling gets its own encoder, fitted only on its own values.
    encoded_true = LabelEncoder().fit_transform(true_values)
    encoded_pred = LabelEncoder().fit_transform(predicted_values)

    scores = (
        adjusted_rand_score(encoded_true, encoded_pred),          # Adjusted Rand Index
        purity_score(encoded_true, encoded_pred),                 # Purity
        normalized_mutual_info_score(encoded_true, encoded_pred), # Normalized Mutual Information
    )
    return scores