In [None]:
from dotenv import dotenv_values
import polars as pl
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora.dictionary import Dictionary


# Number of worker processes handed to LdaMulticore.
WORKERS = 4
# Number of topics the LDA model is asked to fit.
NUM_TOPICS = 5
# Key/value settings parsed from the project's .env file.
CONFIG = dotenv_values("../.env")
# Path of the preprocessed communications CSV, relative to this notebook.
# Single quotes are used inside the replacement field on purpose: reusing the
# outer double quotes within an f-string is only valid on Python 3.12+ (PEP 701)
# and is a SyntaxError on earlier interpreters.
FILE = f"../{CONFIG['DATA_DIR']}/communications_preprocessed.csv"

In [None]:
df = pl.read_csv(FILE)

# Tokenise each document on whitespace. polars reads empty CSV fields as null
# by default, so a document that ended up empty after preprocessing arrives
# here as None; treat it as an empty token list instead of crashing on
# `None.split()`. Keeping the (empty) document preserves row alignment with `df`.
texts = [(s or "").split() for s in df["stemmed_text"]]

# Map every distinct token to an integer id ...
gensim_dict = Dictionary(documents=texts)
# ... and encode each document as a bag of words over those ids.
corpus = [gensim_dict.doc2bow(t) for t in texts]

In [None]:
# Seed for the model's random initialisation so that re-running the notebook
# does not produce a completely different set of topics each time.
# NOTE: with several workers gensim documents that results can still vary
# somewhat because of process scheduling.
RANDOM_STATE = 42

lda = LdaMulticore(
    corpus,
    num_topics=NUM_TOPICS,
    workers=WORKERS,
    id2word=gensim_dict,
    random_state=RANDOM_STATE,
)

In [None]:
import gensim.models.basemodel
import numpy as np


def _ecdf_ranks(scores: np.ndarray) -> np.ndarray:
    """Return, for every row, each entry's 1-based rank divided by the row length."""
    # `argsort` gives the permutation that would sort a row: position i holds
    # the index of the i-th smallest entry. Applying `argsort` to that
    # permutation inverts it, so position i then holds the sorted position
    # (the 0-based rank) of entry i. Example for x = [20, 30, 10, 4, 65]:
    #   argsort(x) + 1          -> [4 3 1 2 5]   (not the ranks)
    #   argsort(argsort(x)) + 1 -> [3 4 2 1 5]   (same as R's `rank(x)`)
    # NOTE: tied values get distinct consecutive ranks here (stable order),
    # whereas R's `rank` averages them by default.
    order = np.argsort(scores, axis=1, kind="stable")
    ranks = np.argsort(order, axis=1, kind="stable") + 1
    return ranks / scores.shape[1]


def exclusivity(
    model: "gensim.models.basemodel.BaseTopicModel",
    top_words=10,
    exclusivity_weight=0.7,
) -> np.ndarray:
    """Compute the FREX (frequency/exclusivity) score of every word in every topic.

    The score is the weighted harmonic mean of two empirical CDF values, both
    taken over the vocabulary within a single topic:

    * exclusivity: rank of the word's share of probability mass across topics
      (the topic-word probability divided by the word's total over all topics);
    * frequency: rank of the raw topic-word probability.

    This mirrors the R reference quoted by the original author,
    `ex <- apply(mat, 2, rank) / nrow(mat)`, which ranks down each column and
    divides by the number of rows. Because `model.get_topics()` is laid out as
    topics x words, the equivalent ranking here runs along axis 1.

    Args:
        model: Fitted topic model exposing `get_topics()`, a
            (num_topics, vocab_size) array of topic-word probabilities.
        top_words: Currently unused; kept so existing calls keep working.
            NOTE(review): presumably meant to restrict the score to each
            topic's top words -- not implemented yet.
        exclusivity_weight: Weight given to the exclusivity term; the
            frequency term gets `1 - exclusivity_weight`.

    Returns:
        Array of shape (num_topics, vocab_size) with the FREX score of each
        word in each topic. Scores lie in (0, 1] for weights in [0, 1].
    """
    word_probs = np.asarray(model.get_topics())

    # Share of each word's probability mass that belongs to each topic:
    # every column (word) is normalised to sum to one across topics.
    col_sums = word_probs.sum(axis=0)
    exclusivity_matrix = word_probs / col_sums

    # Ranks are 1-based, so both ECDF values are strictly positive and the
    # harmonic mean below needs no epsilon to avoid division by zero.
    ex = _ecdf_ranks(exclusivity_matrix)
    fr = _ecdf_ranks(word_probs)

    frex = 1.0 / (exclusivity_weight / ex + (1 - exclusivity_weight) / fr)
    return frex
# Score the fitted LDA model with the default settings; as the last expression
# of the cell, the returned value is what the notebook displays.
exclusivity(model=lda)

In [None]:
# Sanity check of the ranking trick: one argsort yields the permutation that
# sorts `x`; applying argsort to that permutation yields each element's rank.
# One is added when printing so the values read as 1-indexed positions.
x = np.array([20, 30, 10, 4, 65])
sorted_x = x.argsort()
doubly_sorted_x = sorted_x.argsort()

print("1st sort: ", sorted_x + 1)
print("2nd sort: ", doubly_sorted_x + 1)