In [None]:
import polars as pl
import numpy as np
import pathlib

# Root directory for input data, relative to the notebook's working directory.
DATA_DIR = pathlib.Path("../data")
# Number of reviews requested from the sampling step.
SAMPLE_SIZE = 10_000
# Shared seed handed to the sampler and to the multi-view clustering models.
SEED = 2349479

In [None]:
# Scan the parquet file lazily and collect just the head as a preview.
lf = pl.scan_parquet(DATA_DIR / "processed/yelp_reviews_with_embeddings.parquet")
lf.head().collect()

In [None]:
# Sampling
from typing import Optional

def sample_from_lf(
    lf: pl.LazyFrame,
    n: int,
    seed: Optional[int] = None,
    replace: bool = False
) -> pl.LazyFrame:
    """Lazily select a random subset of rows, identified by the ``index`` column.

    The sample is drawn from the values actually present in ``index`` rather
    than from the positions ``0..N-1``. When ``index`` already is the ordered
    range ``0..N-1`` the draw is identical to sampling positions; when it is
    not (gaps, offset, different order), sampling positions would match fewer
    than ``n`` rows, which this avoids.

    Args:
        lf: Lazy frame that contains an ``index`` column.
        n: Number of index values to draw.
        seed: Seed for ``np.random.default_rng``; ``None`` gives a fresh,
            non-reproducible generator.
        replace: Whether to draw with replacement. Because the result is
            built with an ``is_in`` filter, repeated draws of the same value
            select that row only once, so fewer than ``n`` rows may be
            returned.

    Returns:
        A lazy frame filtered to the rows whose ``index`` was drawn. Rows keep
        the order of ``lf``, not the order of the draw.

    Raises:
        ValueError: Raised by ``Generator.choice`` when ``n`` exceeds the
            number of available index values and ``replace`` is False.
    """
    rng = np.random.default_rng(seed)
    # Only the (non-null) index column is materialized; the rest stays lazy.
    index_values = (
        lf.select("index")
        .drop_nulls()
        .collect()
        .get_column("index")
        .to_numpy()
    )
    sample_idxs = rng.choice(index_values, size=n, replace=replace)
    return lf.filter(pl.col("index").is_in(sample_idxs))

# Draw the seeded sample lazily, then collect only the head as a preview.
sampled_lf = sample_from_lf(lf, n=SAMPLE_SIZE, seed=SEED)
sampled_lf.head().collect()

In [None]:
# Materialize the sample into an eager DataFrame, minus the columns that are
# not used downstream.
df = (
    sampled_lf
    .drop([
        # Dropped because these columns follow unusual distributions
        # and do not look particularly helpful
        "user_review_count",
        "business_review_count",
        # Ignoring state for now to avoid
        # making the data too high-dimensional
        "state"
    ]) 
    .collect()
)
df.head()

In [None]:
def min_max_scaler(col: str):
    """Build a polars expression that min-max scales the column ``col``.

    The expression is ``(value - min) / (max - min)`` over the column. There
    is no guard for a constant column, where ``max - min`` is zero.

    Args:
        col: Name of the column to scale.

    Returns:
        The (unevaluated) polars expression.
    """
    column = pl.col(col)
    lowest = column.min()
    value_range = column.max() - lowest
    return (column - lowest) / value_range

# Columns that make up the metadata view used for multi-view clustering.
metadata_cols = [
    "date",
    "stars",
    "user_average_stars",
    "yelping_since",
    "business_stars"
]

# One min-max expression per metadata column, in the same order.
scaling_expressions = list(map(min_max_scaler, metadata_cols))

# Apply all scaling expressions in a single pass over the frame.
df = df.with_columns(scaling_expressions)
df

In [None]:
# Keep only the scaled metadata columns; this frame becomes the second view.
metadata_df = df.select(metadata_cols)
metadata_df

In [None]:
from mvlearn.cluster import MultiviewSpectralClustering

# Number of clusters requested from the multi-view model in this first run.
N_CLUSTERS = 20

# View 1: text embeddings; view 2: the min-max scaled metadata columns.
# NOTE(review): assumes `embedding` converts to a 2-D (rows x dims) array —
# confirm the column dtype.
embeddings = df["embedding"].to_numpy()
metadata = metadata_df.to_numpy()
# Both views must describe the same rows.
assert len(embeddings) == len(metadata)
Xs = [embeddings, metadata]

mvc = MultiviewSpectralClustering(
    n_clusters=N_CLUSTERS,
    random_state=SEED,
    n_init=10
)

# Cluster on both views jointly and attach the labels as a `topics` column.
cluster_labels = mvc.fit_predict(Xs)
df_results = df.with_columns(
    pl.Series(name="topics", values=cluster_labels)
)
df_results.head()

In [None]:
# Sanity check: number of null entries in the assigned `topics` column.
df_results["topics"].null_count()

In [None]:
# Sanity check: does any returned label compare equal to None?
# NOTE(review): presumably `cluster_labels` is a numpy array, making `== None`
# an element-wise comparison — confirm.
any(cluster_labels == None)

In [None]:
# Labels stored on the fitted multi-view model.
mvc.labels_

In [None]:
from sklearn.base import ClusterMixin, BaseEstimator
from typing import Optional

class MVCWrapper(BaseEstimator, ClusterMixin):
    """Adapter exposing a multi-view clusterer through a single-view interface.

    The wrapper stores a fixed second view (``metadata``). Whenever it is
    given a single matrix ``X`` it pairs it with that stored view as
    ``[X, metadata]`` and forwards the pair to the wrapped ``model``.

    Args:
        model: Multi-view clustering estimator. It must accept a list of
            views in ``fit`` and expose ``labels_`` afterwards; ``predict``
            is only required if :meth:`predict` is called.
        metadata: Second view, one row per row of the ``X`` passed later.

    Attributes:
        labels_: Labels copied from ``model.labels_`` by :meth:`fit`;
            ``None`` before fitting.
    """

    metadata: np.ndarray
    labels_: Optional[np.ndarray]

    def __init__(self, model, metadata: np.ndarray):
        self.model = model
        self.metadata = metadata
        self.labels_ = None

    def _build_views(self, X) -> list:
        """Validate ``X`` against the stored metadata and return both views.

        Raises:
            ValueError: If ``X`` and the metadata differ in length.
        """
        n_rows, n_metadata = len(X), len(self.metadata)
        if n_rows != n_metadata:
            # Report both lengths so the mismatch can be diagnosed.
            raise ValueError(
                "Metadata and textual embeddings must have the same length. "
                f"Found {n_rows} embeddings and {n_metadata} metadata rows"
            )
        # Joins textual embeddings and metadata
        # to prepare for Multi-View Clustering
        return [X, self.metadata]

    def fit(self, X, y=None):
        """Fit the wrapped model on ``[X, metadata]`` and store its labels.

        Args:
            X: First view; must have as many rows as the stored metadata.
            y: Ignored. Accepted so the wrapper can be called with the
                conventional ``fit(X, y)`` signature.

        Returns:
            ``self``.

        Raises:
            ValueError: If ``X`` and the metadata differ in length.
        """
        self.model.fit(self._build_views(X))
        self.labels_ = self.model.labels_
        return self

    def predict(self, X):
        """Delegate prediction on ``[X, metadata]`` to the wrapped model.

        NOTE(review): requires the wrapped model to implement ``predict`` —
        confirm for the estimator in use.

        Raises:
            ValueError: If ``X`` and the metadata differ in length.
        """
        return self.model.predict(self._build_views(X))

In [None]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Baseline BERTopic pipeline: UMAP reduction, HDBSCAN clustering, then
# CountVectorizer + c-TF-IDF topic representations.
vanilla_bertopic = BERTopic(
    # UMAP is stochastic; seed it so repeated runs of the notebook produce
    # the same projection and therefore the same topics.
    umap_model=UMAP(n_components=5, min_dist=0.0, metric="cosine", random_state=SEED),
    hdbscan_model=HDBSCAN(min_cluster_size=15, prediction_data=True),
    vectorizer_model=CountVectorizer(stop_words="english"),
    ctfidf_model=ClassTfidfTransformer()
)

In [None]:
# Fit the baseline model on the review texts, reusing the precomputed
# embeddings instead of letting BERTopic embed the documents itself.
vanilla_topics, vanilla_probs = vanilla_bertopic.fit_transform(
    df["text"].to_list(), embeddings=embeddings
)

In [None]:
# Per-topic summary table for the baseline model.
vanilla_bertopic.get_topic_info()

In [None]:
# Match the number of clusters to the number of rows in the baseline's
# topic table.
# NOTE(review): if that table includes an outlier row (topic -1), it is
# counted here as well — confirm this is the intended cluster count.
mvc = MultiviewSpectralClustering(
    n_clusters=len(vanilla_bertopic.get_topic_info()),
    random_state=SEED,
    n_init=10
)

# The wrapper pairs whatever matrix BERTopic hands to the clustering step
# with the metadata view.
mvc_wrapper = MVCWrapper(mvc, metadata=metadata)

modded_bertopic = BERTopic(
    # UMAP is stochastic; seed it so repeated runs of the notebook produce
    # the same projection and therefore the same topics.
    umap_model=UMAP(n_components=5, min_dist=0.0, metric="cosine", random_state=SEED),
    hdbscan_model=mvc_wrapper,
    vectorizer_model=CountVectorizer(stop_words="english"),
    ctfidf_model=ClassTfidfTransformer()
)

In [None]:
# Fit the multi-view variant on the same texts and precomputed embeddings.
modded_topics, modded_probs = modded_bertopic.fit_transform(
    df["text"].to_list(), embeddings=embeddings
)

In [None]:
# Per-topic summary table for the multi-view model.
modded_bertopic.get_topic_info()

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

def coherence(m: BERTopic, docs: list[str], topics: list[int], coherence: str = "c_v") -> float:
    """
    Computes coherence for topic model.
    Code taken from https://github.com/MaartenGr/BERTopic/issues/90

    Args:
        m: Fitted BERTopic model.
        docs: Documents the model was fitted on.
        topics: Topic id assigned to each document.
        coherence: Name of the gensim coherence measure to compute.

    Returns:
        The coherence score reported by gensim's ``CoherenceModel``.
    """
    # Use the model's own analyzer (preprocessing, tokenization and stop-word
    # filtering) rather than the bare tokenizer, so document tokens take the
    # same form as the topic words derived from the vectorizer.
    analyzer = m.vectorizer_model.build_analyzer()

    tokens = [analyzer(doc) for doc in docs]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Score exactly the topic ids that were assigned, minus the outlier
    # topic -1. Deriving the ids from a count would drop the last real topic
    # whenever the labels contain no -1.
    topic_ids = sorted(set(topics) - {-1})
    topic_words = [
        [word for word, _ in m.get_topic(topic)] # type: ignore
        for topic in topic_ids
    ]

    cm = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        dictionary=dictionary,
        corpus=corpus,
        coherence=coherence
    )
    return cm.get_coherence()

In [None]:
import itertools
import numpy as np
import rbo


def compute_bertopic_irbo(m: BERTopic, topk: int = 10, p: float = 0.9) -> float:
    """Compute IRBO over the topic word lists of a BERTopic model.

    Args:
        m: Model whose ``get_topics()`` mapping supplies the topics.
        topk: Forwarded to ``compute_irbo``.
        p: Forwarded to ``compute_irbo``.

    Returns:
        The value returned by ``compute_irbo`` for the model's topics.
    """
    # Keep only the words (scores are discarded) and leave out outlier topic -1.
    word_lists = [
        [term for term, _weight in ranked_terms]
        for topic_id, ranked_terms in m.get_topics().items()
        if topic_id != -1
    ]
    return compute_irbo(word_lists, topk=topk, p=p)


def compute_irbo(topics: list[list[str]], topk: int = 10, p: float = 0.9) -> float:
    """
    Inverted Rank-Biased Overlap (IRBO) of a collection of topics.

    Every unordered pair of topics is compared with extrapolated RBO on
    their first ``topk`` words; the result is one minus the mean of those
    pairwise scores.

    Args:
        topics (list of list of str): One ranked word list per topic.
        topk (int): Number of leading words kept from each topic.
        p (float): The RBO ``p`` parameter passed to ``rbo_ext``.

    Returns:
        float: ``1 - mean pairwise RBO``. Returns 0.0 when fewer than two
               topics are given, since there is no pair to compare.
    """
    truncated = [words[:topk] for words in topics]

    # Extrapolated RBO for every unordered pair of truncated topics.
    similarities = [
        rbo.RankingSimilarity(first, second).rbo_ext(p=p)
        for first, second in itertools.combinations(truncated, 2)
    ]

    if len(similarities) == 0:
        return 0.0

    # Invert the average similarity to express diversity.
    return 1.0 - np.mean(similarities)

In [None]:
# Score both fitted models on the same documents, one table row per model.
review_texts = df["text"].to_list()
comparison = [
    {
        "model": label,
        "coherence": coherence(fitted_model, review_texts, assigned_topics),
        "exclusivity": compute_bertopic_irbo(fitted_model)
    }
    for label, fitted_model, assigned_topics in (
        ("Vanilla", vanilla_bertopic, vanilla_topics),
        ("Multi-View", modded_bertopic, modded_topics),
    )
]
pl.DataFrame(comparison)