In [None]:
import polars as pl
import pathlib

# Glob pattern, presumably matching the per-batch embedding parquet files
# (not referenced by any of the cells visible in this notebook section).
DATA_PATH = "../data/processed/embeddings_batches/batch_*.parquet"
# Directory of processed parquet files; later cells build paths as `DATA_DIR / <name>`.
DATA_DIR = pathlib.Path("../data/processed")

In [None]:
# Define `lf` in this cell so the notebook runs top-to-bottom on a fresh kernel.
# The file scanned here is the one the topic-modelling cells later pair
# row-by-row with the fused embeddings, so the autoencoder must be fed from it.
# NOTE(review): confirm this is the intended source rather than DATA_PATH.
lf = pl.scan_parquet(DATA_DIR / "yelp_reviews_with_embeddings.parquet")

# Every column except the raw text, the state and the embedding is treated as metadata.
metadata_df = lf.drop(["text", "state", "embedding"]).collect()
# The embedding column is materialised on its own.
embeddings_df = lf.select("embedding").collect()

In [None]:
import polars as pl

# Lazily scan the combined reviews-with-embeddings parquet file and preview its first rows.
embed_lf = pl.scan_parquet("../data/processed/yelp_reviews_with_embeddings.parquet")
embed_lf.head().collect()

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def reshape_metadata(df: pl.DataFrame, new_range: tuple = (-1, 1)) -> np.ndarray:
    """
    Turn the metadata frame into a min-max scaled numpy matrix.

    Args:
        df: Metadata columns; converted with `df.to_numpy()`.
        new_range: `(min, max)` pair handed to `MinMaxScaler(feature_range=...)`.

    Returns:
        The array obtained by fitting the scaler on the data and transforming
        that same data. The fitted scaler itself is not kept.
    """
    raw_values = df.to_numpy()
    min_max = MinMaxScaler(feature_range=new_range)
    # Fit and apply on the same matrix, i.e. what fit_transform does in one call.
    return min_max.fit(raw_values).transform(raw_values)

In [None]:
import torch
import torch.nn as nn

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class SimpleFusionAE(nn.Module):
    """
    Autoencoder over the concatenation of a text vector and a metadata vector.

    The encoder maps the fused input through one 128-unit ReLU layer down to
    `latent_dim`; the decoder mirrors it back to the fused input width.
    """

    def __init__(self, text_dim: int, meta_dim: int, latent_dim=50):
        """
        Args:
            text_dim: Width of the text input.
            meta_dim: Width of the metadata input.
            latent_dim: Width of the bottleneck representation.
        """
        super().__init__()

        fused_width = text_dim + meta_dim
        hidden_width = 128

        # Layers are created encoder-first so parameter initialisation order
        # and state_dict keys (encoder.0, encoder.2, decoder.0, decoder.2) stay stable.
        self.encoder = nn.Sequential(
            nn.Linear(fused_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, fused_width),
        )

    def forward(self, text, metadata):
        """
        Concatenate both inputs along dim 1, encode, then decode.

        Returns:
            A `(reconstruction, latent)` tuple.
        """
        fused = torch.cat([text, metadata], dim=1)
        code = self.encoder(fused)
        return self.decoder(code), code

In [None]:
# Scale the metadata with the default (-1, 1) range and pull the embeddings out as a numpy array.
metadata_arr = reshape_metadata(metadata_df)
embeddings_arr = embeddings_df["embedding"].to_numpy()

# The input widths are read from axis 1, so both arrays are expected to be 2-D
# (rows, features); latent_dim keeps its default of 50.
model = SimpleFusionAE(text_dim=embeddings_arr.shape[1], meta_dim=metadata_arr.shape[1]).to(device)
model

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Place both inputs on the same device as the model. Using `device` instead of
# a hard-coded `.cuda()` keeps the notebook runnable on CPU-only machines.
x_embeddings = torch.tensor(embeddings_arr, dtype=torch.float32).to(device)
x_metadata = torch.tensor(metadata_arr, dtype=torch.float32).to(device)

# Pair each embedding row with its metadata row; batches are shuffled for training.
dataset = TensorDataset(x_embeddings, x_metadata)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
import torch.optim
from tqdm import tqdm

# Optimisation hyperparameters.
LEARNING_RATE = 1e-3
EPOCHS = 20

# Training the model: plain SGD minimising the mean squared reconstruction error.
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()

for epoch in tqdm(range(EPOCHS), desc="Training model"):
    model.train()
    total_loss = 0

    for batch_text, batch_meta in dataloader:
        optimizer.zero_grad()

        reconstructed_x, latent_representation = model(batch_text, batch_meta)

        # The target is the same concatenation that forward() builds internally.
        target = torch.cat((batch_text, batch_meta), dim=1)
        loss = criterion(reconstructed_x, target)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss / len(dataloader):.6f}")

In [None]:
# Switch the model to evaluation mode before the latent vectors are extracted.
model.eval()

In [None]:
# Encode every row in fixed-size batches and move each batch of latent vectors
# to the CPU as soon as it is produced, so the results accumulate in host
# memory instead of on the device (avoids OOM).

# shuffle=False keeps the output rows in the same order as the input tensors.
inference_dataset = TensorDataset(x_embeddings, x_metadata)
inference_loader = DataLoader(inference_dataset, batch_size=32, shuffle=False)

fused_embeddings_list = []
# Inference only: no autograd graph is needed.
with torch.no_grad():
    for batch_text, batch_meta in inference_loader:
        # Only the latent representation is kept; the reconstruction is discarded.
        _, batch_latent = model(batch_text, batch_meta)
        batch_latent_cpu = batch_latent.cpu().numpy()
        fused_embeddings_list.append(batch_latent_cpu)

# Stack the per-batch arrays into one array and display its shape.
fused_embeddings = np.vstack(fused_embeddings_list)
fused_embeddings.shape

In [None]:
# Store the latent vectors under DATA_DIR, the same location the later cells read them from.
pl.DataFrame({"fused_embeddings": fused_embeddings}).write_parquet(DATA_DIR / "fused_embeddings.parquet")

In [None]:
# Re-open the saved fused embeddings lazily and display the per-column count.
fused_embeddings_lf = pl.scan_parquet(DATA_DIR / "fused_embeddings.parquet")
fused_embeddings_lf.count().collect()

In [None]:
# Scan the source reviews file and display its per-column counts, for comparison
# with the count of the fused embeddings file.
lf = pl.scan_parquet(DATA_DIR / "yelp_reviews_with_embeddings.parquet")
lf.count().collect()

In [None]:
# Persist only the model's state_dict (its parameters), not the module object itself.
torch.save(model.state_dict(), "../models/fusion_autoencoder.weights.pth")

In [None]:
#from umap import UMAP
from cuml.manifold import UMAP
#from hdbscan import HDBSCAN
from cuml.cluster import HDBSCAN
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

def build_bertopic() -> BERTopic:
    """
    Create a BERTopic pipeline with the configuration shared by both runs.

    Every call builds fresh UMAP, HDBSCAN, vectorizer and c-TF-IDF instances,
    so the returned models share no fitted state.

    Returns:
        An unfitted BERTopic model.
    """
    return BERTopic(
        # With the CPU `umap.UMAP` alternative, `low_memory=True` was used as an extra argument.
        umap_model=UMAP(n_components=5, min_dist=0.0, metric="cosine"),
        hdbscan_model=HDBSCAN(min_cluster_size=15, prediction_data=True),
        vectorizer_model=CountVectorizer(stop_words="english"),
        ctfidf_model=ClassTfidfTransformer(),
    )


# Two identically configured, independent models: one is fitted on the original
# embeddings, the other on the autoencoder's fused embeddings.
vanilla_bertopic = build_bertopic()
modded_bertopic = build_bertopic()

In [None]:
import polars as pl

# Upper bound on the number of documents passed to the topic models.
max_rows = 1_000_000

lf = pl.scan_parquet(DATA_DIR / "yelp_reviews_with_embeddings.parquet")

# NOTE: despite the `_lf` suffix, read_parquet loads this file eagerly (a DataFrame).
encoder_lf = pl.read_parquet(DATA_DIR / "fused_embeddings.parquet")
encoder_embeddings = encoder_lf.select("fused_embeddings").to_series().to_numpy()
encoder_embeddings = encoder_embeddings[:max_rows]
total_rows = encoder_embeddings.shape[0]

# Take the same number of leading rows from the reviews file. Documents, original
# embeddings and fused embeddings are matched purely by row position.
lf = lf.slice(0, total_rows)
docs = lf.select("text").collect().to_series().to_list()
vanilla_embeddings = lf.select("embedding").collect().to_series().to_numpy()

In [None]:
# Fit the baseline model on the documents using the original, precomputed embeddings.
vanilla_topics, vanilla_probs = vanilla_bertopic.fit_transform(
    docs, embeddings=vanilla_embeddings
)

In [None]:
# Fit the second model on the same documents using the autoencoder's fused embeddings.
modded_topics, modded_probs = modded_bertopic.fit_transform(
    docs, embeddings=encoder_embeddings
)

In [None]:
# Directory that receives the topic tables and per-document assignments.
OUTPUT_DIR = pathlib.Path("../data/output")
# Create it up front so the export cells do not fail when it does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Export the baseline model's topic overview table as CSV.
vanilla_bertopic.get_topic_info().to_csv(OUTPUT_DIR / "yelp_vanilla_topics.csv")

In [None]:
# Export the fused-embedding model's topic overview table as CSV.
modded_bertopic.get_topic_info().to_csv(OUTPUT_DIR / "yelp_autoencoder_topics.csv")

In [None]:
# Export the baseline model's per-document information for `docs` as parquet.
vanilla_bertopic.get_document_info(docs).to_parquet(OUTPUT_DIR / "yelp_vanilla_topic_assignments.parquet")

In [None]:
# Export the fused-embedding model's per-document information for `docs` as parquet.
modded_bertopic.get_document_info(docs).to_parquet(OUTPUT_DIR / "yelp_autoencoder_topic_assignments.parquet")

In [None]:
import itertools
import numpy as np
import rbo


def compute_bertopic_irbo(m: BERTopic, topk: int = 10, p: float = 0.9) -> float:
    """
    Computes the IRBO topic-diversity score for a fitted BERTopic model.

    Args:
        m: Model whose `get_topics()` maps topic ids to `(word, score)` lists.
        topk: How many top words per topic are compared.
        p: RBO persistence parameter, forwarded to `compute_irbo`.

    Returns:
        The IRBO score returned by `compute_irbo` for the model's topic words.
    """
    topic_words = [
        # Keep the words and ignore their scores. Empty strings are dropped:
        # BERTopic can pad short topics with "" placeholders, and repeated
        # placeholders would make a ranking contain duplicate items.
        [word for word, _score in word_scores if word]
        for topic_id, word_scores in m.get_topics().items()
        # Skips outlier topic -1
        if topic_id != -1
    ]
    return compute_irbo(topic_words, topk=topk, p=p)


def compute_irbo(topics: list[list[str]], topk: int = 10, p: float = 0.9) -> float:
    """
    Calculates Inverted Rank-Biased Overlap (IRBO) for a list of topics.

    Args:
        topics (list of list of str): A list where each element is a list of words (the topic).
        topk (int): How many top words to consider from each topic.
        p (float): The "p" parameter for RBO (usually 0.9).
                   Higher p puts more weight on lower-ranked words.

    Returns:
        float: The IRBO score (0.0 to 1.0).
               0.0 means topics are identical (bad).
               1.0 means topics are completely different (good).
               0.0 is also returned when fewer than two topics are given,
               because there is no pair to compare.
    """
    # 1. Truncate topics to top-k words
    truncated = [topic[:topk] for topic in topics]

    # 2./3. Score every unique pair with extrapolated RBO. The scores are
    # streamed straight into a float array, so neither the list of pairs nor a
    # Python list of scores is materialised (the pair count grows quadratically).
    scores = np.fromiter(
        (
            rbo.RankingSimilarity(first, second).rbo_ext(p=p)
            for first, second in itertools.combinations(truncated, 2)
        ),
        dtype=float,
    )

    if scores.size == 0:
        return 0.0

    # 4./5. Average the RBO scores and invert to get diversity:
    # 1 means diverse (good), 0 means redundant (bad).
    return float(1.0 - scores.mean())

In [None]:
# Topic diversity (IRBO) of the baseline model, with the default topk and p.
compute_bertopic_irbo(vanilla_bertopic)

In [None]:
# Topic diversity (IRBO) of the fused-embedding model, with the default topk and p.
compute_bertopic_irbo(modded_bertopic)

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

def coherence(m: BERTopic, docs: list[str], topics: list[int], coherence: str = "c_v") -> float:
    """
    Computes coherence for topic model.
    Adapted from https://github.com/MaartenGr/BERTopic/issues/90

    Args:
        m: Fitted BERTopic model; its `vectorizer_model` and `get_topic` are used.
        docs: The documents the model was fitted on.
        topics: Topic id assigned to each document (-1 marks outliers).
        coherence: Name of the gensim coherence measure, e.g. "c_v" or "c_npmi".

    Returns:
        The value of `CoherenceModel.get_coherence()`.
    """
    # Use the vectorizer's full analyzer rather than only its tokenizer, so the
    # documents go through the same preprocessing that produced the topic words
    # and both sides share one vocabulary.
    analyzer = m.vectorizer_model.build_analyzer()

    tokens = [analyzer(doc) for doc in docs]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Iterate over the topic ids that actually occur, excluding the noise topic
    # -1, instead of assuming that -1 is present and that ids are contiguous.
    topic_words = []
    for topic_id in sorted(set(topics) - {-1}):
        # `or []` guards against a falsy result for an unknown topic id;
        # empty-string placeholder words are skipped.
        words = [word for word, _ in (m.get_topic(topic_id) or []) if word]  # type: ignore
        if words:
            topic_words.append(words)

    cm = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        dictionary=dictionary,
        corpus=corpus,
        coherence=coherence
    )
    return cm.get_coherence()

In [None]:
# Coherence of the baseline model's topics, using the "c_npmi" measure.
coherence(vanilla_bertopic, docs, vanilla_topics, coherence="c_npmi")

In [None]:
# Coherence of the fused-embedding model's topics, using the "c_npmi" measure.
coherence(modded_bertopic, docs, modded_topics, coherence="c_npmi")