In [None]:
# !pip install umap-learn
# !pip install hdbscan
# !pip install seaborn
# !pip install bertopic
# !pip install nbformat
# !pip install 'bertopic[spacy]'

In [1]:
from datasets import load_dataset

dataset = load_dataset('maartengr/arxiv_nlp')['train']


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Extract specific metadata
abstracts = dataset["Abstracts"]
years = dataset["Years"]
categories = dataset["Categories"]
titles = dataset["Titles"]

A typic cluster pipeline is 

1. Embeddocuments
2. Reducedimensionality 
3. Clusterembeddings

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(abstracts, show_progress_bar=True)

Before we cluster the embeddings we generated from the ArXiv abstracts, we need to take care of the curse of dimensionality first. This curse is a phenomenon that occurs when dealing with high-dimensional data. As the number of dimensions increases, there is an exponential growth of the number of possible values within each dimension. Finding all subspaces within each dimension becomes increasingly complex. Moreover, as the number of dimensions grows, the concept of distance between points becomes increasingly less precise.

In [None]:
from umap import UMAP

umap_model = UMAP(n_neighbors=15, n_components=5, metric='cosine')

reduced_embeddings = umap_model.fit_transform(embeddings)

In [None]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=15, min_samples=1, metric='euclidean', cluster_selection_method='eom')
hdbscan_model.fit(reduced_embeddings)
labels = hdbscan_model.labels_

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np

reduced_embeddings = UMAP(n_neighbors=15, n_components=2, metric='cosine').fit_transform(embeddings)
df = pd.DataFrame(np.hstack((reduced_embeddings, labels.reshape(-1, 1))), columns=["x", "y", "cluster"])
df.cluster = df.cluster.astype(int).astype(str)
sns.scatterplot(data=df, x='x', y='y', hue='cluster', linewidth=0, legend=False, s=3, alpha=0.3)


In [None]:
for index in np.where(labels==1)[0][:3]:
    print(abstracts[index])

Topic Models 
Latent Dirichlet Allocation (LDA; blei2003latent) is a classical and popular approach to topic modeling that assumes that each topic is characterized by a probability distribution over words in a corpus vocabulary. Each document is to be considered a mixture of topics

A bit morden way is BERTopic which is a pipeline like follows
Sentence BERT -> UMAP -> HDSCAN -> CountVector -> c-IDF-TF 


In [4]:
from bertopic import BERTopic

topic_model = BERTopic()
topics, probabilities = topic_model.fit_transform(documents=abstracts)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

The standard bertopic implementation

In [None]:
topic_model.visualize_documents(docs=titles, topics=[0, 1, 2, 3, 4, 6, 7, 10, 12,13, 16, 33, 40, 45, 46, 65])

In [None]:
topic_model.get_topic_info(topics[titles.index("Linguistic Information Energy")])
abstracts[titles.index("Linguistic Information Energy")]

In [5]:
from copy import deepcopy

original_topics = deepcopy(topic_model.topic_representations_)

In [13]:
def topic_differences(topic_model, original_topics, max_length=60):
    topic_number = len(original_topics) - 1
    for topic in range(topic_number):
        og_words = " | ".join(list(zip(*original_topics[topic][:5]))[0])
        new_words = " | ".join(list(zip(*topic_model.topic_representations_[topic][:5]))[0])
        space = " " * max(0, (max_length - len(og_words)))
        print(f"Topic {topic}: {og_words}{space} -> {new_words}")

In [None]:
from bertopic.representation import KeyBERTInspired

key_bert_inspired_model = KeyBERTInspired()
key_bert_model = BERTopic(representation_model=key_bert_inspired_model)
key_bert_model.fit_transform(documents=abstracts)


In [None]:
topic_differences(key_bert_model, original_topics)

In [None]:
topic_differences(key_bert_model, original_topics)

In [None]:
from bertopic.representation._pos import PartOfSpeech
pos = PartOfSpeech("en_core_web_sm")

key_bert_model.update_topics(abstracts, representation_model=pos)
topic_differences(key_bert_model, original_topics)

In [None]:
# Maximal Marginal Relevance
from bertopic.representation import MaximalMarginalRelevance

mmr_model = MaximalMarginalRelevance(diversity=0.5)
topic_model.update_topics(abstracts, representation_model=mmr_model)

topic_differences(topic_model, original_topics)

In [11]:
prompt = """
I have topic that contains the following documents: \n[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the above information, can you give a short label of the topic?
"""

In [15]:
from transformers import pipeline
from bertopic.representation import TextGeneration

text_generator = pipeline("text2text-generation", model="google/flan-t5-base")
representation_model = TextGeneration(text_generator, prompt=prompt)

topic_model.update_topics(docs=abstracts, representation_model=representation_model)

topic_differences(topic_model, original_topics)


Token indices sequence length is longer than the specified maximum sequence length for this model (1216 > 512). Running this sequence through the model will result in indexing errors


KeyboardInterrupt: 

Topic 0: speech | asr | recognition | acoustic | endtoend             -> ASR and TTS for reinforcement learning |  |  |  | 
Topic 1: hate | offensive | detection | speech | toxic                -> Automatic Hate Speech Detection on Social Media |  |  |  | 
Topic 2: summarization | summaries | summary | abstractive | extractive -> Summarization without ground-truth summaries |  |  |  | 
Topic 3: prompt | fewshot | prompts | incontext | tuning              -> CP-Tuning |  |  |  | 
Topic 4: ner | named | entity | nested | recognition                  -> Named Entity Recognition |  |  |  | 
Topic 5: clinical | medical | notes | patient | patients              -> Attention-Based Deep Learning for Clinical Progress Notes |  |  |  | 
Topic 6: word | embeddings | embedding | vectors | similarity         -> Word embeddings |  |  |  | 
Topic 7: bias | gender | biases | debiasing | fairness                -> Gender bias in natural language processing |  |  |  | 
Topic 8: parsing | dependency | pa