<a href="https://colab.research.google.com/github/KinzaaSheikh/lm_research_notes/blob/main/LM_Pipeline_to_Extract_Finance_ArXiv_Papers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qU umap-learn bertopic hdbscan sentence-transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/153.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

# Step 1: Load ArXiv Data and Extract Finance-related Papers

In [2]:
from datasets import load_dataset

# Load ArXiv data
dataset = load_dataset("maartengr/arxiv_nlp")["train"]

README.md:   0%|          | 0.00/617 [00:00<?, ?B/s]

data.csv:   0%|          | 0.00/53.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
# Filter finance-related abstracts (quick keyword search)
finance_keywords = ["finance", "financial", "economics", "market", "investment", "risk", "trading"]

finance_data = [
    (title, abstract)
    for title, abstract in zip(dataset["Titles"], dataset["Abstracts"])
    if any(kw.lower() in (title + " " + abstract).lower() for kw in finance_keywords)
]

titles, abstracts = zip(*finance_data)

# Step 2: Embed Documents

In [4]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("thenlper/gte-small")
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

# Step 3: Dimensionality Reduction & Clustering

In [13]:
from umap import UMAP
from hdbscan import HDBSCAN

umap_model = UMAP(n_components=10, min_dist=0.1, metric="cosine", random_state=42)
reduced_embeddings_5d = umap_model.fit_transform(embeddings)

hdbscan_model = HDBSCAN(min_cluster_size=10, metric="euclidean", cluster_selection_method="eom")
clusters = hdbscan_model.fit_predict(reduced_embeddings_5d)


# Step 4: Topic Modeling with BERTopic

In [14]:
from bertopic import BERTopic

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True
).fit(abstracts, embeddings)

topic_info = topic_model.get_topic_info()
topic_info.head()

2025-09-04 16:38:48,227 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-04 16:38:54,785 - BERTopic - Dimensionality - Completed ✓
2025-09-04 16:38:54,787 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-04 16:38:54,833 - BERTopic - Cluster - Completed ✓
2025-09-04 16:38:54,838 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-04 16:38:55,090 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,492,-1_the_and_of_to,"[the, and, of, to, in, we, for, that, on, is]",[ Converting text into the structured query l...
1,0,186,0_financial_the_sentiment_of,"[financial, the, sentiment, of, and, in, to, a...",[ Financial sentiment analysis is a challengi...
2,1,94,1_translation_the_decoding_mbr,"[translation, the, decoding, mbr, of, nmt, to,...",[ Despite the progress in machine translation...
3,2,59,2_the_to_and_of,"[the, to, and, of, in, llms, chatgpt, we, ques...",[ Large Language Models (LLMs) are widely use...
4,3,56,3_clinical_notes_patient_of,"[clinical, notes, patient, of, and, the, to, p...",[ Objective: Social determinants of health (S...


# Step 5: Exploration

In [15]:
topic_model.find_topics("financial markets")

([0, 12, -1, 11, 19],
 [np.float32(0.8633349),
  np.float32(0.83803725),
  np.float32(0.8312428),
  np.float32(0.82729554),
  np.float32(0.8271398)])

In [16]:
topic_model.find_topics("risk management")

([3, -1, 12, 19, 0],
 [np.float32(0.8321667),
  np.float32(0.829653),
  np.float32(0.82549137),
  np.float32(0.8244915),
  np.float32(0.8221344)])

In [17]:
topic_model.find_topics("cryptocurrency")

([0, -1, 11, 17, 20],
 [np.float32(0.8117271),
  np.float32(0.8088007),
  np.float32(0.79419255),
  np.float32(0.79263014),
  np.float32(0.7879696)])

In [18]:
# inspect

topic_model.get_topic(5)

[('depression', np.float64(0.04299295682458926)),
 ('mental', np.float64(0.03169507087380934)),
 ('the', np.float64(0.030844617500248483)),
 ('of', np.float64(0.029297856867399892)),
 ('and', np.float64(0.0261174974048203)),
 ('to', np.float64(0.024579173481721096)),
 ('media', np.float64(0.023574579057489293)),
 ('social', np.float64(0.02351439452219616)),
 ('on', np.float64(0.022621889680981347)),
 ('we', np.float64(0.022306491597761557))]

# Step 6: Visualization

In [19]:
fig = topic_model.visualize_topics()
fig.show()

In [20]:
fig2 = topic_model.visualize_barchart(top_n_topics=15)
fig2.show()