<a href="https://colab.research.google.com/github/KinzaaSheikh/lm_research_notes/blob/main/LM_Pipeline_to_Extract_Finance_ArXiv_Papers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU umap-learn bertopic hdbscan sentence-transformers

# Step 1: Load ArXiv Data and Extract Finance-related Papers

In [None]:
from datasets import load_dataset

# Fetch the arXiv NLP corpus and keep only its "train" split,
# requested directly via `split` instead of indexing the returned dict.
dataset = load_dataset("maartengr/arxiv_nlp", split="train")

In [None]:
# Filter finance-related papers with a quick, case-insensitive substring search
# over "title + abstract". Substring matching is intentionally loose, so a
# keyword such as "market" also matches longer words like "marketing".
finance_keywords = ["finance", "financial", "economics", "market", "investment", "risk", "trading"]


def _contains_any_keyword(text, keywords_lower):
    """Return True if any of the already lower-cased keywords occurs in `text`.

    `text` is lower-cased once here, rather than once per keyword.
    """
    lowered = text.lower()
    return any(kw in lowered for kw in keywords_lower)


# Normalise the keywords a single time instead of for every document.
_finance_keywords_lower = [kw.lower() for kw in finance_keywords]

finance_data = [
    (title, abstract)
    for title, abstract in zip(dataset["Titles"], dataset["Abstracts"])
    if _contains_any_keyword(title + " " + abstract, _finance_keywords_lower)
]

# Unpacking zip(*[]) fails with an opaque "not enough values to unpack";
# raise the same exception type with an actionable message instead.
if not finance_data:
    raise ValueError("No papers matched finance_keywords; broaden the keyword list.")

# Split the (title, abstract) pairs into two parallel sequences, materialised
# as lists so later cells receive a plain list of strings.
titles, abstracts = (list(column) for column in zip(*finance_data))

# Step 2: Embed Documents

In [None]:
from sentence_transformers import SentenceTransformer

# Checkpoint used to turn each abstract into a dense vector.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Encode every filtered abstract; the progress bar is shown while encoding runs.
embeddings = embedding_model.encode(
    abstracts,
    show_progress_bar=True,
)

# Step 3: Dimensionality Reduction & Clustering

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN

# Reduce the abstract embeddings to 10 components with UMAP before clustering.
# random_state is fixed so repeated runs use the same seed.
umap_model = UMAP(n_components=10, min_dist=0.1, metric="cosine", random_state=42)
reduced_embeddings = umap_model.fit_transform(embeddings)

# Legacy alias: the previous name claimed 5 dimensions although n_components
# is 10. Kept only so any cell still using the old name keeps working.
reduced_embeddings_5d = reduced_embeddings

# Cluster the reduced vectors; `clusters` holds one label per abstract.
hdbscan_model = HDBSCAN(min_cluster_size=10, metric="euclidean", cluster_selection_method="eom")
clusters = hdbscan_model.fit_predict(reduced_embeddings)


# Step 4: Topic Modeling with BERTopic

In [None]:
from bertopic import BERTopic

# Assemble BERTopic from the embedding, UMAP and HDBSCAN components
# configured in the earlier cells.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
)

# Fit on the filtered abstracts, handing over the precomputed embeddings.
# fit() returns the model itself, so no reassignment is needed.
topic_model.fit(documents=abstracts, embeddings=embeddings)

# Summary table of the discovered topics; show the first rows as the cell output.
topic_info = topic_model.get_topic_info()
topic_info.head()

# Step 5: Exploration

In [None]:
# Query the fitted model for topics related to the phrase "financial markets".
topic_model.find_topics(search_term="financial markets")

In [None]:
# Query the fitted model for topics related to the phrase "risk management".
topic_model.find_topics(search_term="risk management")

In [None]:
# Query the fitted model for topics related to the term "cryptocurrency".
topic_model.find_topics(search_term="cryptocurrency")

In [None]:
# Inspect the representation of a single topic (topic id 5).
# The id is hard-coded; pick one that appears in `topic_info` for this run.
topic_model.get_topic(topic=5)

# Step 6: Visualization

In [None]:
# Build the topic overview figure from the fitted model and display it.
fig = topic_model.visualize_topics()
fig.show()

In [None]:
# Build the bar-chart figure, limited to 15 topics via top_n_topics, and display it.
fig2 = topic_model.visualize_barchart(top_n_topics=15)
fig2.show()