<a href="https://colab.research.google.com/github/Netizine/nlp/blob/main/clusering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Colab Notebook: Topic Modeling with Fine-Tuned RoBERTa and ICIS Corpus

# This notebook demonstrates how to load a private fine-tuned RoBERTa model and dataset from Hugging Face,
# generate embeddings, cluster them, and visualize topics similar to the examples from:
# - Clustering Contextual Embeddings for Topic Model (Towards Data Science)
# - SBERT Topic Modeling Example

# 1. Install dependencies
!pip install --quiet transformers datasets sentence-transformers huggingface-hub umap-learn scikit-learn plotly
!pip install --quiet \
  "cuml-cu12==25.6.0" \
  "cudf-cu12==25.6.0" \
  "dask-cudf-cu12==25.6.0" \
  "raft-dask-cu12==25.6.0" \
  "pylibraft-cu12==25.6.0" \
  "rapids-dask-dependency==25.6.0"

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build 

In [12]:
# 2. Login to Hugging Face Hub (for private repos)
# notebook_login() renders the interactive widget shown in this cell's output.
# It must be completed before the later cells fetch the "Netizine/icis" model
# and dataset, which the notebook header describes as private.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
# 3. Import libraries
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import umap
import time
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
import plotly.express as px
from huggingface_hub import snapshot_download
from datasets import load_from_disk
import glob, os, multiprocessing
import cudf
from cuml import PCA as cuPCA
from cuml import UMAP as cuUMAP

In [16]:
# 4. Load the fine-tuned RoBERTa checkpoint as a SentenceTransformer
model_id = "Netizine/icis"
embedder = SentenceTransformer(model_id)

# 5. Load the dataset
#    a) Download the dataset repository; `dataset_dir` is the local path of the
#       downloaded snapshot.
dataset_dir = snapshot_download(repo_id="Netizine/icis", repo_type="dataset")

#    b) Resolve and read the raw corpus.txt file.
#       The corpus is a plain text file with one sentence per line.
from huggingface_hub import hf_hub_download
corpus_file = hf_hub_download(repo_id="Netizine/icis", repo_type="dataset", filename="corpus.txt")
# Decode explicitly instead of relying on the platform's default locale encoding
# (assumes corpus.txt is UTF-8 -- TODO confirm against the dataset card).
with open(corpus_file, 'r', encoding='utf-8') as f:
    # Drop surrounding whitespace and skip blank lines.
    texts = [line.strip() for line in f if line.strip()]
print(f"Loaded {len(texts)} sentences from corpus.txt")


Some weights of RobertaModel were not initialized from the model checkpoint at Netizine/icis and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded 1352657 sentences from corpus.txt


In [17]:
# 6. Generate embeddings (batch for large corpora)
# Leave 2 cores free so Colab doesn't lock up. These counts are consumed by the
# CPU-bound steps further down (UMAP), not by the encoder.
num_cpus    = multiprocessing.cpu_count()
num_workers = max(1, num_cpus - 2)

batch_size = 128  # you can also bump this up if you've got the RAM
# `encode` has no `num_workers` parameter: depending on the installed
# sentence-transformers release the keyword is either rejected with a TypeError
# or silently ignored, so it is not passed here. Throughput is governed by
# `batch_size` and the device the model runs on.
embeddings = embedder.encode(
    texts,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)

Batches:   0%|          | 0/10568 [00:00<?, ?it/s]

In [6]:
# 7. Dimensionality reduction with UMAP
# Requires `embeddings`, `num_workers` and `time` from the earlier cells; run them
# first (running this cell on a fresh kernel is what produced the NameError).
import numpy as np
import umap

UMAP_FIT_SAMPLE = 200_000  # number of points used to learn the 2-D mapping
UMAP_SAMPLE_SEED = 42      # fixes which rows are drawn for the fit sample

reducer = umap.UMAP(
    n_neighbors=15,
    n_components=2,
    metric='cosine',
    n_jobs=num_workers,
    low_memory=True,
    n_epochs=20,
    verbose=True
)
print(f"🔄 Running UMAP with {num_workers} workers…")
t1 = time.time()

# Fit on a random subsample, then project the full corpus with the fitted model.
# The sample size is capped so a corpus smaller than UMAP_FIT_SAMPLE still works,
# and the draw is seeded so the same rows are selected on every run.
sample_size = min(UMAP_FIT_SAMPLE, len(embeddings))
rng = np.random.default_rng(UMAP_SAMPLE_SEED)
idx = rng.choice(len(embeddings), size=sample_size, replace=False)
emb_small = embeddings[idx]

# Use the configured `reducer`; the previous `umap.UMAP(...)` passed a literal
# Ellipsis as the first argument and ignored every setting above.
mapper = reducer.fit(emb_small)
emb_2d_small = mapper.embedding_
emb_2d_full  = mapper.transform(embeddings)
# Alias under the name the visualization cell reads.
emb_2d = emb_2d_full
print(f"✅ UMAP done in {time.time()-t1:.1f}s")

NameError: name 'num_workers' is not defined

In [None]:
# 8. Clustering embeddings
n_clusters = 25  # tune as appropriate
print(f"⚙️  Clustering into {n_clusters} groups…")

# ——— set up your clusterer ———
# AgglomerativeClustering has no `n_jobs` parameter (passing one raises a
# TypeError at construction), so the fit runs single-threaded.
# NOTE(review): without a connectivity constraint the memory cost of agglomerative
# clustering grows roughly quadratically with the number of samples; confirm this
# is feasible for the full corpus, or fit on a subsample / switch algorithm.
clusterer = AgglomerativeClustering(
    n_clusters=n_clusters,
    compute_distances=True     # (optional) so you can inspect distances later
)

# ——— time & run it ———
start = time.time()
labels = clusterer.fit_predict(embeddings)
end   = time.time()

print(f"✅ Done clustering {len(embeddings)} points into {n_clusters} groups in {end-start:.1f}s")

In [None]:
# 9. Visualization
import pandas as pd

# One row per sentence: 2-D UMAP coordinates, cluster label and the raw text.
# The UMAP cell stores the projection of the full corpus in `emb_2d_full`
# (no `emb_2d` variable is defined there), so that is the array plotted here.
df = pd.DataFrame({
    'x': emb_2d_full[:,0],
    'y': emb_2d_full[:,1],
    'cluster': labels,
    'text': texts
})

# Interactive scatter plot; hovering over a point shows its sentence.
fig = px.scatter(
    df, x='x', y='y', color='cluster', hover_data=['text'],
    title='Topic Clusters of ICIS Commodity News'
)
fig.update_traces(marker={'size': 5})
fig.show()

In [None]:
# Describe each cluster by its most frequent words: build a bag-of-words count
# matrix over the whole corpus, then report the ten highest-count terms among the
# sentences assigned to each cluster.

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=10000, stop_words='english')
X_counts = vectorizer.fit_transform(texts)
terms = vectorizer.get_feature_names_out()

for cluster_id in range(n_clusters):
    # Boolean mask selecting the rows (sentences) that belong to this cluster.
    idx = (labels == cluster_id)
    # Column-wise sum gives the total count of every vocabulary term in the cluster.
    freqs = X_counts[idx].sum(axis=0).A1
    # Indices of the ten largest counts, highest first.
    ranked = freqs.argsort()[::-1][:10]
    top_terms = [terms[i] for i in ranked]
    joined = ', '.join(top_terms)
    print(f"Cluster {cluster_id}: {joined}")