# Notebook to Embed and Cluster Text Segments

## Install and Import Dependencies

In [11]:
# Uncomment the line below to install the dependencies into the kernel's environment
# %pip install numpy pandas scikit-learn torch sentence-transformers

In [1]:
import os

import numpy as np
import pandas as pd
import sklearn.cluster as cl
import torch
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


## Load Data

In [None]:
# Path to csv file. Set the PROCESSED_CHAPTERS_CSV environment variable to
# point at your own copy; the fallback is the original hard-coded location.
data_path = os.environ.get(
  "PROCESSED_CHAPTERS_CSV",
  "/Users/naman/Workspace/Projects/SoS_longLDocument_sum/processed_chapters.csv",
)
# Read csv
df = pd.read_csv(data_path)
# Show only the last few rows instead of the whole frame
df.tail()

Unnamed: 0,index,bid,segment
97832,645,151,Turned from the bridegroom's door.
97833,646,151,"He went like one that hath been stunned,"
97834,647,151,And is of sense forlorn:
97835,648,151,"A sadder and a wiser man,"
97836,649,151,He rose the morrow morn.


In [3]:
# Collect every distinct value of the "bid" column as a plain Python list
unique_bids = pd.unique(df["bid"]).tolist()
unique_bids

[28054,
 2833,
 1232,
 1200,
 44747,
 14328,
 174,
 5658,
 107,
 161,
 1130,
 1929,
 12915,
 23042,
 1756,
 23046,
 151]

In [None]:
# Build a dictionary {bid: [segment, ...]} with surrounding whitespace removed
segments = {}
for bid in unique_bids:
  # Boolean mask selecting the rows that belong to the current bid
  filt = df["bid"].eq(bid)
  segments[bid] = [text.strip() for text in df.loc[filt, "segment"]]
segments

{28054: ['PART I',
  'Book I. The History Of A Family Chapter I.',
  'Fyodor Pavlovitch Karamazov',
  'Alexey Fyodorovitch Karamazov was the third son of Fyodor Pavlovitch Karamazov, a land owner well known in our district in his own day, and still remembered among us owing to his gloomy and tragic death, which happened thirteen years ago, and which I shall describe in its proper place.',
  'For the present I will only say that this "landowner"--for so we used to call him, although he hardly spent a day of his life on his own estate--was a strange type, yet one pretty frequently to be met with, a type abject and vicious and at the same time senseless.',
  'But he was one of those senseless persons who are very well capable of looking after their worldly affairs, and, apparently, after nothing else.',
  "Fyodor Pavlovitch, for instance, began with next to nothing; his estate was of the smallest; he ran to dine at other men's tables, and fastened on them as a toady, yet at his death it a

## Load Model to Create Embeddings

In [None]:
# Name of the pretrained sentence-embedding checkpoint to load
checkpoint = "sentence-transformers/all-mpnet-base-v2"
# No device is passed here; the model is placed on whichever device the
# library selects by itself
model = SentenceTransformer(model_name_or_path=checkpoint)
model

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
# bid whose segments are embedded. Keep this in sync with the key used when
# the Mapping is built further down, otherwise texts and embeddings would be
# paired across different bids.
bid = 161
# Create embeddings for segments
embeddings = model.encode(segments[bid])
embeddings

array([[-0.00478033, -0.02029388,  0.00584763, ...,  0.04312989,
        -0.03145018, -0.00741248],
       [ 0.02732488,  0.11214639,  0.00625042, ...,  0.04068683,
         0.01262393, -0.01656588],
       [ 0.02252577,  0.03719389, -0.00093298, ...,  0.02461454,
        -0.03751082,  0.02144923],
       ...,
       [ 0.0004255 ,  0.03545054, -0.00167793, ...,  0.06442916,
         0.03359721, -0.00607684],
       [ 0.01056288,  0.02956885, -0.0452613 , ...,  0.07359503,
         0.01833136, -0.01689027],
       [ 0.02997803, -0.00551742,  0.0056318 , ...,  0.01776354,
        -0.00153098, -0.06054033]], dtype=float32)

## Cluster Embeddings

We use the DBSCAN algorithm below to cluster the embeddings.
The number of clusters is not specified up front; the algorithm determines it automatically.

Each non-negative label identifies a distinct cluster, and the label `-1` marks an outlier.

In [None]:
# Configure the clusterer (neighbourhood radius 0.5, at least 2 samples)
dbscan = cl.DBSCAN(eps=0.5, min_samples=2)
# Fit dbscan on embeddings, then read back one label per embedding
dbscan.fit(embeddings)
labels = dbscan.labels_
labels

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


array([-1, -1, -1, ..., -1, -1, -1])

In [8]:
# Distinct labels produced by the clustering step
unique_clusters = np.unique(labels)
# The outlier label -1 is not a cluster, so leave it out of the count
n_clusters = len(unique_clusters) - int(-1 in unique_clusters)
n_clusters, unique_clusters

(8, array([-1,  0,  1,  2,  3,  4,  5,  6,  7]))

## Create a Map from Text to Embedding and Vice-Versa

In [None]:
class Mapping:
  """
  Creates a bidirectional mapping between text and embeddings.

  :param list[str] texts: List of texts
  :param torch.Tensor embeddings: Matrix of embeddings
  """

  def __init__(
    self,
    texts: list[str],
    embeddings: torch.Tensor
  ) -> None:
    # Use text as key
    self.text_to_embedding = dict(zip(texts, embeddings))
    # Use tuple of embeddings as key
    self.embedding_to_text = dict(
      (tuple(i.item() for i in emb), text)
      for emb, text in zip(embeddings, texts)
    )
  
  def __getitem__(
    self,
    key: str | torch.Tensor
  ) -> torch.Tensor | str:
    """
    Get embedding for text or vice-versa
    
    :param str | torch.Tensor key: Text or embedding
    :return value (torch.Tensor | str): Embedding or text corresponding to key
    """
    return self.text_to_embedding[key] if isinstance(key, str) \
      else self.embedding_to_text[tuple(i.item() for i in key)]

In [10]:
# Pair each segment of bid 161 with its embedding row
mapping = Mapping(texts=segments[161], embeddings=torch.tensor(embeddings))
# Round trip: look up by text, then look up by embedding
mapping[segments[161][0]], mapping[embeddings[0]]

(tensor([-4.7803e-03, -2.0294e-02,  5.8476e-03,  4.7191e-02, -3.8355e-02,
          1.4991e-02,  1.7415e-02,  2.4708e-02, -6.3267e-03,  2.6007e-04,
          1.8083e-03,  1.5359e-03,  3.0398e-02, -1.1601e-01, -2.4168e-02,
         -4.5770e-02, -2.4849e-02,  8.1297e-04, -3.1510e-02,  1.6382e-02,
          9.5177e-03, -7.7603e-03,  4.0749e-02, -3.1377e-02,  2.5908e-02,
         -9.8258e-03,  1.8969e-02,  1.8587e-02,  6.7014e-02,  4.6408e-02,
          1.7233e-02, -9.9839e-03, -1.3331e-02,  2.0268e-03,  1.8711e-06,
          2.1829e-02,  3.1364e-02, -1.8659e-02, -1.6592e-02, -3.1946e-02,
          8.9574e-02,  1.1205e-01,  5.3235e-03, -1.7309e-02, -1.4010e-02,
         -2.6091e-02, -4.6339e-02, -1.3954e-02, -3.7125e-03,  1.2371e-02,
          3.3091e-02, -5.4004e-02, -6.2936e-02,  1.7513e-02,  2.4895e-02,
          8.9744e-03, -1.5953e-03, -2.8627e-03,  7.1273e-03,  5.7443e-02,
          4.8599e-03, -1.8291e-02, -2.5362e-02, -2.2606e-02,  1.9402e-02,
         -8.0209e-03,  4.8474e-02,  2.