# Semantic Clustering
In this notebook we'll see how we can use pre-trained language models to embed text into fixed-length vectors. The models we will use have been trained to produce embedding vectors that are close together when the input texts have similar semantic content! This can be useful for comparing chunks of text and performing vector-based search!

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.notebook import trange, tqdm
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from torchtext.datasets import AG_NEWS, IMDB

# Make sure you are using the latest version!
# NOTE(review): presumably this refers to the libraries imported above - confirm which one
# Switch off parallelism in the HuggingFace tokenizers library through its environment flag
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Allow TF32 to be used for CUDA matrix multiplications
torch.backends.cuda.matmul.allow_tf32 = True

In [None]:
# Approx number of text samples to use
num_data_points = 10000

# Number of text samples per batch fed through the embedding model
# (the visible cells only run inference, no gradient descent is performed)
batch_size = 64

# https://www.kaggle.com/datasets/ltcmdrdata/plain-text-wikipedia-202011
# NOTE(review): the link above points to a Wikipedia dataset while the cells below load IMDB - confirm it is still relevant
# Define the root directory of the dataset
data_set_root = "../../datasets"

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
# "cuda" selects the current default CUDA device, so this also works on machines
# with a single GPU (a hard-coded device index of 1 requires a second GPU to exist).
# To target a specific GPU, set CUDA_VISIBLE_DEVICES before starting the notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Data processing and Tokenization

In [None]:
def process_data(x):
    """Prepare one raw sample for the rest of the notebook.

    Args:
        x: An indexable sample whose first item is the label and whose
            second item is the text.

    Returns:
        A ``(label, text)`` tuple where the label has been shifted down by
        one (so labels 1 and 2 become 0 and 1) and the text is lowercase.
    """
    shifted_label = x[0] - 1
    lowercase_text = x[1].lower()
    return shifted_label, lowercase_text

# Create the IMDB train and test splits, using data_set_root as the dataset root directory
dataset_train = IMDB(root=data_set_root, split="train")
dataset_test = IMDB(root=data_set_root, split="test")

# Apply process_data to every sample of both splits
dataset_train = dataset_train.map(process_data)
dataset_test = dataset_test.map(process_data)

# IMDB does not seem to be properly shuffled....
# Shuffle through a 10000-sample buffer and explicitly switch shuffling on
dataset_train = dataset_train.shuffle(buffer_size=10000).set_shuffle(True)
dataset_test = dataset_test.shuffle(buffer_size=10000).set_shuffle(True)

# This is a hack to get around some random bug with the IMDB dataset not properly
# processing the positive (pos) datapoints, you only need to do this once...
# This will take a few seconds..
# Each loop just iterates through the whole split once; the samples themselves are not used
for label, text in dataset_train:
    continue

for label, text in dataset_test:
    continue

In [None]:
# Wrap both splits in DataLoaders that share the same batching, shuffling and worker settings
loader_settings = dict(batch_size=batch_size, shuffle=True, num_workers=8)
data_loader_train = DataLoader(dataset_train, **loader_settings)
data_loader_test = DataLoader(dataset_test, **loader_settings)

## Create embedding model and tokenizer

In [None]:
# Tokenizer and encoder are both loaded from the same HuggingFace Hub checkpoint
checkpoint_name = 'BAAI/bge-small-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Load the weights, then move the model onto the selected device
model = AutoModel.from_pretrained(checkpoint_name)
model = model.to(device)

# Switch to evaluation mode; the return value is bound to "_" (presumably so the cell prints nothing)
_ = model.eval()

## Extract embeddings

In [None]:
# Accumulators for the per-batch results (kept in the same order as the batches are seen)
embeddings_log = []
labels_log = []
text_log = []

# The loop below stops after the first batch that brings the (nominal) sample count
# up to num_data_points, i.e. after ceil(num_data_points / batch_size) batches.
# Use that same number for the progress bar so it does not overshoot its total.
num_batches = (num_data_points + batch_size - 1) // batch_size

# Loop over each batch in the training dataset
for label, text in tqdm(data_loader_train, desc="Extracting", leave=False, total=num_batches):
    # Tokenize the batch of strings, padding to a common length and truncating long texts
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)

    # Inference only: no gradients, with CUDA mixed precision.
    # torch.autocast(device_type="cuda") replaces the deprecated torch.cuda.amp.autocast().
    with torch.no_grad(), torch.autocast(device_type="cuda"):
        # [0] is the first model output; [:, 0] keeps the vector at the first token position
        # (assumed to be the [CLS]-style sentence embedding for this model - confirm against the model card)
        embeddings = model(**encoded_input)[0][:, 0]

        # L2-normalise every embedding vector
        norm_embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

    embeddings_log.append(norm_embeddings.cpu())
    labels_log.append(label)
    text_log += list(text)

    # Stop once roughly num_data_points samples have been collected
    if len(labels_log) * batch_size >= num_data_points:
        break

In [None]:
# Join the per-batch tensors along the batch dimension and convert them to NumPy arrays
np_embeddings = torch.cat(embeddings_log, dim=0).numpy()
np_labels = torch.cat(labels_log, dim=0).numpy()

## Cluster the Embeddings!

In [None]:
# Number of k-means clusters; you can adjust this number based on your data
n_clusters = 2

# Fit k-means on the embeddings
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(np_embeddings)

# Cluster assignment of every embedding, and the learned cluster centers
cluster_labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_

## Perform dimension reduction for visualization

In [None]:
# PCA Dim reduction: project down to 20 components before running t-SNE
pca = PCA(n_components=20)

# Stack embs and centers to project together
# (the cluster centers are appended after the embeddings, so they occupy the last rows)
combined_embs = np.vstack([np_embeddings, cluster_centers])
combined_pca = pca.fit_transform(combined_embs)

In [None]:
# TSNE Dim reduction (Maintain local distances)
# Reduce the PCA-projected points to 2 dimensions for plotting
tsne = TSNE(n_components=2, perplexity=50)
combined_2d = tsne.fit_transform(combined_pca)

# Separate the projected embeddings and centers
# (the centers were stacked last, so they are the final n_clusters rows)
embeddings_2d = combined_2d[:-n_clusters]
centers_2d = combined_2d[-n_clusters:]

In [None]:
# Scatter plot of the 2D projection: the marker shape shows the k-means cluster,
# the colour shows the semantic label of each point
plt.figure(figsize=(8, 5))

# Cluster 0 is drawn with "_" markers
cluster_0_index = np.where(cluster_labels == 0)[0]
cluster_0_xy = embeddings_2d[cluster_0_index]
scatter1 = plt.scatter(
    cluster_0_xy[:, 0],
    cluster_0_xy[:, 1],
    c=np_labels[cluster_0_index],
    s=5,
    marker="_",
)

# Cluster 1 is drawn with "o" markers
cluster_1_index = np.where(cluster_labels == 1)[0]
cluster_1_xy = embeddings_2d[cluster_1_index]
scatter2 = plt.scatter(
    cluster_1_xy[:, 0],
    cluster_1_xy[:, 1],
    c=np_labels[cluster_1_index],
    s=5,
    marker="o",
)

# Projected cluster centers as large red crosses
_ = plt.scatter(centers_2d[:, 0], centers_2d[:, 1], c="r", s=100, marker="x")

# Legend entries, axis labels and title
_ = plt.legend(["cluster 0", "cluster 1"])
_ = plt.xlabel('t-SNE feature 1')
_ = plt.ylabel('t-SNE feature 2')
_ = plt.title('t-SNE visualization of embeddings with Semantic label')

## Find outlier reviews

In [None]:
# Get the indices of all points in cluster 0
cluster_0_indices = np.where(cluster_labels == 0)[0]

# Get the labels and embeddings of points in cluster 0
cluster_0_labels = np_labels[cluster_0_indices]
cluster_0_points = np_embeddings[cluster_0_indices]

# Find the most common semantic label for this cluster.
# This is computed as the mode (most frequent value) rather than the median: with an
# even split of two labels the median is 0.5, which matches no label at all and would
# make every point in the cluster look like an outlier in the following cells.
# On a tie the smallest label wins (np.unique returns sorted values, argmax the first maximum).
# The variable keeps its historical "median" name because later cells read it.
label_values, label_counts = np.unique(cluster_0_labels, return_counts=True)
cluster_0_median_label = label_values[label_counts.argmax()]
print("The most common semantic label is %d" % cluster_0_median_label)

In [None]:
# Histogram of the semantic labels of the points assigned to cluster 0
_ = plt.hist(cluster_0_labels)

In [None]:
# Positions (within cluster 0) of the points whose semantic label differs from the typical one
outlier_indices = np.where(cluster_0_labels != cluster_0_median_label)[0]

# Map those positions back to the original indices (to index text list)
cluster_0_outlier_indices = cluster_0_indices[outlier_indices]

# Labels and embeddings of the outliers
cluster_0_outlier_labels = cluster_0_labels[outlier_indices]
cluster_0_outlier_points = cluster_0_points[outlier_indices]

## Find the worst outliers

In [None]:
# Distance of every outlier embedding from the center of cluster 0,
# measured as the mean squared difference over the embedding dimensions
cluster_0_center = cluster_centers[0][np.newaxis, :]
points_diff = cluster_0_outlier_points - cluster_0_center
points_dist = np.power(points_diff, 2).mean(axis=1)

In [None]:
# Rank the outliers by distance to the cluster center (ascending) and keep the 5 nearest,
# AKA the "worst" outliers
closest_5 = points_dist.argsort()[:5]

# Translate positions within the outlier subset into original dataset indices
closest_5_indices = cluster_0_outlier_indices[closest_5]

In [None]:
# Display the dataset indices of the 5 outliers closest to the cluster center
closest_5_indices

In [None]:
# Look up the original text and semantic label of the closest ("worst") outlier
worst_outlier_index = closest_5_indices[0]
outlier_text = text_log[worst_outlier_index]
outlier_label = np_labels[worst_outlier_index]

In [None]:
# Display the text of the selected outlier review
outlier_text