In [0]:
from pprint import pprint

import delta_sharing
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN

In [0]:
# Open the Delta Sharing profile and list every table it exposes.
profile_file = "config.share"
client = delta_sharing.SharingClient(profile_file)
print("Available tables:")
pprint(client.list_all_tables())

# Load the product-metadata table into a pandas DataFrame.
# The table URL is the profile path followed by "#<share>.<schema>.<table>".
table_url = f"{profile_file}#share__products.silver.amazon_metadata_silver_selected"
df = delta_sharing.load_as_pandas(table_url)

In [0]:
# Report (rows, columns) of the table as loaded, before any filtering.
print("DataFrame shape:", df.shape)

In [0]:
# Keep only rows whose title is non-empty.
# `.str.len()` yields NaN for missing titles, and `NaN > 0` is False, so null
# titles are dropped here instead of raising TypeError as `map(len)` would.
# `.copy()` makes the result an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
df = df[df["title"].str.len() > 0].copy()
print("DataFrame shape:", df.shape)

In [0]:
# Titles as a plain Python list, in the same order as the rows of `df`.
product_titles = df["title"].tolist()
# Last expression of the cell: displays the first five titles as a sanity check.
product_titles[:5]

In [0]:
import torch
from InstructorEmbedding import INSTRUCTOR

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Embedding model used for the rest of the notebook. The commented-out lines
# are alternative models; uncomment exactly one to switch.
# model = INSTRUCTOR("hkunlp/instructor-large", device=device)
# model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
# model = SentenceTransformer("thenlper/gte-large", device=device)
model = SentenceTransformer("thenlper/gte-base", device=device)

In [0]:
# Alternative input format kept for reference: each title paired with an
# instruction string (presumably for the INSTRUCTOR model -- confirm before use).
# instruction = "Represent the Amazon product title from clustering: "
# sentences = [[instruction, pt] for pt in product_titles]
# Active input: the raw titles, unchanged.
sentences = product_titles

In [0]:
%%time

# instructor-large -> 9min
# gte-base -> 2min
# gte-large -> 5min
embeddings = model.encode(sentences)
print("Embeddings shape:", embeddings.shape)
print("Max :", embeddings.max())
print("Min :", embeddings.min())
print("Mean:", embeddings.mean())
print("Std :", embeddings.std())

In [0]:
import pickle

model_name = "gte_base"
pickle_path = f"embeddings_product_titles_{model_name}.pkl"

# Persist the titles, their ASINs and the embeddings together in one file.
payload = {
    "product_titles": sentences,
    "asin": df["asin"].tolist(),
    "embeddings": embeddings,
}
with open(pickle_path, "wb") as out_file:
    pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Read the same file back. pickle.load executes arbitrary code from the file,
# so only ever point this at files produced by this notebook.
with open(pickle_path, "rb") as in_file:
    stored_data = pickle.load(in_file)

stored_ids = stored_data["asin"]
stored_sentences = stored_data["product_titles"]
stored_embeddings = stored_data["embeddings"]

In [0]:
%%time
from sklearn.cluster import HDBSCAN

db = DBSCAN(eps=0.005, min_samples=2, metric="cosine").fit(embeddings)

In [0]:
labels = db.labels_

# The label -1 marks noise points, so it is not counted as a cluster.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
n_noise_ = int((labels == -1).sum())

print(f"Estimated number of clusters: {n_clusters_}")
print(f"Estimated number of noise points: {n_noise_}")

In [0]:
# Attach each row's cluster label. This relies on `labels` being in the same
# order as the rows of `df`: the embeddings were computed from
# df["title"].tolist(), so position i in `labels` belongs to row i.
df["label"] = labels

In [0]:
# Rows per cluster label, as a one-column frame indexed by label.
# value_counts() sorts by count, largest first.
top_clusters = df["label"].value_counts().to_frame()
# Display the ten most frequent labels.
top_clusters.head(10)

In [0]:
# Position in `top_clusters` of the cluster to inspect (0 = most frequent label).
idx = 21
cluster_idx = top_clusters.index[idx]

print(f"CLUSTER INDEX: {cluster_idx}, CLUSTER_SIZE: {top_clusters.iloc[idx].item()}")
print()

# Select this cluster's rows with a boolean mask instead of grouping the whole
# frame just to fetch one group, and show at most the first 100 titles.
for title in df.loc[df["label"] == cluster_idx, "title"].head(100):
    print(title)

## **Semantic search**

In [0]:
import torch
from sentence_transformers.util import semantic_search

# Free-text query to look up among the product titles.
query_text = "cctv"
# Encode the query with the same model as the corpus; convert_to_tensor=True
# returns a torch tensor rather than a NumPy array.
query_embedding = model.encode(query_text, convert_to_tensor=True)
# torch.from_numpy requires `embeddings` to be a NumPy array.
corpus_embeddings = torch.from_numpy(embeddings)

In [0]:
# Put the query and the corpus on the same device (the one chosen when the
# model was created) before they are compared.
query_embedding = query_embedding.to(device)
corpus_embeddings = corpus_embeddings.to(device)

In [0]:
# Retrieve the 10 corpus entries that best match the query embedding.
results = semantic_search(query_embedding, corpus_embeddings, top_k=10)
# Last expression of the cell: display the raw search output.
results

In [0]:
# results[0] holds the hits for the single query issued above.
for result in results[0]:
    # Named `corpus_id`, not `id`, so the builtin `id` is not shadowed for the
    # rest of the kernel session.
    corpus_id = result["corpus_id"]
    # The value is a position in the corpus, which is in `df` row order, so the
    # lookup is positional (.iloc).
    title = df["title"].iloc[corpus_id]
    print(title)