In [1]:
from pprint import pprint

import delta_sharing
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN

In [2]:
# Open the Delta Sharing profile stored next to the notebook and list every
# table the share exposes.
profile_file = "config.share"
client = delta_sharing.SharingClient(profile_file)
print("Available tables:")
pprint(client.list_all_tables())

# Load the selected product-metadata table as a pandas DataFrame.
# The table URL is the profile path followed by "#<share>.<schema>.<table>".
table_fragment = "#share__products.silver.amazon_metadata_silver_selected"
table_url = f"{profile_file}{table_fragment}"
df = delta_sharing.load_as_pandas(table_url)

Available tables:
[Table(name='sentiment', share='share__products', schema='gold'),
 Table(name='amazon_reviews_selected', share='share__products', schema='silver'),
 Table(name='intention', share='share__products', schema='gold'),
 Table(name='amazon_metadata_silver_selected', share='share__products', schema='silver'),
 Table(name='amazon_reviews_silver', share='share__products', schema='silver'),
 Table(name='amazon_metadata_silver', share='share__products', schema='silver')]


In [3]:
# Sanity check: show the shape tuple of the table that was just loaded.
print("DataFrame shape:", df.shape)

DataFrame shape: (91572, 15)


In [4]:
# Keep only the rows that have a non-empty title.
# `.str.len()` returns a missing value (instead of raising TypeError like
# `map(len)` does) when a title is None/NaN; `fillna(0)` turns those into
# length 0 so rows with a missing title are dropped together with the
# empty-string ones.
title_lengths = df["title"].str.len().fillna(0)
df = df.loc[title_lengths > 0]
print("DataFrame shape:", df.shape)

DataFrame shape: (91562, 15)


In [5]:
# Materialise the title column as a plain Python list and preview the first
# five entries.
title_column = df["title"]
product_titles = title_column.to_list()
product_titles[:5]

["Instructor's Resource CD-ROM for The Art of Speaking",
 "Como te va? Middle school Spanish Level A Teacher's edition.",
 'Medal of Honor: Warfighter - Includes Battlefield 4 Beta - Limited Edition',
 'Applied Calculus Lecture Series VHS',
 "Magruder's American Government Itext Interactive Textbook on Cd-rom Texas Edition"]

In [6]:
import torch
from InstructorEmbedding import INSTRUCTOR

# Use the GPU when torch reports one as available, otherwise fall back to CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Alternative embedding models, currently disabled:
# model = INSTRUCTOR("hkunlp/instructor-large", device=device)
# model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
# model = SentenceTransformer("thenlper/gte-large", device=device)

# Active model.
model = SentenceTransformer("thenlper/gte-base", device=device)

No sentence-transformers model found with name /home/studio-lab-user/.cache/torch/sentence_transformers/thenlper_gte-base. Creating a new one with MEAN pooling.


In [7]:
# Alternative input format (one [instruction, title] pair per product),
# currently disabled:
# instruction = "Represent the Amazon product title from clustering: "
# sentences = [[instruction, pt] for pt in product_titles]
# Active input: the raw product titles, with no instruction prefix.
sentences = product_titles

In [8]:
%%time

# instructor-large -> 9min
# gte-base -> 2min
# gte-large -> 5min
embeddings = model.encode(sentences)
print("Embeddings shape:", embeddings.shape)
print("Max :", embeddings.max())
print("Min :", embeddings.min())
print("Mean:", embeddings.mean())
print("Std :", embeddings.std())

Embeddings shape: (91562, 768)
Max : 2.5369804
Min : -4.5617976
Mean: -0.011828765
Std : 0.59265023
CPU times: user 1min 21s, sys: 27.2 s, total: 1min 48s
Wall time: 1min 41s


In [9]:
import pickle

model_name = "gte_base"
# Single source of truth for the cache file used by both the write and the read.
embeddings_path = f"embeddings_product_titles_{model_name}.pkl"

# Store titles, ASINs and embeddings together on disk.
payload = {
    "product_titles": sentences,
    "asin": df["asin"].tolist(),
    "embeddings": embeddings,
}
with open(embeddings_path, "wb") as fOut:
    pickle.dump(payload, fOut, protocol=pickle.HIGHEST_PROTOCOL)

# Load them back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook or another trusted source.
with open(embeddings_path, "rb") as fIn:
    stored_data = pickle.load(fIn)

stored_ids = stored_data["asin"]
stored_sentences = stored_data["product_titles"]
stored_embeddings = stored_data["embeddings"]

In [134]:
%%time
from sklearn.cluster import HDBSCAN

db = DBSCAN(eps=0.005, min_samples=2, metric="cosine").fit(embeddings)

CPU times: user 4min 5s, sys: 40.4 s, total: 4min 46s
Wall time: 1min 34s


In [9]:
labels = db.labels_

# The label -1 marks noise points, so it is not counted as a cluster.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels) - int(-1 in distinct_labels)
n_noise_ = sum(1 for label in labels if label == -1)

print(f"Estimated number of clusters: {n_clusters_}")
print(f"Estimated number of noise points: {n_noise_}")

NameError: name 'db' is not defined

In [142]:
# Attach each row's cluster label. `df` is a boolean-filtered subset of the
# loaded table, so build a new frame with `assign` instead of inserting a
# column into it in place (which can trigger pandas' chained-assignment
# warning). `labels` must have exactly one entry per row of `df`.
df = df.assign(label=labels)

In [143]:
# Number of rows per cluster label, most frequent label first
# (value_counts sorts in descending order by default).
cluster_sizes = df["label"].value_counts()
top_clusters = cluster_sizes.to_frame()
top_clusters.head(n=10)

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
-1,40805
3607,2725
82,2094
17,533
6283,488
3535,423
42,315
29,302
1262,223
3006,213


In [165]:
# Position of the cluster to inspect within `top_clusters`, which is ordered
# from the most frequent label to the least frequent one.
idx = 21
cluster_idx = top_clusters.index[idx]

print(f"CLUSTER INDEX: {cluster_idx}, CLUSTER_SIZE: {top_clusters.iloc[idx].item()}")
print()

# Select the cluster's titles with a boolean mask (there is no need to group
# the whole frame just to read one group) and list at most 100 of them, in
# their original row order.
cluster_titles = df.loc[df["label"] == cluster_idx, "title"]
for product_title in cluster_titles.head(100):
    print(product_title)

CLUSTER INDEX: 221, CLUSTER_SIZE: 99

Need For Speed III: Hot Pursuit
Need for Speed 3: Hot Pursuit
Need for Speed 3:  Hot Pursuit - PC
Need for Speed Underground 2
Need for Speed: Hot Pursuit [GameCube]
Need For Speed Hot Pursuit 2 (UK)
Need for Speed: Hot Pursuit 2 - PC
Need for Speed: Hot Pursuit 2
Need for Speed: Hot Pursuit 2
Need for Speed Underground 2
NEED FOR SPEED III - HOT PURSUIT
Need for Speed Underground - PC
Need for Speed Underground - Xbox
Need For Speed Underground
Need for Speed: Hot Pursuit 2
Need For Speed: Underground 2 - PC
Need for Speed: Underground 2 - Xbox
Need for Speed Underground 2 - PlayStation 2
Need for Speed: Underground 2 (UK)
Need for Speed Most Wanted - Xbox
Need for Speed: Most Wanted (PS2)
The World of Need for Speed - PC
Need for Speed: Underground 2 (UK)
Need for Speed Most Wanted
Need for Speed Carbon: Own the City - Nintendo DS
Need for Speed Carbon - Gamecube
Need for Speed Carbon: Own the City Greatest Hits -Sony PSP
Need for Speed Carbon Co

## **Semantic search**

In [137]:
import torch
from sentence_transformers.util import semantic_search

# Free-text query to look up among the product titles.
query_text = "cctv"

# Wrap the NumPy embedding matrix as a torch tensor and encode the query
# directly as a tensor, so both sides of the search are tensors.
corpus_embeddings = torch.from_numpy(embeddings)
query_embedding = model.encode(query_text, convert_to_tensor=True)

In [138]:
# Move the query and the corpus tensors to the same device as the model.
query_embedding, corpus_embeddings = (
    tensor.to(device) for tensor in (query_embedding, corpus_embeddings)
)

NameError: name 'device' is not defined

In [139]:
# Retrieve the 10 best-scoring corpus entries for the query. The result holds
# one list of {"corpus_id", "score"} hits per query.
results = semantic_search(
    query_embeddings=query_embedding,
    corpus_embeddings=corpus_embeddings,
    top_k=10,
)
results

[[{'corpus_id': 19069, 'score': 0.8483919501304626},
  {'corpus_id': 78452, 'score': 0.8329084515571594},
  {'corpus_id': 15082, 'score': 0.8299873471260071},
  {'corpus_id': 13485, 'score': 0.8295143842697144},
  {'corpus_id': 1397, 'score': 0.8291088938713074},
  {'corpus_id': 76003, 'score': 0.8286737203598022},
  {'corpus_id': 41726, 'score': 0.8279640078544617},
  {'corpus_id': 8154, 'score': 0.8279042840003967},
  {'corpus_id': 55624, 'score': 0.8279042840003967},
  {'corpus_id': 70914, 'score': 0.8271548748016357}]]

In [133]:
# Print the title behind each hit of the first (only) query.
# `corpus_id` is a row position in the embedding matrix, which was encoded
# from df["title"] in row order, so it is resolved positionally with `.iloc`.
# The position is deliberately not stored under the name `id`, which would
# shadow the builtin id() for the rest of the kernel session.
for hit in results[0]:
    print(df["title"].iloc[hit["corpus_id"]])

Swann IP-3G ConnectCam 500 Wireless Network Internet Security Surveillance Video Camera
TriVision NC-227WF HD 720P Wifi Wireless IP Network Camera Home Security Camera, Wired, Motion Sensor, IR Night Vision, Dvr Micro Sd Card, Plug and Play Apps on iPhone, Android, PC, Mac
Video Patrol 5.0 By Honest Technology
Xbox 360 Live Vision Camera
ScreenCam
Eyeline Camera (3 line) Video Surveillance Software [Download]
IP camera monitoring software [Download]
COSMI Police Chase ( Windows )
COSMI Police Chase ( Windows )
HDE EasyCap Model 002 - 4 Channel USB 2.0 DVR Video Audio CCTV Capture Adapter
