## **Product Saturation**

In [30]:
import pickle
from pathlib import Path

import delta_sharing
from sklearn.cluster import DBSCAN

In [15]:
# Fetch the product metadata table through Delta Sharing.
# NOTE(review): `client` is not used by any line of this cell.
profile_file = "config.share"
client = delta_sharing.SharingClient(profile_file)
table_url = f"{profile_file}#share__products.silver.amazon_metadata_silver_selected"

df = delta_sharing.load_as_pandas(table_url)

# Alternative when running inside Databricks.
# NOTE(review): this names amazon_reviews_selected, not the metadata table
# loaded above -- confirm which one is intended.
# df = spark.table("products.silver.amazon_reviews_selected").toPandas()

# Drop the rows whose title has length zero.
has_title = df["title"].map(len) > 0
df = df[has_title]
print("DataFrame shape:", df.shape)

DataFrame shape: (91562, 15)


In [12]:
# TODO: Load the embeddings in databricks
model_name = "gte_base"
emb_dir = Path("")
embeddings_file = emb_dir / f"embeddings_product_titles_{model_name}.pkl"

# Read the pickled payload (ids, titles and embeddings) from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only open files
# that come from a trusted source.
with open(embeddings_file, "rb") as fIn:
    data = pickle.load(fIn)

asin = data["asin"]
product_titles = data["product_titles"]
embeddings = data["embeddings"]

In [36]:
%%time

# Cluster the embeddings with DBSCAN on cosine distance. `fit` returns the
# fitted estimator itself, so `db` and `clusterer` are the same object.
clusterer = DBSCAN(eps=0.05, min_samples=2, metric="cosine")
db = clusterer.fit(embeddings)

CPU times: user 4min 10s, sys: 42.4 s, total: 4min 52s
Wall time: 1min 38s


In [37]:
labels = db.labels_
# The label -1 marks noise, so it is not counted as a cluster.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels) - int(-1 in distinct_labels)
n_noise_ = sum(1 for lbl in labels if lbl == -1)

print(f"Estimated number of clusters: {n_clusters_}")
print(f"Estimated number of noise points: {n_noise_}")

Estimated number of clusters: 10845
Estimated number of noise points: 40805


In [38]:
# Attach each row's cluster label (-1 = noise) as a new column.
# NOTE(review): assumes `labels` has one entry per row of `df`, in the same
# row order as the embeddings were computed -- TODO confirm.
df["label"] = labels

In [153]:
# Cluster sizes, largest first; the index of the result holds the cluster label.
label_counts = df["label"].value_counts(sort=True, ascending=False)
label_counts.iloc[:10]

label
-1       40805
 3607     2725
 82       2094
 17        533
 6283      488
 3535      423
 42        315
 29        302
 1262      223
 3006      213
Name: count, dtype: int64

In [43]:
# Position (in the size-sorted `label_counts` Series) of the cluster to inspect.
# NOTE: this cell needs `label_counts` to still be the value_counts() Series;
# a later cell rebinds the same name to a DataFrame.
idx = 2
# Upper bound on how many titles are printed.
max_titles = 100
cluster_idx = label_counts.index[idx]

print(f"CLUSTER INDEX: {cluster_idx}, CLUSTER_SIZE: {label_counts.iloc[idx].item()}")
print()

# Select the cluster's rows with a boolean mask instead of grouping the whole
# frame just to pull out one group; row order is unchanged.
cluster_titles = df.loc[df["label"] == cluster_idx, "title"]
for title in cluster_titles.head(max_titles):
    print(title)

CLUSTER INDEX: 6283, CLUSTER_SIZE: 488

Naruto Xbox 360 Skin Set - Console with 2 Controllers
MODFREAKZ  Console and Controller Vinyl Skin Set - Flaming Blue Skull for Playstation 4
ModFreakz&reg; Console/Controller Vinyl Skin Set - Battlefield 4 War for PS4 Original
MODFREAKZ  Console and Controller Vinyl Skin Set - Stealth Gun Fight Watchdog for Playstation 4
Mod Freakz PS4 Console and Controller Vinyl Skin Decal COD Ghosts
Mod Freakz Console and Controller Vinyl Skin Set - Assassins Creed Rev for Playstation 4
ModFreakz  Console/Controller Vinyl Skin Set - God of War Kratos for Xbox One Original
Mod Freakz Console and Controller Vinyl Skin Set - Fighting Ninja Girl Mirror's Edge for Playstation 4
Mod Freakz Console and Controller Vinyl Skin Set - Dragon Scales for Playstation 4
Mod Freakz Console and Controller Vinyl Skin Set - Brown Snake Scales Anaconda for Playstation 4
Mod Freakz Console and Controller Vinyl Skin Set - Superheroes for Playstation 4
ModFreakz&reg; Console/Control

In [154]:
# Rebuild the size table from `df` instead of from the previous value of
# `label_counts`, so re-running this cell always yields the same frame
# (calling reset_index() on an already-reset frame would add a stray
# "index" column).
label_counts = df["label"].value_counts().reset_index()
# Row position after sorting by size: row 0 is the largest group.
label_counts["ranking"] = label_counts.index

label_counts

Unnamed: 0,label,count,ranking
0,-1,40805,0
1,3607,2725,1
2,82,2094,2
3,17,533,3
4,6283,488,4
...,...,...,...
10841,10813,2,10841
10842,10814,2,10842
10843,10815,2,10843
10844,10816,2,10844


In [163]:
# Column-wise maxima over the real clusters only. After reset_index() the row
# index is 0..n-1 and never equals -1, so the noise row has to be excluded
# through the "label" column rather than through the index.
max_saturation = label_counts[label_counts["label"] != -1].max()


def discrete_saturation(product, low_max=5, medium_max=50, extreme_min=None):
    """Bucket one cluster-size row into a saturation level.

    Args:
        product: Mapping-like row (for example a DataFrame row) providing the
            keys "label", "count" and "ranking".
        low_max: Largest count still reported as "Low".
        medium_max: Largest count still reported as "Medium".
        extreme_min: Counts strictly greater than this value are reported as
            "Extremely High". The default ``None`` disables that level, so
            every count above ``medium_max`` is reported as "High".

    Returns:
        A ``(level, ranking)`` tuple. Rows labelled -1 are not ranked and
        always yield ``("Low", -1)``.
    """
    # not ranked
    if product["label"] == -1:
        return "Low", -1

    count = product["count"]
    ranking = product["ranking"]
    if count <= low_max:
        return "Low", ranking
    if count <= medium_max:
        return "Medium", ranking
    # Checked before the "High" fallback so the top level can actually be reached.
    if extreme_min is not None and count > extreme_min:
        return "Extremely High", ranking
    return "High", ranking

In [164]:
# Classify every row of the size table; each result is a (level, ranking) tuple.
label_counts.apply(discrete_saturation, axis="columns")

0           (Low, -1)
1           (High, 1)
2           (High, 2)
3           (High, 3)
4           (High, 4)
             ...     
10841    (Low, 10841)
10842    (Low, 10842)
10843    (Low, 10843)
10844    (Low, 10844)
10845    (Low, 10845)
Length: 10846, dtype: object