## **Product Saturation**

In [0]:
import pickle
from pathlib import Path
import pandas as pd 

from sklearn.cluster import DBSCAN

In [0]:
# # get data with delta_sharing
# import delta_sharing
# profile_file = "config.share"
# client = delta_sharing.SharingClient(profile_file)
# table_url = f"{profile_file}#share__products.silver.amazon_metadata_silver_selected"
# df = delta_sharing.load_as_pandas(table_url)

# Load from Databricks: read the silver product-metadata table through the
# notebook's Spark session and collect it into a pandas DataFrame.
source_table = "products.silver.amazon_metadata_silver_selected"
df = spark.table(source_table).toPandas()
print("DataFrame shape:", df.shape)

In [0]:
# Pickle file holding a dict with the keys "asin", "product_titles" and
# "embeddings" (presumably one embedding per product title, aligned by
# position with the ASIN list -- confirm against the job that wrote it).
emb_path = "/dbfs/mnt/datapalooza-products-reviews-raw/embeddings/embeddings_product_titles_gte_base.pkl"

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files produced by a trusted pipeline.
with open(emb_path, "rb") as f:
    data = pickle.load(f)

# Unpack the three payload entries once the file handle has been released.
asin, product_titles, embeddings = (
    data[key] for key in ("asin", "product_titles", "embeddings")
)

In [0]:
%%time

# Density-based clustering of the embeddings under cosine distance: points
# within eps=0.05 of each other are neighbours, and min_samples=2 lets a
# point with a single close neighbour seed a cluster.
clusterer = DBSCAN(eps=0.05, min_samples=2, metric="cosine")
db = clusterer.fit(embeddings)  # fit() returns the fitted estimator itself

In [0]:
labels = db.labels_

# DBSCAN reports noise points with the label -1, so that label must not be
# counted as a cluster.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels) - int(-1 in distinct_labels)
n_noise_ = labels.tolist().count(-1)

print(f"Estimated number of clusters: {n_clusters_}")
print(f"Estimated number of noise points: {n_noise_}")

In [0]:
# Attach each product's cluster label to the metadata by ASIN. The left join
# keeps every product in df, including those without an embedding.
cluster_labels = pd.DataFrame({"asin": asin, "label": labels})
labels_df = df.merge(right=cluster_labels, on="asin", how="left")[["asin", "title", "label"]]

# Products that received no label from the join are treated as noise (-1).
# The filled column is assigned back explicitly: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# pandas deprecates and which leaves labels_df untouched under Copy-on-Write.
labels_df["label"] = labels_df["label"].fillna(-1)

In [0]:
# Display the merged frame (asin, title, label) as the cell output.
labels_df

In [0]:
# Number of products per cluster label; value_counts() orders the result
# from the most to the least frequent label.
label_counts = labels_df.loc[:, "label"].value_counts()

# Show the ten largest groups as the cell output.
label_counts.iloc[:10]

In [0]:
# Position (by descending size) of the cluster to inspect.
idx = 0
cluster_idx = label_counts.index[idx]

print(f"CLUSTER INDEX: {cluster_idx}, CLUSTER_SIZE: {label_counts.iloc[idx].item()}")
print()

# Print at most the first 100 titles belonging to the selected cluster.
cluster_titles = labels_df.loc[labels_df["label"] == cluster_idx, "title"]
for title in cluster_titles.iloc[:100]:
    print(title)

In [0]:
# Rank the real clusters by size: rank 0 is the largest cluster. The noise
# label (-1) is left out of the enumeration so it cannot occupy a rank slot
# and shift the rank of every cluster smaller than the noise group.
ranked_labels = label_counts.index[label_counts.index != -1]
rankings_mapping = {label: rank for rank, label in enumerate(ranked_labels)}

# Noise is "unranked": give it a sentinel rank that is larger than the rank
# of any real cluster.
rankings_mapping[-1] = len(label_counts)

labels_df["ranking"] = labels_df["label"].map(rankings_mapping)
labels_df

In [0]:
def discrete_saturation(product, counts=None, low_max=5, medium_max=50, high_max=None):
    """Bucket the size of a product's cluster into a saturation label.

    Args:
        product: Row-like mapping with a "label" entry holding the product's
            cluster label. -1, or a missing value, marks a product that
            belongs to no cluster.
        counts: Mapping from cluster label to cluster size. Defaults to the
            notebook-level ``label_counts``.
        low_max: Largest cluster size still reported as "Low".
        medium_max: Largest cluster size still reported as "Medium".
        high_max: Largest cluster size still reported as "High". ``None``
            (the default) leaves "High" unbounded, so "Extremely High" is
            never returned.

    Returns:
        One of "Low", "Medium", "High" or "Extremely High".

    Raises:
        KeyError: If the label is a cluster label that ``counts`` lacks.
    """
    if counts is None:
        counts = label_counts

    label = product["label"]
    # Unclustered products are never saturated. Checking this first also
    # avoids looking up a label that has no entry in ``counts``.
    if pd.isna(label) or label == -1:
        return "Low"

    count = counts[label]
    if count <= low_max:
        return "Low"
    if count <= medium_max:
        return "Medium"
    # The top tier only exists when an upper bound for "High" is configured.
    if high_max is None or count <= high_max:
        return "High"
    return "Extremely High"

# Label every product row-wise (axis=1 hands each row to the function) and
# store the result in a new "saturation_label" column.
labels_df["saturation_label"] = labels_df.apply(discrete_saturation, axis=1)

In [0]:
# The raw cluster label and the title are not part of the published table;
# remove both columns before writing.
labels_df = labels_df.drop(["label", "title"], axis="columns")


In [0]:
# Display the frame that is about to be written out.
labels_df

In [0]:
table_name = "product_saturations"
df_spark = spark.createDataFrame(labels_df)

# Write the result as a Delta table in the gold schema, replacing existing
# contents (mode "overwrite") with the mergeSchema option switched on, and
# store the data at an explicit S3 path.
writer = df_spark.write.format("delta").mode("overwrite").option("mergeSchema", "true")
writer.saveAsTable(
    f"products.gold.{table_name}",
    path=f"s3://datapalooza-products-reviews-gold/{table_name}.delta",
)