In [0]:
# Catalog and schema identifiers used throughout the notebook
catalog_name = "big_data_project"
silver_schema_name = "silver"  # also needed for the visualizations
gold_schema_name = "gold"

# Fully qualified table names, in <catalog>.<schema>.<table> form
silver_table = f"{catalog_name}.{silver_schema_name}.products_cleaned"
gold_table = f"{catalog_name}.{gold_schema_name}.products_with_features"

# Read the Gold table into a DataFrame
gold_df = spark.table(gold_table)

# Confirm the load and preview the first five rows
print(f"Successfully loaded Gold table: {gold_table}")
display(gold_df.limit(5))

In [0]:
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.sql.functions import col

# Hyperparameters for the LSH index, named so they are easy to find and tune.
LSH_BUCKET_LENGTH = 2.0
LSH_NUM_HASH_TABLES = 3
# The model draws random projection vectors; pinning the seed keeps the hashes
# (and therefore the approximate neighbours) the same from one run to the next.
LSH_SEED = 42

# 1. Configure the LSH model to work with our "features" column
brp = BucketedRandomProjectionLSH(
    inputCol="features",       # The ML vector column from our Gold table
    outputCol="hashes",        # A new column for hash values
    bucketLength=LSH_BUCKET_LENGTH,
    numHashTables=LSH_NUM_HASH_TABLES,
    seed=LSH_SEED,
)

# 2. Fit the LSH model to our data
lsh_model = brp.fit(gold_df)

# 3. Transform our data to add the 'hashes' column
# No .cache() here because it is not supported on Serverless
lsh_df = lsh_model.transform(gold_df)

print("LSH model is ready for finding similar items.")

In [0]:
from pyspark.sql.functions import col, lower

def get_recommendations_by_name(target_name_query: str, num_recs: int = 5):
    """
    Find a product whose title contains a search string, then return similar products.

    The lookup is a case-insensitive, literal substring match on the "title"
    column of ``lsh_df``. Characters such as ``%`` and ``_`` in the query are
    matched as-is rather than being treated as SQL LIKE wildcards. If several
    titles match, the row returned by ``first()`` is used; no ordering is
    applied, so which match is chosen is not controlled here.

    Args:
        target_name_query: Text to look for inside product titles.
        num_recs: Maximum number of similar products to return.

    Returns:
        A Spark DataFrame with the columns asin, title, price, stars and
        distCol, ordered by distCol ascending (lower distance is more similar)
        and limited to ``num_recs`` rows. Returns None when the query is blank
        or no product title contains the query.
    """

    # 0. Reject blank queries up front: an empty pattern would match every
    #    title and silently base the recommendations on an arbitrary product.
    if target_name_query is None or not target_name_query.strip():
        print("Please provide a non-empty product name to search for.")
        return None

    # 1. Find the product by its name (case-insensitive partial match)
    print(f"Searching for product matching: '{target_name_query}'...")

    # contains() performs a literal substring test, so user-supplied '%', '_'
    # or '\' cannot change the meaning of the match the way they would in LIKE.
    target_product = lsh_df.filter(
        lower(col("title")).contains(target_name_query.lower())
    ).select("features", "title", "asin").first()

    # 2. Check if we found a product (first() yields None when nothing matched)
    if target_product is None:
        print(f"No product found with a name like '{target_name_query}'.")
        return None

    target_vector = target_product.features
    target_asin = target_product.asin

    print(f"Found base product: {target_product.title} (ASIN: {target_asin})")
    print("--- Finding similar items... ---")

    # 3. Use the LSH model to find approximate nearest neighbors
    similar_products_df = lsh_model.approxNearestNeighbors(
        lsh_df,
        target_vector,
        num_recs + 1  # Get N+1 because it will find itself
    )

    # 4. Filter out the original product (it will find itself)
    results_df = similar_products_df.filter(col("asin") != target_asin)

    # 5. Select the output columns, closest first, capped at num_recs rows
    final_recs = results_df.select(
        "asin",
        "title",
        "price",
        "stars",
        "distCol"  # Lower distance is more similar
    ).orderBy("distCol").limit(num_recs)

    return final_recs

print("Recommendation function 'get_recommendations_by_name' is defined.")

In [0]:
# Test the new function using a name query
recs_df = get_recommendations_by_name(target_name_query="sion softside", num_recs=5)

# The function returns None when no product matched. Compare against None
# explicitly instead of relying on the truthiness of a DataFrame object.
if recs_df is not None:
    display(recs_df)