### Import library

In [0]:
%pip install faiss-cpu

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType, StructType, StructField, LongType
import numpy as np
import pandas as pd
import faiss
import gc

### Configuation

In [0]:
GOLD_DATABASE = "`bigdata-and-bi`.gold"

# Input tables
ITEM_VECTORS_TABLE = f"{GOLD_DATABASE}.star_item_vectors"

# Output tables
SEMANTIC_MATRIX_TABLE = f"{GOLD_DATABASE}.star_semantic_matrix"

# Top-K parameters
TOP_K_NEIGHBORS = 200  # Chỉ lấy 200 neighbors gần nhất cho mỗi item
VECTOR_DIMENSION = 384  # Số chiều của vector (cần điều chỉnh theo dữ liệu thực tế)

print("🔧 Configuration:")
print(f"  Top-K neighbors: {TOP_K_NEIGHBORS}")
print(f"  Vector dimension: {VECTOR_DIMENSION}")

🔧 Configuration:
  Top-K neighbors: 200
  Vector dimension: 384


In [0]:
def compute_semantic_matrix_table_exists():
    """Check if R_S matrix already computed"""
    try:
        spark.table(SEMANTIC_MATRIX_TABLE).limit(1).count()
        return True
    except:
        return False

print("Helper function defined")

✅ Helper function defined


### Precompute sematic similarity maxtrix R_S

In [0]:
print("\n" + "="*60)
print("Precompute R_S - FAISS Full Dataset with Proper Pagination")
print("="*60)

if compute_semantic_matrix_table_exists():
    print(f"R_S matrix already exists: {SEMANTIC_MATRIX_TABLE}")
else:
    print(f"Computing R_S matrix for FULL dataset with proper pagination...")
    
    item_vectors_df = spark.table(ITEM_VECTORS_TABLE).select("item_id", "vector")
    total_count = item_vectors_df.count()
    print(f"Processing FULL dataset: {total_count:,} items")
    
    window_spec = Window.orderBy("item_id")
    item_vectors_numbered = item_vectors_df.withColumn("row_num", F.row_number().over(window_spec))
    
    # Memory-optimized batch processing
    BATCH_SIZE = 10000
    NUM_BATCHES = (total_count + BATCH_SIZE - 1) // BATCH_SIZE
    
    print(f"Processing in {NUM_BATCHES} batches of {BATCH_SIZE} items...")
    
    print("Building FAISS index from batches...")
    
    sample_row = item_vectors_df.limit(1).collect()[0]
    dimension = len(sample_row['vector'].toArray())
    print(f"Vector dimension: {dimension}")
    
    index = faiss.IndexFlatIP(dimension)
    all_item_ids = []
    
    for batch_idx in range(NUM_BATCHES):
        start_idx = batch_idx * BATCH_SIZE
        end_idx = start_idx + BATCH_SIZE
        
        batch_df = item_vectors_numbered.filter(
            (F.col("row_num") > start_idx) & (F.col("row_num") <= end_idx)
        ).select("item_id", "vector")
        
        batch_data = batch_df.collect()
        
        if not batch_data:
            print(f"Batch {batch_idx + 1} is empty, skipping...")
            continue
            
        print(f"Batch {batch_idx + 1}: {len(batch_data)} items")
        
        batch_vectors_list = []
        batch_item_ids = []
        
        for row in batch_data:
            vector_array = row['vector'].toArray()
            if len(vector_array) == dimension:
                batch_vectors_list.append(vector_array)
                batch_item_ids.append(row['item_id'])
        
        if not batch_vectors_list:
            print(f"No valid vectors in batch {batch_idx + 1}, skipping...")
            continue
            
        batch_vectors = np.array(batch_vectors_list, dtype='float32')
        print(f"Batch {batch_idx + 1} vectors shape: {batch_vectors.shape}")
        
        faiss.normalize_L2(batch_vectors)
        index.add(batch_vectors)
        all_item_ids.extend(batch_item_ids)
        
        print(f"Added batch {batch_idx + 1}/{NUM_BATCHES} to index - {len(batch_vectors)} vectors")
        
        del batch_vectors, batch_data, batch_vectors_list
        if batch_idx % 10 == 0: 
            gc.collect()
    
    print(f"🎯 FAISS index built with {index.ntotal} vectors")
    print(f"📊 Total valid item IDs collected: {len(all_item_ids)}")
    
    print("🔍 Finding similarities batch by batch...")
    all_results = []
    
    for batch_idx in range(NUM_BATCHES):
        start_idx = batch_idx * BATCH_SIZE
        end_idx = start_idx + BATCH_SIZE
        
        batch_df = item_vectors_numbered.filter(
            (F.col("row_num") > start_idx) & (F.col("row_num") <= end_idx)
        ).select("item_id", "vector")
        
        batch_data = batch_df.collect()
        
        if not batch_data:
            continue
            
        batch_item_ids = []
        batch_vectors_list = []
        
        for row in batch_data:
            vector_array = row['vector'].toArray()
            if len(vector_array) == dimension:
                batch_item_ids.append(row['item_id'])
                batch_vectors_list.append(vector_array)
        
        if not batch_vectors_list:
            print(f"⚠️  No valid vectors in batch {batch_idx + 1} for search, skipping...")
            continue
            
        batch_vectors = np.array(batch_vectors_list, dtype='float32')
        faiss.normalize_L2(batch_vectors)
        
        k = TOP_K_NEIGHBORS + 1
        batch_similarities, batch_indices = index.search(batch_vectors, k)
        
        batch_results = []
        for i in range(len(batch_vectors)):
            item_i = batch_item_ids[i]
            
            for rank in range(k):
                j_index = batch_indices[i][rank]
                similarity = batch_similarities[i][rank]
                
                if 0 <= j_index < len(all_item_ids) and similarity > 0.1:
                    item_j = all_item_ids[j_index]
                    if item_i != item_j:
                        batch_results.append((item_i, item_j, float(similarity)))
        
        all_results.extend(batch_results)
        
        del batch_vectors, batch_similarities, batch_indices, batch_data, batch_vectors_list
        gc.collect()
        
        print(f"✅ Processed batch {batch_idx + 1}/{NUM_BATCHES}: {len(batch_results):,} pairs")
    
    print(f"📈 Generated {len(all_results):,} similarity pairs total")
    
    print("🔄 Creating Spark DataFrame...")
    
    from pyspark.sql.types import StructType, StructField, StringType, FloatType
    
    schema = StructType([
        StructField("item_i", StringType(), True),
        StructField("item_j", StringType(), True),
        StructField("semantic_score", FloatType(), True)
    ])
    
    CHUNK_SIZE = 2000000
    num_chunks = (len(all_results) + CHUNK_SIZE - 1) // CHUNK_SIZE
    
    final_df = None
    for chunk_idx in range(num_chunks):
        start_idx = chunk_idx * CHUNK_SIZE
        end_idx = min((chunk_idx + 1) * CHUNK_SIZE, len(all_results))
        chunk_results = all_results[start_idx:end_idx]
        
        chunk_df = spark.createDataFrame(chunk_results, schema)
        
        if final_df is None:
            final_df = chunk_df
        else:
            final_df = final_df.union(chunk_df)
        
        print(f"Created DataFrame chunk {chunk_idx + 1}/{num_chunks}")
        del chunk_df, chunk_results
        gc.collect()
    
    # top-k neighbors
    print("🏆 Selecting top-K neighbors...")
    window_spec = Window.partitionBy("item_i").orderBy(F.desc("semantic_score"))
    semantic_matrix_topk = final_df.withColumn(
        "rank", F.row_number().over(window_spec)
    ).filter(F.col("rank") <= TOP_K_NEIGHBORS).drop("rank")
    
    final_count = semantic_matrix_topk.count()
    print(f"Final R_S matrix contains {final_count:,} pairs")
    
    distinct_items = semantic_matrix_topk.select("item_i").distinct().count()
    coverage = distinct_items / total_count * 100
    print(f"Coverage: {distinct_items:,}/{total_count:,} items ({coverage:.2f}%)")
    
    # Save
    print("Saving final R_S matrix...")
    semantic_matrix_topk.write \
        .format("delta") \
        .mode("overwrite") \
        .saveAsTable(SEMANTIC_MATRIX_TABLE)
    
    # Final cleanup
    del all_results, final_df, semantic_matrix_topk, index, all_item_ids, item_vectors_numbered
    gc.collect()
    
    print(f"Saved FULL R_S matrix to {SEMANTIC_MATRIX_TABLE}")
    print(f"Success! Processed {total_count:,} items with {final_count:,} similarity pairs")


Precompute R_S - FAISS Full Dataset with Proper Pagination
🔄 Computing R_S matrix for FULL dataset with proper pagination...
📊 Processing FULL dataset: 495,062 items
🔧 Processing in 50 batches of 10000 items...
🔧 Building FAISS index from batches...
📐 Vector dimension: 384




📦 Batch 1: 10000 items
📊 Batch 1 vectors shape: (10000, 384)
✅ Added batch 1/50 to index - 10000 vectors
📦 Batch 2: 10000 items
📊 Batch 2 vectors shape: (10000, 384)
✅ Added batch 2/50 to index - 10000 vectors
📦 Batch 3: 10000 items
📊 Batch 3 vectors shape: (10000, 384)
✅ Added batch 3/50 to index - 10000 vectors
📦 Batch 4: 10000 items
📊 Batch 4 vectors shape: (10000, 384)
✅ Added batch 4/50 to index - 10000 vectors
📦 Batch 5: 10000 items
📊 Batch 5 vectors shape: (10000, 384)
✅ Added batch 5/50 to index - 10000 vectors
📦 Batch 6: 10000 items
📊 Batch 6 vectors shape: (10000, 384)
✅ Added batch 6/50 to index - 10000 vectors
📦 Batch 7: 10000 items
📊 Batch 7 vectors shape: (10000, 384)
✅ Added batch 7/50 to index - 10000 vectors
📦 Batch 8: 10000 items
📊 Batch 8 vectors shape: (10000, 384)
✅ Added batch 8/50 to index - 10000 vectors
📦 Batch 9: 10000 items
📊 Batch 9 vectors shape: (10000, 384)
✅ Added batch 9/50 to index - 10000 vectors
📦 Batch 10: 10000 items
📊 Batch 10 vectors shape: (1000



📦 Batch 32: 10000 items
📊 Batch 32 vectors shape: (10000, 384)
✅ Added batch 32/50 to index - 10000 vectors
📦 Batch 33: 10000 items
📊 Batch 33 vectors shape: (10000, 384)
✅ Added batch 33/50 to index - 10000 vectors
📦 Batch 34: 10000 items
📊 Batch 34 vectors shape: (10000, 384)
✅ Added batch 34/50 to index - 10000 vectors
📦 Batch 35: 10000 items
📊 Batch 35 vectors shape: (10000, 384)
✅ Added batch 35/50 to index - 10000 vectors
📦 Batch 36: 10000 items
📊 Batch 36 vectors shape: (10000, 384)
✅ Added batch 36/50 to index - 10000 vectors
📦 Batch 37: 10000 items
📊 Batch 37 vectors shape: (10000, 384)
✅ Added batch 37/50 to index - 10000 vectors
📦 Batch 38: 10000 items
📊 Batch 38 vectors shape: (10000, 384)
✅ Added batch 38/50 to index - 10000 vectors
📦 Batch 39: 10000 items
📊 Batch 39 vectors shape: (10000, 384)
✅ Added batch 39/50 to index - 10000 vectors
📦 Batch 40: 10000 items
📊 Batch 40 vectors shape: (10000, 384)
✅ Added batch 40/50 to index - 10000 vectors
📦 Batch 41: 10000 items
📊 Ba



✅ Processed batch 3/50: 2,000,000 pairs
✅ Processed batch 4/50: 2,000,000 pairs




✅ Processed batch 5/50: 2,000,000 pairs
✅ Processed batch 6/50: 2,000,000 pairs
✅ Processed batch 7/50: 2,000,000 pairs
✅ Processed batch 8/50: 2,000,000 pairs
✅ Processed batch 9/50: 2,000,000 pairs




✅ Processed batch 10/50: 2,000,000 pairs
✅ Processed batch 11/50: 2,000,000 pairs
✅ Processed batch 12/50: 2,000,000 pairs
✅ Processed batch 13/50: 2,000,000 pairs
✅ Processed batch 14/50: 2,000,000 pairs




✅ Processed batch 15/50: 2,000,000 pairs
✅ Processed batch 16/50: 2,000,000 pairs




✅ Processed batch 17/50: 2,000,000 pairs
✅ Processed batch 18/50: 2,000,000 pairs




✅ Processed batch 19/50: 2,000,000 pairs
✅ Processed batch 20/50: 2,000,000 pairs




✅ Processed batch 21/50: 2,000,000 pairs




✅ Processed batch 22/50: 2,000,000 pairs




✅ Processed batch 23/50: 2,000,000 pairs
✅ Processed batch 24/50: 2,000,000 pairs




✅ Processed batch 25/50: 2,000,000 pairs
✅ Processed batch 26/50: 2,000,000 pairs
✅ Processed batch 27/50: 2,000,000 pairs




✅ Processed batch 28/50: 2,000,000 pairs
✅ Processed batch 29/50: 2,000,000 pairs
✅ Processed batch 30/50: 2,000,000 pairs
✅ Processed batch 31/50: 2,000,000 pairs




✅ Processed batch 32/50: 2,000,000 pairs
✅ Processed batch 33/50: 2,000,000 pairs




✅ Processed batch 34/50: 2,000,000 pairs




✅ Processed batch 35/50: 2,000,000 pairs




[0;31m---------------------------------------------------------------------------[0m
[0;31mUnknownException[0m                          Traceback (most recent call last)
File [0;32m<command-4857489474097822>, line 105[0m
[1;32m    100[0m [38;5;66;03m# Lấy batch sử dụng row_number()[39;00m
[1;32m    101[0m batch_df [38;5;241m=[39m item_vectors_numbered[38;5;241m.[39mfilter(
[1;32m    102[0m     (F[38;5;241m.[39mcol([38;5;124m"[39m[38;5;124mrow_num[39m[38;5;124m"[39m) [38;5;241m>[39m start_idx) [38;5;241m&[39m (F[38;5;241m.[39mcol([38;5;124m"[39m[38;5;124mrow_num[39m[38;5;124m"[39m) [38;5;241m<[39m[38;5;241m=[39m end_idx)
[1;32m    103[0m )[38;5;241m.[39mselect([38;5;124m"[39m[38;5;124mitem_id[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mvector[39m[38;5;124m"[39m)
[0;32m--> 105[0m batch_data [38;5;241m=[39m batch_df[38;5;241m.[39mcollect()
[1;32m    107[0m [38;5;28;01mif[39;00m [38;5;129;01mnot[39;00m batch_data:
[1;3

In [0]:
# Verify
print("\n" + "="*40)
print("Verification Results")
print("="*40)

semantic_matrix = spark.table(SEMANTIC_MATRIX_TABLE)

total_pairs = semantic_matrix.count()
distinct_items_i = semantic_matrix.select("item_i").distinct().count()
item_count = spark.table(ITEM_VECTORS_TABLE).count()

print(f"📈 Total pairs in R_S: {total_pairs:,}")
print(f"📊 Distinct items in item_i: {distinct_items_i:,}")
print(f"📊 Total items: {item_count:,}")
print(f"📊 Coverage: {distinct_items_i/item_count*100:.2f}%")

# similarity scores
print("\nSemantic score distribution:")
stats = semantic_matrix.select(
    F.min("semantic_score").alias("min_score"),
    F.max("semantic_score").alias("max_score"),
    F.mean("semantic_score").alias("mean_score"),
    F.stddev("semantic_score").alias("std_score")
).collect()[0]

print(f"  Min score: {stats['min_score']:.4f}")
print(f"  Max score: {stats['max_score']:.4f}")
print(f"  Mean score: {stats['mean_score']:.4f}")
print(f"  Std score: {stats['std_score']:.4f}")

print("\n🔍 Sample of R_S matrix:")
semantic_matrix.orderBy(F.rand()).limit(10).show()

avg_neighbors = semantic_matrix.groupBy("item_i").count().select(F.avg("count")).collect()[0][0]
print(f"📊 Average neighbors per item: {avg_neighbors:.2f}")

