In [0]:
from pyspark.sql.functions import col, when

# Source table holding the per-user aggregate columns used below.
silver_table_name = "workspace.ecommerce.user_features_silver"
df_silver = spark.table(silver_table_name)

# Feature frame: the user key plus the three activity counters.
feature_columns = ["user_id", "total_interactions", "total_views", "total_cart_adds"]
df_features = df_silver.select(*feature_columns)

# Binary purchase label: 1 when total_purchases > 0, otherwise 0.
# A NULL total_purchases makes the condition NULL, so it falls to otherwise(0).
has_purchased = col("total_purchases") > 0
df_labels = df_silver.select(
    "user_id",
    when(has_purchased, 1).otherwise(0).alias("label"),
)

In [0]:
# Build the model-ready frame: feature columns plus the binary label.
#
# df_features and df_labels are both projections of the same df_silver rows,
# so the label is computed on those rows directly rather than self-joining the
# two projections on user_id. The self-join adds an unnecessary join stage and
# returns n*n rows for any user_id that occurs n times in the table.
df_model_ready = (
    df_silver
    # The previous inner join on user_id dropped rows whose user_id is NULL
    # (NULL keys never match); keep that behaviour.
    .where(col("user_id").isNotNull())
    .select(
        # Reuse the feature column list so it stays defined in one place.
        *df_features.columns,
        # Same rule as df_labels: 1 when total_purchases > 0, otherwise 0.
        when(col("total_purchases") > 0, 1).otherwise(0).alias("label"),
    )
)

display(df_model_ready.limit(5))

In [0]:
# Seeded random split: weights 0.8 (train) / 0.2 (test).
# NOTE(review): every count() below is a separate Spark action that re-evaluates
# df_model_ready; randomSplit is presumably only stable across actions when the
# upstream plan returns rows deterministically - consider materializing
# df_model_ready first (confirm what the target compute supports).
split_weights = [0.8, 0.2]
train_df, test_df = df_model_ready.randomSplit(split_weights, seed=42)

total_rows = df_model_ready.count()
print(f"Total Dataset: {total_rows} rows")
train_rows = train_df.count()
print(f"Training Data: {train_rows} rows")
test_rows = test_df.count()
print(f"Testing Data:  {test_rows} rows")

In [0]:
# Print the column names and data types of the model-ready DataFrame.
df_model_ready.printSchema()

In [0]:
from pyspark.sql.functions import col

def check_distribution(df, dataset_name):
    """Print the row count and the share of buyers (label == 1) of a DataFrame.

    Args:
        df: Spark DataFrame that has a ``label`` column.
        dataset_name: Caption printed at the start of the summary line.

    Returns:
        None. The summary line is written to stdout.
    """
    # A single grouped count yields both numbers from one Spark job, instead of
    # a full count() followed by a second, filtered count() over the same
    # lineage. collect() brings one row per distinct label value to the driver.
    total = 0
    buyers = 0
    for row in df.groupBy("label").count().collect():
        total += row["count"]
        # Rows with a NULL label count towards the total but never as buyers.
        if row["label"] == 1:
            buyers += row["count"]

    # Guard against division by zero on an empty DataFrame.
    percentage = (buyers / total) * 100 if total > 0 else 0

    print(f"{dataset_name} -> Total: {total} | Buyers: {buyers} | Buyer %: {percentage:.2f}%")

# Report the buyer share for the full dataset and for each split, in that order.
datasets_to_check = [
    (df_model_ready, "Overall Data"),
    (train_df, "Training Data"),
    (test_df, "Testing Data"),
]
for frame, frame_name in datasets_to_check:
    check_distribution(frame, frame_name)