In [0]:
df = spark.read.format("delta").load("/Volumes/workspace/ecommerce/silver/events_delta")
df.write.mode("overwrite").saveAsTable("workspace.default.ecommerce_events")

In [0]:
%sql
SELECT COUNT(*) FROM workspace.default.ecommerce_events

COUNT(*)
67401460


In [0]:
from pyspark.sql.functions import concat_ws, col
nlp_df = spark.table("workspace.default.ecommerce_events").select("event_type", concat_ws(" ", col("brand"), col("category_code")).alias("text")).dropna()
nlp_df.show(3)

+----------+--------------------+
|event_type|                text|
+----------+--------------------+
|      view|neoline auto.acce...|
|      view|furniture.bedroom...|
|      view|            cordiant|
+----------+--------------------+
only showing top 3 rows


In [0]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
label_indexer = StringIndexer(inputCol="event_type", outputCol="label")
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=1000)
idf = IDF(inputCol="raw_features", outputCol="features")

In [0]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

In [0]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[label_indexer, tokenizer, tf, idf, lr])

In [0]:
train_df, test_df = nlp_df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train_df)
preds = model.transform(test_df)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(preds)
print("NLP Classification Accuracy:", accuracy)

NLP Classification Accuracy: 0.9429696462781593


In [0]:
preds.groupBy("event_type", "prediction").count().show()

+----------+----------+--------+
|event_type|prediction|   count|
+----------+----------+--------+
|  purchase|       0.0|  183039|
|      cart|       0.0|  585650|
|      view|       0.0|12710299|
+----------+----------+--------+



In [0]:
from pyspark.ml.feature import IndexToString
converter = IndexToString(inputCol="prediction", outputCol="predicted_event", labels=model.stages[0].labels)
final_preds = converter.transform(preds)
final_preds.select("text","event_type","predicted_event").show(10, False)

+----+----------+---------------+
|text|event_type|predicted_event|
+----+----------+---------------+
|    |cart      |view           |
|    |cart      |view           |
|    |cart      |view           |
|    |cart      |view           |
|    |cart      |view           |
|    |cart      |view           |
|    |cart      |view           |
|    |cart      |view           |
|    |cart      |view           |
|    |cart      |view           |
+----+----------+---------------+
only showing top 10 rows


In [0]:
from pyspark.sql.functions import col, count, when
funnel = (df.groupBy("event_type").agg(count("*").alias("cnt")))
funnel.show()
# Compute conversion rates
views = df.filter(col("event_type")=="view").count()
carts = df.filter(col("event_type")=="cart").count()
purchases = df.filter(col("event_type")=="purchase").count()
view_to_cart = carts / views
cart_to_purchase = purchases / carts
print("View → Cart rate:", view_to_cart)
print("Cart → Purchase rate:", cart_to_purchase)
# Auto Insight Text
if view_to_cart < 0.05:
    print("Insight: Low add-to-cart rate — product pages need improvement.")
else:
    print("Insight: Healthy add-to-cart ratio observed.")

+----------+--------+
|event_type|     cnt|
+----------+--------+
|  purchase|  916930|
|      cart| 2930018|
|      view|63554512|
+----------+--------+

View → Cart rate: 0.04610243880088325
Cart → Purchase rate: 0.31294346997185685
Insight: Low add-to-cart rate — product pages need improvement.


In [0]:
from pyspark.sql.functions import sum, round
from pyspark.sql.types import DecimalType
# brand_rev = (df.filter(col("event_type") == "purchase").groupBy("brand").agg(round(sum("price"), 3).alias("revenue")).orderBy(col("revenue").desc()))
brand_rev = (df.filter(col("event_type") == "purchase").groupBy("brand").agg(sum("price").cast(DecimalType(18,3)).alias("revenue")).orderBy(col("revenue").desc()))
brand_rev.show(3)
# Generate Insight Statement
top_brand = brand_rev.first()["brand"]
print(f"Insight: Brand '{top_brand}' contributes the highest purchase revenue.")

+-------+-------------+
|  brand|      revenue|
+-------+-------------+
|  apple|127512524.880|
|samsung| 54869650.970|
| xiaomi| 11259845.910|
+-------+-------------+
only showing top 3 rows
Insight: Brand 'apple' contributes the highest purchase revenue.


In [0]:
from pyspark.sql.functions import hour
hourly = (df.withColumn("event_hour", hour("event_time")).groupBy("event_hour").count().orderBy("count", ascending=False))
display(hourly.limit(3))

peak_hour = hourly.first()["event_hour"]
print(f"Insight: User activity peaks around hour {peak_hour}.")

event_hour,count
16,4504001
15,4443369
17,4410648


Insight: User activity peaks around hour 16.


In [0]:
user_behavior = (df.groupBy("user_id").agg(count(when(col("event_type")=="view", True)).alias("views"), count(when(col("event_type")=="purchase", True)).alias("purchases")))
high_view_low_buy = user_behavior.filter((col("views") > 20) & (col("purchases")==0)).count()
print("Users with many views but no purchases:", high_view_low_buy)

if high_view_low_buy > 0:
    print("Insight: There is a segment of users who browse heavily but do not purchase.")

Users with many views but no purchases: 522706
Insight: There is a segment of users who browse heavily but do not purchase.


In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql.functions import when, col, avg
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

# 1️⃣ Create label: purchase = 1 else 0
df_ai = df.withColumn("label", when(col("event_type") == "purchase", 1).otherwise(0))

# 2️⃣ Small feature set for fast training
brand_indexer = StringIndexer(inputCol="brand", outputCol="brand_index", handleInvalid="keep")
assembler = VectorAssembler(inputCols=["price", "brand_index"], outputCol="features")
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
pipeline = Pipeline(stages=[brand_indexer, assembler, lr])

# 3️⃣ Train / test split
train, test = df_ai.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train)
preds = model.transform(test)

# 4️⃣ Extract purchase probability safely (vector → scalar)
get_purchase_prob = udf(lambda v: float(v[1]), DoubleType())
preds2 = preds.withColumn("purchase_prob", get_purchase_prob(col("probability")))

# 5️⃣ Aggregate into AI insight metric
brand_scores = (preds2.groupBy("brand").agg(avg("purchase_prob").alias("purchase_probability")).orderBy(col("purchase_probability").desc()))
brand_scores.show(5)

# 6️⃣ Auto AI insight statement
top_row = brand_scores.orderBy(col("purchase_probability").desc()).first()
brand_name = top_row["brand"]
prob_value = float(top_row["purchase_probability"])
print("AI Insight: Brand '{}' shows highest predicted purchase probability ({:.3f}).".format(brand_name, prob_value))

+--------+--------------------+
|   brand|purchase_probability|
+--------+--------------------+
| respect|0.015004425522238758|
|  xiaomi| 0.01500386899698927|
|cordiant|0.015001611207727705|
| lucente|0.014992213486940444|
|  huawei|0.014991325389491477|
+--------+--------------------+
only showing top 5 rows
AI Insight: Brand 'respect' shows highest predicted purchase probability (0.015).
