In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# Read the raw events table into a DataFrame.
events = spark.read.table("workspace.default.events_table")

# Summary statistics (count / mean / stddev / min / max) for the price column.
events.describe("price").show()

# Hypothesis: weekday vs weekend conversion.
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday),
# so values 1 and 7 flag weekend events.
weekday = events.withColumn(
    "is_weekend",
    F.dayofweek(F.col("event_time")).isin(1, 7),
)
(
    weekday
    .groupBy("is_weekend", "event_type")
    .count()
    .show()
)


+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|          67501979|
|   mean| 292.4593165647889|
| stddev|355.67449958606727|
|    min|               0.0|
|    max|           2574.07|
+-------+------------------+

+----------+----------+--------+
|is_weekend|event_type|   count|
+----------+----------+--------+
|      true|      view|23102117|
|      true|      cart| 1229688|
|      true|  purchase|  416681|
|     false|      view|40453993|
|     false|      cart| 1799242|
|     false|  purchase|  500258|
+----------+----------+--------+



In [0]:
# Feature engineering
# Adds to every event row:
#   hour                  - hour of day extracted from event_time
#   day_of_week           - Spark day-of-week number (1 = Sunday ... 7 = Saturday)
#   price_log             - log(1 + price); log1p is well-defined at price == 0
#   time_since_first_view - seconds elapsed since the same user's earliest event
#
# Both operands of the time difference are converted to epoch seconds first:
# subtracting a raw event_time value from a bigint is a type mismatch in Spark.
# min() over the whole user partition yields the earliest event without needing
# an ordered window, and it skips NULL timestamps.
# NOTE(review): despite the column name, the baseline is the user's first event
# of ANY event_type (no filter on "view") - confirm this is the intended feature.
user_window = Window.partitionBy("user_id")
event_epoch = F.unix_timestamp("event_time")

features = (
    events
    .withColumn("hour", F.hour("event_time"))
    .withColumn("day_of_week", F.dayofweek("event_time"))
    .withColumn("price_log", F.log1p(F.col("price")))
    .withColumn(
        "time_since_first_view",
        event_epoch - F.min(event_epoch).over(user_window),
    )
)