In [0]:
# Source data: the silver-layer e-commerce events table; every later cell reads from `events`.
events = spark.table(tableName="ecommercetest.silver.silverevents")

In [0]:
# Summary statistics (count, mean, stddev, min, max) for the price column.
events.describe("price").show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|         109511637|
|   mean| 292.3415351479573|
| stddev|356.88400280414055|
|    min|              0.77|
|    max|           2574.07|
+-------+------------------+



In [0]:
# Hypothesis check: does activity differ between weekdays and weekends?
# Tally events per (is_weekend, event_type) so the funnel stages can be compared.
from pyspark.sql import functions as F

# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday),
# so the values 1 and 7 together mark the weekend.
weekday = events.withColumn(
    "is_weekend",
    F.dayofweek(F.col("event_date")).isin(1, 7),
)
(
    weekday
    .groupBy("is_weekend", "event_type")
    .count()
    .show()
)

+----------+----------+--------+
|is_weekend|event_type|   count|
+----------+----------+--------+
|     false|  purchase| 1046615|
|     false|      view|70038358|
|      true|      view|34012230|
|     false|      cart| 2371012|
|      true|      cart| 1430347|
|      true|  purchase|  613075|
+----------+----------+--------+



In [0]:
# Conversion rate by category: one row per category_code with the number of
# cart / purchase / view events, plus purchases as a percentage of views.
#
# The pivot values are listed explicitly. Without them Spark first runs an
# extra job over the whole table to discover the distinct event types, and the
# resulting columns depend on the data, so the `purchase` / `view` references
# below would fail to resolve if a type were missing. Event types outside this
# list are not counted.
# A category with no events of a given type gets a null count, and
# conversion_rate is null whenever `purchase` or `view` is null.
pivot_df = (
    events
    .groupBy("category_code")
    .pivot("event_type", ["cart", "purchase", "view"])
    .count()
    .withColumn(
        "conversion_rate",
        F.col("purchase") / F.col("view") * 100,
    )
)

In [0]:
# Preview the per-category conversion table (first 20 rows, long strings truncated).
pivot_df.show(n=20, truncate=True)

+--------------------+------+--------+--------+-------------------+
|       category_code|  cart|purchase|    view|    conversion_rate|
+--------------------+------+--------+--------+-------------------+
| stationery.cartrige|   736|     325|   19200| 1.6927083333333333|
|electronics.video.tv|136460|   51834| 3118576|  1.662104755503794|
|  accessories.wallet|  1094|     676|  112015| 0.6034906039369727|
|appliances.kitche...|  3062|    1287|  118080| 1.0899390243902438|
|                NULL|893704|  407620|33932163|  1.201279152171938|
|construction.tool...|  6987|    2200|  232092| 0.9478999707012736|
|appliances.enviro...| 15538|    6066|  422015| 1.4373896662440908|
|country_yard.furn...|    17|       4|    2781| 0.1438331535418914|
|       apparel.shoes| 38321|   14395| 2577465| 0.5584944897408888|
|electronics.audio...|  1763|     919|   72700| 1.2640990371389271|
|       apparel.glove|    66|      17|    3371| 0.5043013942450312|
|appliances.kitche...|  4674|    1658|  237609| 

In [0]:
# Pearson correlation, across categories, between purchase volume and
# conversion rate.
pivot_df.stat.corr(col1="purchase", col2="conversion_rate", method="pearson")

0.35779230892870917

In [0]:
from pyspark.sql.window import Window

# Feature engineering: derive per-event features from the raw event columns.
#   hour                  - hour of day taken from event_time
#   day_of_week           - Spark day number, 1 (Sunday) through 7 (Saturday)
#   price_log             - natural log of (price + 1)
#   time_since_first_view - seconds between this event and the user's earliest event
#
# Both sides of the subtraction are epoch seconds. Subtracting a raw timestamp
# column from unix_timestamp() (a bigint) is a type mismatch that Spark rejects.
# The earliest event per user is the minimum over the whole user partition,
# which needs no ordering inside the window.
# NOTE(review): the baseline is the user's first event of ANY type, although
# the column name says "first_view" - confirm whether it should be restricted
# to event_type == "view".
event_epoch = F.unix_timestamp("event_time")
first_event_epoch = F.min(event_epoch).over(Window.partitionBy("user_id"))

features = (
    events
    .withColumn("hour", F.hour("event_time"))
    .withColumn("day_of_week", F.dayofweek("event_date"))
    .withColumn("price_log", F.log(F.col("price") + 1))
    .withColumn("time_since_first_view", event_epoch - first_event_epoch)
)