In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Fully qualified name of the source table read into `events`.
EVENTS_TABLE = "ecom_idc.silver.events"

# Load the table as a Spark DataFrame; later cells derive columns from it.
events = spark.table(EVENTS_TABLE)

In [0]:
# Render the `events` DataFrame in the cell output.
# NOTE(review): `.display()` is not part of open-source PySpark; presumably the
# notebook runtime (e.g. Databricks) adds it - confirm before running elsewhere.
events.display()

In [0]:
# Daily conversion rate = (# purchase events) / (# view events) for each event_date.
# The value is computed with a window, so every row of a given date carries the
# same rate; no rows are aggregated away.
daily_window = Window.partitionBy("event_date")
daily_purchases = F.sum(
    F.when(F.col("event_type") == "purchase", 1).otherwise(0)
).over(daily_window)
daily_views = F.sum(
    F.when(F.col("event_type") == "view", 1).otherwise(0)
).over(daily_window)

events = events.withColumn(
    "conversion_rate",
    # Guard the division: a date with zero views would otherwise divide by zero,
    # which raises under ANSI SQL mode. when() without otherwise() yields null,
    # matching what the non-ANSI division produced for those dates.
    F.when(daily_views > 0, F.round(daily_purchases / daily_views, 2)),
)

# Descriptive stats

In [0]:
# Summary statistics for the price column, printed as a text table.
price_summary = events.describe("price")
price_summary.show()

# Hypothesis: weekday vs weekend conversion

In [0]:
# Spark's dayofweek numbers days 1 (Sunday) through 7 (Saturday), so 1 and 7
# are the weekend days.
is_weekend = F.dayofweek(F.col("event_date")).isin(1, 7)
weekday = events.withColumn("is_weekend", is_weekend)

# Event counts per (is_weekend, event_type) combination.
(
    weekday
    .groupBy("is_weekend", "event_type")
    .count()
    .show()
)

# Correlation

In [0]:
# Pearson correlation between each event's price and its conversion_rate value.
# The result is the cell's last expression, so the notebook displays the float.
events.corr("price", "conversion_rate", method="pearson")

# Feature engineering

In [0]:
# Per-event features derived from `events`:
#   hour                  - hour of day taken from event_time
#   day_of_week           - Spark dayofweek of event_date (1 = Sunday ... 7 = Saturday)
#   price_log             - natural log of (1 + price)
#   time_since_first_view - seconds between this event and the user's earliest event_time
user_window = Window.partitionBy("user_id")

# min() ignores null timestamps and needs no ordered window. The previous
# first(...) over an ascending order returned null for every row of a user who
# had any null event_time, because ascending order sorts nulls first.
# NOTE(review): the anchor is the user's earliest event of ANY type; it is not
# filtered to event_type == "view" even though the column name says "first_view".
# Confirm which definition is intended.
first_event_time = F.min("event_time").over(user_window)

features = (
    events
    .withColumn("hour", F.hour("event_time"))
    .withColumn("day_of_week", F.dayofweek("event_date"))
    # log1p is the built-in, numerically stabler form of log(price + 1).
    .withColumn("price_log", F.log1p(F.col("price")))
    .withColumn(
        "time_since_first_view",
        F.unix_timestamp("event_time") - F.unix_timestamp(first_event_time),
    )
)

In [0]:
# Render the engineered `features` DataFrame in the cell output.
# NOTE(review): `.display()` is not part of open-source PySpark; presumably the
# notebook runtime (e.g. Databricks) adds it - confirm before running elsewhere.
features.display()