In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
events = spark.table("silver.events")

In [0]:
events = events.withColumn("event_time", F.to_timestamp("event_time"))
events = events.withColumn("event_date", F.to_date("event_time"))

### Calculate statistical summaries

In [0]:
events.select("price").describe().show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|          42341904|
|   mean|290.78126383900087|
| stddev| 358.3993774731591|
|    min|              0.77|
|    max|           2574.07|
+-------+------------------+



In [0]:
events.selectExpr(
    "percentile_approx(price, 0.5) as median_price",
    "percentile_approx(price, 0.25) as p25",
    "percentile_approx(price, 0.75) as p75",
    "percentile_approx(price, 0.95) as p95"
).show()

+------------+-----+------+-------+
|median_price|  p25|   p75|    p95|
+------------+-----+------+-------+
|       163.9|66.64|358.57|1011.31|
+------------+-----+------+-------+



In [0]:
events.groupBy("event_type").count().orderBy(F.desc("count")).show()

+----------+--------+
|event_type|   count|
+----------+--------+
|      view|40703574|
|      cart|  895563|
|  purchase|  742767|
+----------+--------+



### Hypothesis test (Weekday vs Weekend conversion)

In [0]:
events2 = events.withColumn(
    "is_weekend",
    F.dayofweek("event_date").isin([1, 7])
)

In [0]:
session_conv = events2.groupBy("user_session", "is_weekend").agg(
    F.max(F.when(F.col("event_type")=="purchase", 1).otherwise(0)).alias("converted")
)
session_conv.groupBy("is_weekend").agg(
    F.count("*").alias("sessions"),
    F.sum("converted").alias("converted_sessions"),
    (F.sum("converted")/F.count("*")*100).alias("conversion_rate_pct")
).show()

+----------+--------+------------------+-------------------+
|is_weekend|sessions|converted_sessions|conversion_rate_pct|
+----------+--------+------------------+-------------------+
|      true| 2423728|            166698|  6.877751958965693|
|     false| 6827690|            462905|  6.779818650231631|
+----------+--------+------------------+-------------------+



In [0]:
cont = session_conv.groupBy("is_weekend").agg(
    F.sum("converted").alias("converted"),
    (F.count("*") - F.sum("converted")).alias("not_converted")
)
cont.show()

+----------+---------+-------------+
|is_weekend|converted|not_converted|
+----------+---------+-------------+
|      true|   166698|      2257030|
|     false|   462905|      6364785|
+----------+---------+-------------+



In [0]:
rates = session_conv.groupBy("is_weekend").agg(
    (F.sum("converted")/F.count("*")).alias("rate")
)
rates.show()

+----------+-------------------+
|is_weekend|               rate|
+----------+-------------------+
|      true|0.06877751958965693|
|     false|0.06779818650231631|
+----------+-------------------+



In [0]:
rows = cont.orderBy("is_weekend").collect()
# rows[0] = weekday (False), rows[1] = weekend (True)
obs = [
    [rows[0]["converted"], rows[0]["not_converted"]],
    [rows[1]["converted"], rows[1]["not_converted"]]
]
obs

[[462905, 6364785], [166698, 2257030]]

### Identify correlations

In [0]:
events_num = events2.withColumn("hour", F.hour("event_time")) \
                    .withColumn("day_of_week", F.dayofweek("event_date"))

events_num.stat.corr("price", "hour")
events_num.stat.corr("price", "day_of_week")

-0.000798784465567132

In [0]:
cat = events2.groupBy("category_code").agg(
    F.sum(F.when(F.col("event_type")=="purchase", 1).otherwise(0)).alias("purchases"),
    F.sum(F.when(F.col("event_type")=="view", 1).otherwise(0)).alias("views"),
    F.avg("price").alias("avg_price")
).withColumn(
    "conversion_rate",
    F.when(F.col("views")>0, F.col("purchases")/F.col("views")).otherwise(None)
)

cat.select("category_code","avg_price","conversion_rate").show(20, False)
cat.stat.corr("avg_price", "conversion_rate")

+-----------------------------------+------------------+---------------------+
|category_code                      |avg_price         |conversion_rate      |
+-----------------------------------+------------------+---------------------+
|stationery.cartrige                |25.95512667462529 |0.018358679271133032 |
|electronics.video.tv               |442.3844076004291 |0.020442101570536674 |
|accessories.wallet                 |59.9467772282923  |0.0071817444688984135|
|appliances.kitchen.juicer          |110.3531958154057 |0.011986931215786398 |
|NULL                               |185.43915420679804|0.013139378689670347 |
|construction.tools.welding         |223.34948229376673|0.011888788685304533 |
|appliances.environment.air_heater  |51.492009398556   |0.01621254562431033  |
|country_yard.furniture.hammok      |133.7541433278419 |0.0                  |
|apparel.shoes                      |89.81699374682195 |0.005616442514951881 |
|electronics.audio.microphone       |142.63830651288

0.07706796402274528

### feature Engg for ML

In [0]:
events_fe = events2.withColumn("hour", F.hour("event_time")) \
    .withColumn("day_of_week", F.dayofweek("event_date")) \
    .withColumn("price_log", F.log(F.col("price") + 1))

In [0]:
# Time since first event in session (more stable than per user)
w_sess = Window.partitionBy("user_session").orderBy("event_time")
events_fe = events_fe.withColumn(
    "time_since_first_event",
    F.unix_timestamp("event_time") - F.unix_timestamp(F.first("event_time").over(w_sess))
)

In [0]:
train = events_fe.groupBy("user_session").agg(
    F.max(F.when(F.col("event_type")=="purchase", 1).otherwise(0)).alias("label_purchase"),
    F.count("*").alias("num_events"),
    F.sum(F.when(F.col("event_type")=="view", 1).otherwise(0)).alias("num_views"),
    F.sum(F.when(F.col("event_type")=="cart", 1).otherwise(0)).alias("num_carts"),
    F.sum(F.when(F.col("event_type")=="purchase", 1).otherwise(0)).alias("num_purchases"),
    F.avg("price").alias("avg_price"),
    F.max("price").alias("max_price"),
    F.avg("price_log").alias("avg_price_log"),
    F.max("time_since_first_event").alias("session_duration_sec"),
    F.max(F.col("is_weekend").cast("int")).alias("is_weekend"),
    F.avg("hour").alias("avg_hour"),
    F.max("day_of_week").alias("day_of_week")  # or mode if you want
)
train.show(5)

+--------------------+--------------+----------+---------+---------+-------------+------------------+---------+------------------+--------------------+----------+--------+-----------+
|        user_session|label_purchase|num_events|num_views|num_carts|num_purchases|         avg_price|max_price|     avg_price_log|session_duration_sec|is_weekend|avg_hour|day_of_week|
+--------------------+--------------+----------+---------+---------+-------------+------------------+---------+------------------+--------------------+----------+--------+-----------+
|00001417-945d-4ab...|             0|         1|        1|        0|            0|            252.21|   252.21| 5.534219183960909|                   0|         0|    19.0|          2|
|000089a1-ee47-49e...|             0|         1|        1|        0|            0|            138.74|   138.74| 4.939783553124305|                   0|         1|    10.0|          7|
|00009359-f7d4-49c...|             0|         6|        6|        0|            

In [0]:
train.select(
    F.count("*").alias("rows"),
    F.mean("label_purchase").alias("purchase_rate")
).show()

train.describe(["num_events","avg_price","session_duration_sec"]).show()

+-------+------------------+
|   rows|     purchase_rate|
+-------+------------------+
|9239403|0.0681384933636946|
+-------+------------------+

+-------+----------------+------------------+--------------------+
|summary|      num_events|         avg_price|session_duration_sec|
+-------+----------------+------------------+--------------------+
|  count|         9239403|           9239403|             9239403|
|   mean|4.58275323632923|313.96716796282453|   1041.308879155937|
| stddev|6.74585573708577| 353.3907835750274|   22508.06162304288|
|    min|               1|              0.77|                   0|
|    max|            1158|           2574.07|             2640529|
+-------+----------------+------------------+--------------------+



In [0]:
events.count()
events.select("event_time","event_type","price").show(5, False)

+-------------------+----------+-------+
|event_time         |event_type|price  |
+-------------------+----------+-------+
|2019-10-13 06:26:04|view      |80.31  |
|2019-10-13 06:27:25|view      |230.78 |
|2019-10-13 06:27:27|view      |1541.58|
|2019-10-13 06:28:33|view      |21.48  |
|2019-10-13 06:29:05|view      |581.12 |
+-------------------+----------+-------+
only showing top 5 rows


In [0]:
# 1) Weekend summary
weekend_summary = session_conv.groupBy("is_weekend").agg(
    F.count("*").alias("sessions"),
    F.sum("converted").alias("converted_sessions"),
    (F.sum("converted")/F.count("*")*100).alias("conversion_rate_pct")
)

weekend_summary.write.mode("overwrite").saveAsTable("silver.weekend_conversion_summary")

# 2) Category metrics
cat.write.mode("overwrite").saveAsTable("silver.category_metrics")

# 3) ML features table
train.write.mode("overwrite").saveAsTable("silver.ml_train_features")


In [0]:
train.select(
    F.count("*").alias("rows"),
    F.mean("label_purchase").alias("purchase_rate")
).show()

train.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in train.columns]).show()

+-------+------------------+
|   rows|     purchase_rate|
+-------+------------------+
|9239403|0.0681384933636946|
+-------+------------------+

+------------+--------------+----------+---------+---------+-------------+---------+---------+-------------+--------------------+----------+--------+-----------+
|user_session|label_purchase|num_events|num_views|num_carts|num_purchases|avg_price|max_price|avg_price_log|session_duration_sec|is_weekend|avg_hour|day_of_week|
+------------+--------------+----------+---------+---------+-------------+---------+---------+-------------+--------------------+----------+--------+-----------+
|           1|             0|         0|        0|        0|            0|        0|        0|            0|                   0|         0|       0|          0|
+------------+--------------+----------+---------+---------+-------------+---------+---------+-------------+--------------------+----------+--------+-----------+

