In [0]:
# Read the silver-layer e-commerce events Delta table and preview the first rows.
silver_events_path = "/Volumes/workspace/ecommerce/silver/events_delta"
delta_reader = spark.read.format("delta")
df = delta_reader.load(silver_events_path)
df.show(n=5)

+-------------------+----------+----------+-------------------+--------------------+-------------------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|              brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------------------+------+---------+--------------------+
|2019-11-17 08:43:20|      view|   4700422|2053013560899928785|auto.accessories....|            neoline|110.66|544387809|599bce05-c8a4-4c1...|
|2019-11-17 08:43:30|      view|  13200861|2053013557192163841|furniture.bedroom...|               NULL|215.19|566786243|aacc2067-7a61-4eb...|
|2019-11-17 08:43:33|      view|  12703498|2053013553559896355|                NULL|           cordiant| 43.24|514402782|70935233-f586-4f7...|
|2019-11-17 08:43:44|      view|  17303320|2053013553853497655|                NULL|initioparfumsprives| 16.73|514964049|465082e7-77f8-4ba...|

In [0]:
# Summary statistics (count, mean, stddev, min, max) for the price column only.
price_only = df.select("price")
price_summary = price_only.describe()
price_summary.show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|          67401460|
|   mean|292.48192396283747|
| stddev| 355.7357612417038|
|    min|               0.0|
|    max|           2574.07|
+-------+------------------+



In [0]:
# H₀: Average price is the same on weekdays and weekends
# H₁: Average price is different
#
# Welch's t-test (unequal variances), computed from per-group summary
# statistics. Only count / mean / sample stddev per group are needed for the
# test, so they are aggregated in Spark in a single pass and just two rows are
# brought back to the driver, instead of streaming every price value into
# Python lists.

from pyspark.sql.functions import col, when, dayofweek
from pyspark.sql import functions as F
import scipy.stats as stats

# dayofweek() returns 1 = Sunday ... 7 = Saturday, so {1, 7} marks the weekend.
# NOTE(review): the day is derived from event_time as stored; confirm the
# timestamp's time zone matches the intended notion of "weekend".
df_day = df.withColumn("day_type", when(dayofweek("event_time").isin([1,7]), "Weekend").otherwise("Weekday"))

# One row per day_type. Aggregates skip NULL prices, so "n" is the number of
# non-null observations. stddev_samp is the ddof=1 estimator, which is what
# ttest_ind_from_stats expects.
price_stats = {
    row["day_type"]: row
    for row in df_day.groupBy("day_type").agg(
        F.count(col("price")).alias("n"),
        F.mean(col("price")).alias("mean"),
        F.stddev_samp(col("price")).alias("std"),
    ).collect()
}

# Fail with a clear message if a group is absent or too small for a variance.
for group in ("Weekday", "Weekend"):
    if group not in price_stats or price_stats[group]["n"] < 2:
        raise ValueError(f"Need at least 2 non-null prices for the {group} group to run the t-test")

weekday_stats = price_stats["Weekday"]
weekend_stats = price_stats["Weekend"]

t_stat, p_value = stats.ttest_ind_from_stats(
    mean1=weekday_stats["mean"], std1=weekday_stats["std"], nobs1=weekday_stats["n"],
    mean2=weekend_stats["mean"], std2=weekend_stats["std"], nobs2=weekend_stats["n"],
    equal_var=False,
)
print("T-statistic:", t_stat)
print("P-value:", p_value)

# If p-value < 0.05 → difference is statistically significant
# With tens of millions of rows even a tiny difference in means yields a
# p-value that underflows to 0.0, so compare the group means as well before
# concluding the difference matters in practice.

T-statistic: -39.19706077506162
P-value: 0.0


In [0]:
from pyspark.sql.functions import corr

# Correlation of price against the two numeric id columns, computed in a
# single select.
# NOTE(review): product_id and user_id look like identifiers rather than
# measurements; confirm these coefficients are meaningful before relying on them.
price_product_corr = corr("price", "product_id").alias("price_product_corr")
price_user_corr = corr("price", "user_id").alias("price_user_corr")

df.select(price_product_corr, price_user_corr).show()

+--------------------+--------------------+
|  price_product_corr|     price_user_corr|
+--------------------+--------------------+
|-0.18434827499965797|-0.00824957386814...|
+--------------------+--------------------+



In [0]:
# Time-based and price-based feature columns derived from the raw events.
# `col` is imported here so the cell does not depend on an earlier cell's
# imports; `log` stays imported in case later cells use it.
from pyspark.sql.functions import col, dayofweek, hour, log, log1p, when

df_features = (
    df.withColumn("hour_of_day", hour("event_time"))
    # dayofweek(): 1 = Sunday ... 7 = Saturday
    .withColumn("day_of_week", dayofweek("event_time"))
    # Reuse the column just created instead of evaluating dayofweek() twice.
    .withColumn("is_weekend", when(col("day_of_week").isin([1, 7]), 1).otherwise(0))
    # log1p(x) = ln(1 + x): defined at price == 0 and accurate for small prices.
    .withColumn("log_price", log1p(col("price")))
)

df_features.show(5)

+-------------------+----------+----------+-------------------+--------------------+-------------------+------+---------+--------------------+-----------+-----------+----------+------------------+
|         event_time|event_type|product_id|        category_id|       category_code|              brand| price|  user_id|        user_session|hour_of_day|day_of_week|is_weekend|         log_price|
+-------------------+----------+----------+-------------------+--------------------+-------------------+------+---------+--------------------+-----------+-----------+----------+------------------+
|2019-11-17 08:43:20|      view|   4700422|2053013560899928785|auto.accessories....|            neoline|110.66|544387809|599bce05-c8a4-4c1...|          8|          1|         1| 4.715458539882214|
|2019-11-17 08:43:30|      view|  13200861|2053013557192163841|furniture.bedroom...|               NULL|215.19|566786243|aacc2067-7a61-4eb...|          8|          1|         1| 5.376157650666373|
|2019-11-17 08: