In [0]:
%sql
-- Describe Silver table: list every column of ecommerce.silver.daily_sales
-- together with its data type and comment.
DESCRIBE TABLE ecommerce.silver.daily_sales;



col_name,data_type,comment
event_date,date,
event_type,string,
total_events,bigint,
total_revenue,double,


In [0]:
%sql
-- Describe Bronze table: list every column of ecommerce.bronze.events
-- together with its data type and comment.
DESCRIBE TABLE ecommerce.bronze.events;



col_name,data_type,comment
event_time,timestamp,
event_type,string,
product_id,int,
category_id,bigint,
category_code,string,
brand,string,
price,double,
user_id,int,
user_session,string,
ingestion_time,timestamp,


In [0]:
%sql
-- Describe Gold tables: list the columns, data types and comments of
-- ecommerce.gold.products and ecommerce.gold.top_products.
-- NOTE(review): two statements run in this cell but only one schema is
-- rendered beneath it -- presumably the result of the last statement.
-- Confirm, and consider one DESCRIBE per cell so both schemas stay visible.
DESCRIBE TABLE ecommerce.gold.products;

DESCRIBE TABLE ecommerce.gold.top_products;

col_name,data_type,comment
event_date,date,
event_type,string,
total_events,bigint,
total_revenue,double,


- ## Calculate statistical summaries

In [0]:


# Load the Bronze events table as a DataFrame for the analyses that follow.
events = spark.read.table("ecommerce.bronze.events")

In [0]:

# Summary statistics (count, mean, stddev, min, max) for the price column.
events.select("price").describe().show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|          67501979|
|   mean| 292.4593165645631|
| stddev|355.67449958606727|
|    min|               0.0|
|    max|           2574.07|
+-------+------------------+




- ## Hypothesis Testing (Weekday vs Weekend)

In [0]:


from pyspark.sql import functions as F

# Tag every event with its calendar date and a weekend flag.
# Spark's dayofweek() returns 1 for Sunday through 7 for Saturday,
# so the values 1 and 7 identify the weekend.
weekday = (
    events
    .withColumn("event_date", F.to_date(F.col("event_time")))
    .withColumn("is_weekend", F.dayofweek(F.col("event_date")).isin(1, 7))
)



In [0]:
# Count events for each (is_weekend, event_type) pair and print them
# sorted by the same two keys.
(
    weekday
    .groupBy("is_weekend", "event_type")
    .count()
    .orderBy("is_weekend", "event_type")
    .show()
)


+----------+----------+--------+
|is_weekend|event_type|   count|
+----------+----------+--------+
|     false|      cart| 1799242|
|     false|  purchase|  500258|
|     false|      view|40453993|
|      true|      cart| 1229688|
|      true|  purchase|  416681|
|      true|      view|23102117|
+----------+----------+--------+




- ## Identify Correlations


In [0]:

# Add a 0/1 indicator that is 1 only for "purchase" events and 0 otherwise.
# Despite its name, the column is a per-event flag, not an aggregated rate.
purchase_flag = F.when(
    F.col("event_type") == "purchase", F.lit(1)
).otherwise(F.lit(0))
events = events.withColumn("conversion_rate", purchase_flag)

In [0]:
# Pearson correlation between price and the "conversion_rate" column.
events.corr("price", "conversion_rate")

0.0025286683578105845


- ## Feature Engineering for Machine Learning

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Unordered per-user window: the aggregate below sees every event of the user,
# so no ordered frame is needed.
user_events = Window.partitionBy("user_id")

# Derive time-based and price-based features for each event.
features = (
    events
    # event_date must exist before day_of_week is derived from it.
    .withColumn("event_date", F.to_date("event_time"))
    .withColumn("hour", F.hour("event_time"))
    # Spark's dayofweek() returns 1 for Sunday through 7 for Saturday.
    .withColumn("day_of_week", F.dayofweek("event_date"))
    # log(1 + price); log1p stays accurate for prices close to zero.
    .withColumn("price_log", F.log1p("price"))
    # Seconds between this event and the user's earliest event.
    # min() ignores NULL timestamps, whereas first() over an ordered window
    # would return NULL for the whole user if any event_time were NULL
    # (NULLs sort first in ascending order).
    # NOTE(review): the baseline is the user's first event of ANY type, not
    # specifically the first "view" -- confirm the column name matches intent.
    .withColumn(
        "time_since_first_view",
        F.unix_timestamp("event_time")
        - F.unix_timestamp(F.min("event_time").over(user_events)),
    )
)



In [0]:
# Preview the first 10 rows of the identifier columns and engineered features.
features.select(
    [
        "user_id",
        "product_id",
        "brand",
        "hour",
        "day_of_week",
        "price_log",
        "time_since_first_view",
    ]
).show(n=10)


+---------+----------+-------+----+-----------+------------------+---------------------+
|  user_id|product_id|  brand|hour|day_of_week|         price_log|time_since_first_view|
+---------+----------+-------+----+-----------+------------------+---------------------+
| 65800726|  22300003| xiaomi|   4|          4| 4.416428061391214|                    0|
| 65800726|  22300003| xiaomi|   4|          4| 4.416428061391214|                  128|
| 81255481|  16400235|bergner|   7|          6| 4.209902902856373|                    0|
| 81255481|  16400235|bergner|  14|          5| 4.206779991551889|              1146401|
|106416780|  31501001|   NULL|   5|          5|5.5511362181719965|                    0|
|106416780|   2500141|samsung|   5|          5| 6.167768031576373|                  223|
|106416780|   2501450|samsung|   5|          5|6.1124424736608844|                  286|
|106416780|   2501450|samsung|   5|          5|6.1124424736608844|                  328|
|117019800|  35000021