In [0]:
df_oct = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")

In [0]:
from pyspark.sql import functions as F

df_oct = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")

df_oct.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [0]:
df_oct.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("bronze_events_oct")

In [0]:
events = spark.table("bronze_events_oct")

In [0]:
events_clean = (
    events
    .filter(F.col("user_id").isNotNull())
    .filter(F.col("price").isNotNull())
    .dropDuplicates()
)

In [0]:
features_df = events_clean.groupBy("user_id").agg(
    F.count("*").alias("total_events"),
    F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0)).alias("total_purchases"),
    F.sum("price").alias("total_spent"),
    F.avg("price").alias("avg_price"),
    F.max("event_time").alias("last_activity")
)

In [0]:
features_df = features_df.withColumn(
    "purchase_ratio",
    F.col("total_purchases") / F.col("total_events")
)

In [0]:
features_df.groupBy("user_id").count().filter("count > 1").show()

+-------+-----+
|user_id|count|
+-------+-----+
+-------+-----+



In [0]:
features_df.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("silver_user_features_oct")

In [0]:
spark.sql("DESCRIBE DETAIL silver_user_features_oct").show(truncate=False)

+------+------------------------------------+------------------------------------------+-----------+--------+-----------------------+-------------------+----------------+-----------------+--------+-----------+------------------------------------------------------------------------------+----------------+----------------+-----------------------------------------+---------------------------------------------------------------+-------------+
|format|id                                  |name                                      |description|location|createdAt              |lastModified       |partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties                                                                    |minReaderVersion|minWriterVersion|tableFeatures                            |statistics                                                     |clusterByAuto|
+------+------------------------------------+------------------------------------------+-----------+--------+-----

In [0]:
features_df.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in features_df.columns
]).show()

+-------+------------+---------------+-----------+---------+-------------+--------------+
|user_id|total_events|total_purchases|total_spent|avg_price|last_activity|purchase_ratio|
+-------+------------+---------------+-----------+---------+-------------+--------------+
|      0|           0|              0|          0|        0|            0|             0|
+-------+------------+---------------+-----------+---------+-------------+--------------+



In [0]:
features_df.describe().show()

+-------+--------------------+-----------------+------------------+------------------+------------------+--------------------+
|summary|             user_id|     total_events|   total_purchases|       total_spent|         avg_price|      purchase_ratio|
+-------+--------------------+-----------------+------------------+------------------+------------------+--------------------+
|  count|             3022290|          3022290|           3022290|           3022290|           3022290|             3022290|
|   mean| 5.404673750553795E8| 14.0352328863213|0.2457649663003881| 4074.614326619929|316.22048039931764|0.013012385345514358|
| stddev|1.9471434388507392E7|32.75705325743546|1.4093212679874434|11221.045708482596| 328.7386552884056|0.049632433488831354|
|    min|            33869381|                1|                 0|               0.0|               0.0|                 0.0|
|    max|           566280860|             7436|               321| 1993636.409999998|           2574.07|      

In [0]:
features_df.orderBy(F.col("total_spent").desc()).show(5)

+---------+------------+---------------+------------------+------------------+-------------------+-------------------+
|  user_id|total_events|total_purchases|       total_spent|         avg_price|      last_activity|     purchase_ratio|
+---------+------------+---------------+------------------+------------------+-------------------+-------------------+
|563459593|        1950|              0| 1993636.409999998|1022.3776461538451|2019-10-31 18:47:37|                0.0|
|536399452|        1803|              0|1692370.9200000013| 938.6416638935116|2019-10-31 15:00:55|                0.0|
|512365995|        4013|              0| 1499595.880000002|373.68449538998306|2019-10-31 16:44:33|                0.0|
|561163588|        1444|              0|1451590.7799999982|1005.2567728531843|2019-10-23 19:32:57|                0.0|
|545925192|        1420|            115|1265846.7000000074| 891.4413380281742|2019-10-31 19:49:27|0.08098591549295775|
+---------+------------+---------------+--------