In [0]:
# Read the 2019-Nov CSV from the workspace volume.
# The first row supplies the column names; column types are inferred by Spark.
df_nov = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

In [0]:
# Preview the first 20 rows; long cell values are truncated for display.
df_nov.show(n=20, truncate=True)

+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:01|      view|  17302664|2053013553853497655|                NULL|   creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:01|      view|   1004775|2053013555631882655|electronics.s

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Five highest-revenue (product_id, brand) pairs.
# Revenue is the sum of `price` over rows whose event_type is "purchase".
revenue = (
    df_nov
    .where(F.col("event_type") == "purchase")
    .groupBy("product_id", "brand")
    .agg(F.sum("price").alias("revenue"))
    .orderBy(F.col("revenue").desc())
    .limit(5)
)


In [0]:
# Display the top-revenue table (at most 20 rows, long values truncated).
revenue.show(n=20, truncate=True)

+----------+-----+--------------------+
|product_id|brand|             revenue|
+----------+-----+--------------------+
|   1005115|apple|2.0625574319999937E7|
|   1005105|apple|1.1445354690000003E7|
|   1005135|apple|   7086522.129999991|
|   1004249|apple|          6815294.62|
|   1002544|apple|   5603193.590000003|
+----------+-----+--------------------+



In [0]:
# Running count of events per user, in event_time order
from pyspark.sql.window import Window

# An ordered window with no explicit frame uses a RANGE frame, under which all
# of a user's rows sharing the same event_time receive the same count. An
# explicit ROWS frame makes the counter advance by exactly one per event.
# NOTE: the relative order of rows with identical event_time is still arbitrary.
window = (
    Window.partitionBy("user_id")
    .orderBy("event_time")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Bind the result to a name so it can be reused; withColumn returns a new
# DataFrame and leaves df_nov unchanged.
df_nov_cumulative = df_nov.withColumn(
    "cumulative_events", F.count("*").over(window)
)
df_nov_cumulative

 

DataFrame[event_time: timestamp, event_type: string, product_id: int, category_id: bigint, category_code: string, brand: string, price: double, user_id: int, user_session: string, cumulative_events: bigint]

In [0]:
# Conversion rate by category: purchases as a percentage of views.
# The pivot produces one count column per event_type value; a category with no
# rows for a given event type gets NULL in that column, not 0.
cr = (
    df_nov.groupBy("category_code")
    .pivot("event_type")
    .count()
    .withColumn(
        "conversion_rate",
        F.when(
            F.col("view") > 0,
            # Treat a missing purchase count as 0 so categories with views but
            # no purchases report a 0% rate instead of NULL.
            F.coalesce(F.col("purchase"), F.lit(0)) / F.col("view") * 100,
        ).otherwise(0),  # no views (NULL or 0): rate defined as 0
    )
)
cr.show()

+--------------------+-----+--------+-------+-------------------+
|       category_code| cart|purchase|   view|    conversion_rate|
+--------------------+-----+--------+-------+-------------------+
|furniture.living_...| 6521|    1562| 417428| 0.3741962685780542|
|      apparel.jumper|  324|      82|  31269|0.26224055774089355|
| stationery.cartrige|  644|     191|  11943| 1.5992631667085322|
|       sport.bicycle| 2227|     536| 106037| 0.5054839348529288|
|        apparel.sock|  101|      19|   3455| 0.5499276410998553|
|appliances.enviro...|   81|      32|   3316| 0.9650180940892641|
|          kids.swing| 1556|     482|  57430| 0.8392826049103257|
|auto.accessories....|   96|      18|   3397| 0.5298793052693553|
|auto.accessories....| 2213|     544|  47145| 1.1538869445328244|
|electronics.audio...| 1615|     489|  44645| 1.0953074252435884|
|  electronics.clocks|69289|   23237|1994440| 1.1650889472734203|
|electronics.audio...| 2280|     696|  60363|  1.153024203568411|
|appliance