In [0]:
# Load the October 2019 e-commerce event log.
# An explicit schema is supplied instead of inferSchema=True so Spark does not
# need an extra pass over the file just to work out column types. The column
# names and types below mirror what schema inference reports for this file.
oct_events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",
    header=True,  # first line holds column names, not data
    schema=(
        "event_time TIMESTAMP, event_type STRING, product_id INT, "
        "category_id BIGINT, category_code STRING, brand STRING, "
        "price DOUBLE, user_id INT, user_session STRING"
    ),
)

In [0]:
# Print the column names, types and nullability of the loaded events DataFrame.
oct_events.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [0]:
# Peek at the first five events without truncating long column values.
oct_events.show(n=5, truncate=False)

+-------------------+----------+----------+-------------------+-----------------------------------+--------+-------+---------+------------------------------------+
|event_time         |event_type|product_id|category_id        |category_code                      |brand   |price  |user_id  |user_session                        |
+-------------------+----------+----------+-------------------+-----------------------------------+--------+-------+---------+------------------------------------+
|2019-10-01 00:00:00|view      |44600062  |2103807459595387724|NULL                               |shiseido|35.79  |541312140|72d76fde-8bb3-4e00-8c23-a032dfed738c|
|2019-10-01 00:00:00|view      |3900821   |2053013552326770905|appliances.environment.water_heater|aqua    |33.2   |554748717|9333dfbd-b87a-4708-9857-6336556b0fcc|
|2019-10-01 00:00:01|view      |17200506  |2053013559792632471|furniture.living_room.sofa         |NULL    |543.1  |519107250|566511c2-e2e3-422b-b695-cf8e6e792ca8|
|2019-10-01 00:0

In [0]:
# Project only the timestamp, event type, product and price columns,
# then display the first ten rows.
(
    oct_events
    .select("event_time", "event_type", "product_id", "price")
    .show(n=10)
)

+-------------------+----------+----------+-------+
|         event_time|event_type|product_id|  price|
+-------------------+----------+----------+-------+
|2019-10-01 00:00:00|      view|  44600062|  35.79|
|2019-10-01 00:00:00|      view|   3900821|   33.2|
|2019-10-01 00:00:01|      view|  17200506|  543.1|
|2019-10-01 00:00:01|      view|   1307067| 251.74|
|2019-10-01 00:00:04|      view|   1004237|1081.98|
|2019-10-01 00:00:05|      view|   1480613| 908.62|
|2019-10-01 00:00:08|      view|  17300353| 380.96|
|2019-10-01 00:00:08|      view|  31500053|  41.16|
|2019-10-01 00:00:10|      view|  28719074| 102.71|
|2019-10-01 00:00:11|      view|   1004545| 566.01|
+-------------------+----------+----------+-------+
only showing top 10 rows


In [0]:
# Keep only events priced above 100 and show ten of them.
(
    oct_events
    .where("price > 100")
    .select("event_type", "product_id", "price")
    .show(n=10)
)

+----------+----------+-------+
|event_type|product_id|  price|
+----------+----------+-------+
|      view|  17200506|  543.1|
|      view|   1307067| 251.74|
|      view|   1004237|1081.98|
|      view|   1480613| 908.62|
|      view|  17300353| 380.96|
|      view|  28719074| 102.71|
|      view|   1004545| 566.01|
|      view|   1005011| 900.64|
|      view|   3900746| 102.38|
|      view|  23100006| 357.79|
+----------+----------+-------+
only showing top 10 rows


In [0]:
from pyspark.sql.functions import count

# Number of events recorded for each event type.
(
    oct_events
    .groupBy("event_type")
    .agg(count("*").alias("cnt"))
    .show()
)

+----------+--------+
|event_type|     cnt|
+----------+--------+
|  purchase|  742849|
|      cart|  926516|
|      view|40779399|
+----------+--------+



In [0]:
# Ten brands with the most events, largest count first.
# Rows with a missing brand are grouped together under NULL.
(
    oct_events
    .groupBy("brand")
    .agg(count("*").alias("cnt"))
    .sort("cnt", ascending=False)
    .show(n=10)
)

+-------+-------+
|  brand|    cnt|
+-------+-------+
|   NULL|6113008|
|samsung|5282775|
|  apple|4122554|
| xiaomi|3083763|
| huawei|1111205|
|lucente| 655861|
|     lg| 562404|
|  bosch| 557090|
|   oppo| 482887|
|   sony| 456644|
+-------+-------+
only showing top 10 rows


In [0]:
# Exporting results
# Build the top-10 brand counts from the event log, export them as CSV, then
# register the exported data as a table. The CSV is written here so this cell
# does not depend on files produced outside the notebook.
csv_path = "/Volumes/workspace/ecommerce/ecommerce_data/top_brands_oct"

top_brands = (
    oct_events
    .groupBy("brand")
    .count()  # produces a column literally named "count"
    .orderBy("count", ascending=False)
    .limit(10)
)
# Overwrite so re-running the cell is idempotent; keep a header row so the
# read below recovers the column names.
top_brands.write.mode("overwrite").csv(csv_path, header=True)

# Read the export back and persist it as a table for SQL access.
top_brands_df = spark.read.csv(csv_path, header=True, inferSchema=True)
top_brands_df.write.mode("overwrite").saveAsTable("workspace.ecommerce.top_brands_oct")

In [0]:
%sql
-- Brand event counts from the exported table, largest first.
SELECT
  brand,
  `count` AS cnt
FROM workspace.ecommerce.top_brands_oct
ORDER BY cnt DESC;

brand,cnt
,6113008
samsung,5282775
apple,4122554
xiaomi,3083763
huawei,1111205
lucente,655861
lg,562404
bosch,557090
oppo,482887
sony,456644
