In [0]:
# Location of the November 2019 event log (CSV with a header row).
NOV_EVENTS_CSV = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv"

# Explicit schema instead of inferSchema=True: inference makes Spark read the
# whole file one extra time just to guess column types, which is costly for a
# file with tens of millions of rows. Column names and order follow the CSV
# header. Identifier columns are declared BIGINT so that no id can overflow
# (category_id values do not fit in a 32-bit INT).
EVENTS_SCHEMA = ", ".join(
    [
        "event_time TIMESTAMP",
        "event_type STRING",
        "product_id BIGINT",
        "category_id BIGINT",
        "category_code STRING",
        "brand STRING",
        "price DOUBLE",
        "user_id BIGINT",
        "user_session STRING",
    ]
)

# Lazily define the DataFrame; with a supplied schema no scan is needed here.
df_nov = spark.read.csv(NOV_EVENTS_CSV, header=True, schema=EVENTS_SCHEMA)

In [0]:
# Quick look at the loaded data: print the first 10 rows
df_nov.show(n=10)

+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:01|      view|  17302664|2053013553853497655|                NULL|   creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:01|      view|   1004775|2053013555631882655|electronics.s

In [0]:
# Project only the event type, brand and price columns, then print 10 rows
df_nov.select(["event_type", "brand", "price"]).show(n=10)

+----------+--------+------+
|event_type|   brand| price|
+----------+--------+------+
|      view|  xiaomi|489.07|
|      view|  janome|293.65|
|      view|   creed| 28.31|
|      view|      lg|712.87|
|      view|  xiaomi|183.27|
|      view|      hp|360.09|
|      view|      hp|514.56|
|      view| rondell| 30.86|
|      view|michelin| 72.72|
|      view|   apple|732.07|
+----------+--------+------+
only showing top 10 rows


In [0]:
# Count the rows whose price is greater than 100
df_nov.where(df_nov["price"] > 100).count()

44549993

In [0]:
# Number of rows for each distinct event_type
(
    df_nov
    .groupBy("event_type")
    .count()
    .show()
)

+----------+--------+
|event_type|   count|
+----------+--------+
|  purchase|  916939|
|      cart| 3028930|
|      view|63556110|
+----------+--------+



In [0]:
# Row count per brand, sorted from most to least frequent, kept as a new DataFrame
top_brands = (
    df_nov
    .groupBy("brand")
    .count()
    .sort("count", ascending=False)
)

In [0]:
# Print the first 20 rows of the per-brand counts (20 is also show()'s default)
top_brands.show(n=20)

+--------+-------+
|   brand|  count|
+--------+-------+
|    NULL|9218235|
| samsung|7889245|
|   apple|6259379|
|  xiaomi|4638062|
|  huawei|1410126|
| lucente|1185075|
|      lg|1096990|
|   bosch| 975059|
|    oppo| 811698|
|    sony| 798457|
| respect| 765331|
|  lenovo| 727390|
|    acer| 698976|
|cordiant| 671671|
|   artel| 664281|
|      hp| 515352|
|    asus| 469241|
| redmond| 462034|
| philips| 456155|
| indesit| 428863|
+--------+-------+
only showing top 20 rows


In [0]:
# Window function: rank every row by price (highest first) within its brand
from pyspark.sql.functions import col, rank
from pyspark.sql.window import Window

# One window partition per brand, ordered by descending price
window_spec = (
    Window
    .partitionBy("brand")
    .orderBy(col("price").desc())
)

# Rows with equal price inside a brand share the same rank value
(
    df_nov
    .withColumn("rank", rank().over(window_spec))
    .show()
)

+-------------------+----------+----------+-------------------+-------------+------+-----+---------+--------------------+----+
|         event_time|event_type|product_id|        category_id|category_code| brand|price|  user_id|        user_session|rank|
+-------------------+----------+----------+-------------------+-------------+------+-----+---------+--------------------+----+
|2019-11-03 07:07:01|      view|  15200062|2053013553484398879|         NULL|a-mega|218.8|519074314|ce183dfe-29a9-4a5...|   1|
|2019-11-03 07:35:31|      view|  15200062|2053013553484398879|         NULL|a-mega|218.8|546058653|50baeb20-c5d7-4c4...|   1|
|2019-11-03 10:55:47|      view|  15200062|2053013553484398879|         NULL|a-mega|218.8|553722250|b2cdd99f-03b6-42f...|   1|
|2019-11-03 10:57:10|      view|  15200062|2053013553484398879|         NULL|a-mega|218.8|553722250|b2cdd99f-03b6-42f...|   1|
|2019-11-06 11:13:12|      view|  15200062|2053013553484398879|         NULL|a-mega|218.8|532945915|06c37fde-c3