In [7]:
from db_tools.setup import setup
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,FloatType,DecimalType

In [3]:
spark = setup()

23/12/30 12:42:16 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/12/30 12:42:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/12/30 12:42:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SparkSession available as "spark"


In [4]:
def show_query(query,SparkSession = spark,n = 20):
    return SparkSession.sql(query).show()

In [5]:
product_group_schema = StructType([ \
    StructField("group_id",IntegerType(),True), \
    StructField("group_name",StringType(),True)
  ])

product_schema = StructType([ \
    StructField("product_id",IntegerType(),True), \
    StructField("product_name",StringType(),True),\
    StructField("group_id",IntegerType(),True),\
    StructField("price",FloatType(),True), \
  ])

product_groups =   [(1,'Smartphone'),
                    (2,'Laptop'),
                    (3,'Tablet')]
products = [(1,'Microsoft Lumia', 1, 200.00),
            (2,'HTC One', 1, 400.00),
        	(3,'Nexus', 1, 500.00),
        	(4,'iPhone', 1, 900.00),
        	(5,'HP Elite', 2, 1200.00),
        	(6,'Lenovo Thinkpad', 2, 700.00),
        	(7,'Sony VAIO', 2, 700.00),
        	(8,'Dell Vostro', 2, 800.00),
        	(9,'iPad', 3, 700.00),
        	(10,'Kindle Fire', 3, 150.00),
        	(11,'Samsung Galaxy Tab', 3, 200.00)]
rdd = spark.sparkContext.parallelize(product_groups)
spark.createDataFrame(rdd, schema = product_group_schema).createOrReplaceTempView('product_groups')
rdd = spark.sparkContext.parallelize(products)
spark.createDataFrame(rdd, schema = product_schema).createOrReplaceTempView('products')

```
SELECT
	product_name,
	price,
	group_name,
	AVG (price) OVER (
	   PARTITION BY group_name
	) AS avg_price
FROM
	products
	INNER JOIN 
		product_groups USING (group_id);
```

In [11]:
df1 = spark.sql('SELECT * FROM products')
df2 = spark.sql('SELECT * FROM product_groups')

# First Example: fixed window

In [35]:
windowSpec = Window\
.partitionBy("group_name")
avg_price = F.avg(F.col('price')).over(windowSpec)#.alias('avg_price')

In [36]:
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',avg_price).show()

+------------------+------+----------+--------------------------------------------------------------------------------------------------+
|      product_name| price|group_name|avg(price) OVER (PARTITION BY group_name ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)|
+------------------+------+----------+--------------------------------------------------------------------------------------------------+
|          HP Elite|1200.0|    Laptop|                                                                                             850.0|
|   Lenovo Thinkpad| 700.0|    Laptop|                                                                                             850.0|
|         Sony VAIO| 700.0|    Laptop|                                                                                             850.0|
|       Dell Vostro| 800.0|    Laptop|                                                                                             850.0|
|   Microsoft Lumia| 200.0|Smartph

                                                                                

Note that for this example the window is defined by the number of elements in a group_name. So, for laptop the mean is
(1200+700+700+800)/4 = 850.00

## Be Careful when specifying the window

In [29]:
windowSpec = Window\
.partitionBy("group_name")\
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
avg_price = F.avg(F.col('price')).over(windowSpec).alias('avg_price')

In [30]:
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',avg_price).show()

+------------------+------+----------+-----------------+
|      product_name| price|group_name|        avg_price|
+------------------+------+----------+-----------------+
|          HP Elite|1200.0|    Laptop|           1200.0|
|   Lenovo Thinkpad| 700.0|    Laptop|            950.0|
|         Sony VAIO| 700.0|    Laptop|866.6666666666666|
|       Dell Vostro| 800.0|    Laptop|            850.0|
|   Microsoft Lumia| 200.0|Smartphone|            200.0|
|           HTC One| 400.0|Smartphone|            300.0|
|             Nexus| 500.0|Smartphone|366.6666666666667|
|            iPhone| 900.0|Smartphone|            500.0|
|              iPad| 700.0|    Tablet|            700.0|
|       Kindle Fire| 150.0|    Tablet|            425.0|
|Samsung Galaxy Tab| 200.0|    Tablet|            350.0|
+------------------+------+----------+-----------------+



For the second example, note that the window is expanding from the first group_name found until the last.
Because of that, the first avg_price is 1200, since there is only one product in the window at this time. Then, it
becomes 950.00, since only Lenovo Thinkpad joins the window. At last, note that the mean becomes 850.00, the answer for
all the average prices in the answer above, because only at the end all the products join the window

## Other possibility: contracting window

In [39]:
windowSpec = Window\
.partitionBy("group_name")\
.rowsBetween(Window.currentRow, Window.unboundedFollowing)
avg_price = F.avg(F.col('price')).over(windowSpec).alias('avg_price')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',avg_price).show()

+------------------+------+----------+-----------------+
|      product_name| price|group_name|        avg_price|
+------------------+------+----------+-----------------+
|          HP Elite|1200.0|    Laptop|            850.0|
|   Lenovo Thinkpad| 700.0|    Laptop|733.3333333333334|
|         Sony VAIO| 700.0|    Laptop|            750.0|
|       Dell Vostro| 800.0|    Laptop|            800.0|
|   Microsoft Lumia| 200.0|Smartphone|            500.0|
|           HTC One| 400.0|Smartphone|            600.0|
|             Nexus| 500.0|Smartphone|            700.0|
|            iPhone| 900.0|Smartphone|            900.0|
|              iPad| 700.0|    Tablet|            350.0|
|       Kindle Fire| 150.0|    Tablet|            175.0|
|Samsung Galaxy Tab| 200.0|    Tablet|            200.0|
+------------------+------+----------+-----------------+



For the third example, note that the window is contracting, since it starts with 850.00, the average for all items for the laptop group, 
going to 733.33, 750 and finally 800, the are respectively the averages by removing the first, second and third item. 

# Expliciting the Example 1 effect in rows between

In [37]:
windowSpec = Window\
.partitionBy("group_name")\
.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
avg_price = F.avg(F.col('price')).over(windowSpec).alias('avg_price')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',avg_price).show()

+------------------+------+----------+---------+
|      product_name| price|group_name|avg_price|
+------------------+------+----------+---------+
|          HP Elite|1200.0|    Laptop|    850.0|
|   Lenovo Thinkpad| 700.0|    Laptop|    850.0|
|         Sony VAIO| 700.0|    Laptop|    850.0|
|       Dell Vostro| 800.0|    Laptop|    850.0|
|   Microsoft Lumia| 200.0|Smartphone|    500.0|
|           HTC One| 400.0|Smartphone|    500.0|
|             Nexus| 500.0|Smartphone|    500.0|
|            iPhone| 900.0|Smartphone|    500.0|
|              iPad| 700.0|    Tablet|    350.0|
|       Kindle Fire| 150.0|    Tablet|    350.0|
|Samsung Galaxy Tab| 200.0|    Tablet|    350.0|
+------------------+------+----------+---------+



# Example 2

In [49]:
windowSpec = Window\
.partitionBy("group_name").orderBy('price')
row_number = F.row_number().over(windowSpec).alias('row_number_call')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',row_number).show()

+------------------+------+----------+---------------+
|      product_name| price|group_name|row_number_call|
+------------------+------+----------+---------------+
|   Lenovo Thinkpad| 700.0|    Laptop|              1|
|         Sony VAIO| 700.0|    Laptop|              2|
|       Dell Vostro| 800.0|    Laptop|              3|
|          HP Elite|1200.0|    Laptop|              4|
|   Microsoft Lumia| 200.0|Smartphone|              1|
|           HTC One| 400.0|Smartphone|              2|
|             Nexus| 500.0|Smartphone|              3|
|            iPhone| 900.0|Smartphone|              4|
|       Kindle Fire| 150.0|    Tablet|              1|
|Samsung Galaxy Tab| 200.0|    Tablet|              2|
|              iPad| 700.0|    Tablet|              3|
+------------------+------+----------+---------------+



## Expliciting the rows between relation

In [51]:
windowSpec = Window\
.partitionBy("group_name").rowsBetween(Window.unboundedPreceding, Window.currentRow).orderBy('price')
row_number = F.row_number().over(windowSpec).alias('row_number_call')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',row_number).show()

+------------------+------+----------+---------------+
|      product_name| price|group_name|row_number_call|
+------------------+------+----------+---------------+
|   Lenovo Thinkpad| 700.0|    Laptop|              1|
|         Sony VAIO| 700.0|    Laptop|              2|
|       Dell Vostro| 800.0|    Laptop|              3|
|          HP Elite|1200.0|    Laptop|              4|
|   Microsoft Lumia| 200.0|Smartphone|              1|
|           HTC One| 400.0|Smartphone|              2|
|             Nexus| 500.0|Smartphone|              3|
|            iPhone| 900.0|Smartphone|              4|
|       Kindle Fire| 150.0|    Tablet|              1|
|Samsung Galaxy Tab| 200.0|    Tablet|              2|
|              iPad| 700.0|    Tablet|              3|
+------------------+------+----------+---------------+



# RANK

In [54]:
windowSpec = Window\
.partitionBy("group_name").orderBy('price')
rank = F.rank().over(windowSpec).alias('price_rank')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',rank).orderBy('price_rank',ascending = False).show()

+------------------+------+----------+----------+
|      product_name| price|group_name|price_rank|
+------------------+------+----------+----------+
|          HP Elite|1200.0|    Laptop|         4|
|            iPhone| 900.0|Smartphone|         4|
|       Dell Vostro| 800.0|    Laptop|         3|
|             Nexus| 500.0|Smartphone|         3|
|              iPad| 700.0|    Tablet|         3|
|           HTC One| 400.0|Smartphone|         2|
|Samsung Galaxy Tab| 200.0|    Tablet|         2|
|   Lenovo Thinkpad| 700.0|    Laptop|         1|
|         Sony VAIO| 700.0|    Laptop|         1|
|   Microsoft Lumia| 200.0|Smartphone|         1|
|       Kindle Fire| 150.0|    Tablet|         1|
+------------------+------+----------+----------+



# DENSE_RANK

In [58]:
windowSpec = Window\
.partitionBy("group_name").orderBy('price')
rank = F.dense_rank().over(windowSpec).alias('price_rank')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',rank).show()

+------------------+------+----------+----------+
|      product_name| price|group_name|price_rank|
+------------------+------+----------+----------+
|   Lenovo Thinkpad| 700.0|    Laptop|         1|
|         Sony VAIO| 700.0|    Laptop|         1|
|       Dell Vostro| 800.0|    Laptop|         2|
|          HP Elite|1200.0|    Laptop|         3|
|   Microsoft Lumia| 200.0|Smartphone|         1|
|           HTC One| 400.0|Smartphone|         2|
|             Nexus| 500.0|Smartphone|         3|
|            iPhone| 900.0|Smartphone|         4|
|       Kindle Fire| 150.0|    Tablet|         1|
|Samsung Galaxy Tab| 200.0|    Tablet|         2|
|              iPad| 700.0|    Tablet|         3|
+------------------+------+----------+----------+



# FIRST_VALUE and LAST_VALUE - Not implemented on pyspark 3.4.1 API, only in 3.5.0  - [reference](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.first_value.html)

In [59]:
windowSpec = Window\
.partitionBy("group_name").orderBy('price')
first_value = F.first_value().over(windowSpec).alias('lowest_price_per_group')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',first_value).show()

AttributeError: module 'pyspark.sql.functions' has no attribute 'first_value'

# LAG

In [64]:
windowSpec = Window\
.partitionBy("group_name").orderBy('price')
lag = F.lag('price',1).over(windowSpec).alias('prev_price')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',lag,(F.col('price')-F.col('prev_price')).alias('price_difference')).show()

+------------------+------+----------+----------+----------------+
|      product_name| price|group_name|prev_price|price_difference|
+------------------+------+----------+----------+----------------+
|   Lenovo Thinkpad| 700.0|    Laptop|      null|            null|
|         Sony VAIO| 700.0|    Laptop|     700.0|             0.0|
|       Dell Vostro| 800.0|    Laptop|     700.0|           100.0|
|          HP Elite|1200.0|    Laptop|     800.0|           400.0|
|   Microsoft Lumia| 200.0|Smartphone|      null|            null|
|           HTC One| 400.0|Smartphone|     200.0|           200.0|
|             Nexus| 500.0|Smartphone|     400.0|           100.0|
|            iPhone| 900.0|Smartphone|     500.0|           400.0|
|       Kindle Fire| 150.0|    Tablet|      null|            null|
|Samsung Galaxy Tab| 200.0|    Tablet|     150.0|            50.0|
|              iPad| 700.0|    Tablet|     200.0|           500.0|
+------------------+------+----------+----------+-------------

# LAG with coalesce

In [67]:
windowSpec = Window\
.partitionBy("group_name").orderBy('price')
lag = F.lag('price',1).over(windowSpec).alias('prev_price')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',F.coalesce(lag,F.lit(0.0)).alias('prev_price'),F.coalesce(F.col('price')-F.col('prev_price'),F.lit(0.0)).alias('price_difference')).show()

+------------------+------+----------+----------+----------------+
|      product_name| price|group_name|prev_price|price_difference|
+------------------+------+----------+----------+----------------+
|   Lenovo Thinkpad| 700.0|    Laptop|       0.0|           700.0|
|         Sony VAIO| 700.0|    Laptop|     700.0|             0.0|
|       Dell Vostro| 800.0|    Laptop|     700.0|           100.0|
|          HP Elite|1200.0|    Laptop|     800.0|           400.0|
|   Microsoft Lumia| 200.0|Smartphone|       0.0|           200.0|
|           HTC One| 400.0|Smartphone|     200.0|           200.0|
|             Nexus| 500.0|Smartphone|     400.0|           100.0|
|            iPhone| 900.0|Smartphone|     500.0|           400.0|
|       Kindle Fire| 150.0|    Tablet|       0.0|           150.0|
|Samsung Galaxy Tab| 200.0|    Tablet|     150.0|            50.0|
|              iPad| 700.0|    Tablet|     200.0|           500.0|
+------------------+------+----------+----------+-------------

# LEAD

In [68]:
windowSpec = Window\
.partitionBy("group_name").orderBy('price')
lead = F.lead('price',1).over(windowSpec).alias('prev_price')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',lead,(F.col('price')-F.col('prev_price')).alias('price_difference')).show()

+------------------+------+----------+----------+----------------+
|      product_name| price|group_name|prev_price|price_difference|
+------------------+------+----------+----------+----------------+
|   Lenovo Thinkpad| 700.0|    Laptop|     700.0|             0.0|
|         Sony VAIO| 700.0|    Laptop|     800.0|          -100.0|
|       Dell Vostro| 800.0|    Laptop|    1200.0|          -400.0|
|          HP Elite|1200.0|    Laptop|      null|            null|
|   Microsoft Lumia| 200.0|Smartphone|     400.0|          -200.0|
|           HTC One| 400.0|Smartphone|     500.0|          -100.0|
|             Nexus| 500.0|Smartphone|     900.0|          -400.0|
|            iPhone| 900.0|Smartphone|      null|            null|
|       Kindle Fire| 150.0|    Tablet|     200.0|           -50.0|
|Samsung Galaxy Tab| 200.0|    Tablet|     700.0|          -500.0|
|              iPad| 700.0|    Tablet|      null|            null|
+------------------+------+----------+----------+-------------

# LEAD with COALESCE

In [69]:
windowSpec = Window\
.partitionBy("group_name").orderBy('price')
lead = F.lead('price',1).over(windowSpec).alias('prev_price')
df1.join(df2,how = 'inner',on = 'group_id').select('product_name','price','group_name',F.coalesce(lead,F.lit(0.0)).alias('prev_price'),F.coalesce(F.col('price')-F.col('prev_price'),F.lit(0.0)).alias('price_difference')).show()

+------------------+------+----------+----------+----------------+
|      product_name| price|group_name|prev_price|price_difference|
+------------------+------+----------+----------+----------------+
|   Lenovo Thinkpad| 700.0|    Laptop|     700.0|             0.0|
|         Sony VAIO| 700.0|    Laptop|     800.0|          -100.0|
|       Dell Vostro| 800.0|    Laptop|    1200.0|          -400.0|
|          HP Elite|1200.0|    Laptop|       0.0|          1200.0|
|   Microsoft Lumia| 200.0|Smartphone|     400.0|          -200.0|
|           HTC One| 400.0|Smartphone|     500.0|          -100.0|
|             Nexus| 500.0|Smartphone|     900.0|          -400.0|
|            iPhone| 900.0|Smartphone|       0.0|           900.0|
|       Kindle Fire| 150.0|    Tablet|     200.0|           -50.0|
|Samsung Galaxy Tab| 200.0|    Tablet|     700.0|          -500.0|
|              iPad| 700.0|    Tablet|       0.0|           700.0|
+------------------+------+----------+----------+-------------

# CUME_DIST

In [70]:
sale_stats_schema = StructType([ \
    StructField("name",StringType(),True), \
    StructField("year",IntegerType(),True),\
    StructField("amount",IntegerType(),True),\
  ])

sale_stats_data =   [('John Doe',2018,120000),
                    ('Jane Doe',2018,110000),
                    ('Jack Daniel',2018,150000),
                    ('Yin Yang',2018,30000),
                    ('Stephane Heady',2018,200000),
                    ('John Doe',2019,150000),
                    ('Jane Doe',2019,130000),
                    ('Jack Daniel',2019,180000),
                    ('Yin Yang',2019,25000),
                    ('Stephane Heady',2019,270000)]

rdd = spark.sparkContext.parallelize(sale_stats_data)
spark.createDataFrame(rdd, schema = sale_stats_schema).createOrReplaceTempView('sales_stats')

In [71]:
df = spark.sql('SELECT * FROM sales_stats')

## Without partition (BAD PRACTICE!)

## Order matters when applying a filter and windowing

## Always bear in mind the following

<center><img width=250 src="https://global.discourse-cdn.com/codecademy/original/5X/8/4/f/1/84f173795dc109601315a9ebe158a9c102f88337.png"/></center>

In [129]:
windowSpec =  Window.orderBy('amount')
cume_dist = F.cume_dist().over(windowSpec).alias('ECDF')
df.select('name','year','amount',cume_dist).filter(F.col('year') == 2018).show()

23/12/30 14:51:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:51:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:51:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------------+----+------+----+
|          name|year|amount|ECDF|
+--------------+----+------+----+
|      Yin Yang|2018| 30000| 0.2|
|      Jane Doe|2018|110000| 0.3|
|      John Doe|2018|120000| 0.4|
|   Jack Daniel|2018|150000| 0.7|
|Stephane Heady|2018|200000| 0.9|
+--------------+----+------+----+



23/12/30 14:51:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:51:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [127]:
query = """SELECT 
    name,
    year, 
    amount,
    CUME_DIST() OVER (
        ORDER BY amount
    ) AS ECDF
FROM 
    sales_stats
WHERE 
    year = 2018;
"""
spark.sql(query).show()#.explain()

23/12/30 14:50:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:50:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:50:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------------+----+------+----+
|          name|year|amount|ECDF|
+--------------+----+------+----+
|      Yin Yang|2018| 30000| 0.2|
|      Jane Doe|2018|110000| 0.4|
|      John Doe|2018|120000| 0.6|
|   Jack Daniel|2018|150000| 0.8|
|Stephane Heady|2018|200000| 1.0|
+--------------+----+------+----+



23/12/30 14:50:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:50:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


### Let us see what went wrong here...

In [128]:
query = """SELECT 
    name,
    year, 
    amount,
    CUME_DIST() OVER (
        ORDER BY amount
    ) AS ECDF
FROM 
    sales_stats
WHERE 
    year = 2018;
"""
spark.sql(query).explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Window [cume_dist() windowspecdefinition(amount#1309 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS ECDF#1817], [amount#1309 ASC NULLS FIRST]
   +- Sort [amount#1309 ASC NULLS FIRST], false, 0
      +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=7970]
         +- Filter (isnotnull(year#1308) AND (year#1308 = 2018))
            +- Scan ExistingRDD[name#1307,year#1308,amount#1309]




23/12/30 14:51:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:51:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:51:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [130]:
windowSpec =  Window.orderBy('amount')
cume_dist = F.cume_dist().over(windowSpec).alias('ECDF')
df.select('name','year','amount',cume_dist).filter(F.col('year') == 2018).explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (isnotnull(year#1308) AND (year#1308 = 2018))
   +- Window [cume_dist() windowspecdefinition(amount#1309 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS ECDF#1847], [amount#1309 ASC NULLS FIRST]
      +- Sort [amount#1309 ASC NULLS FIRST], false, 0
         +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=8044]
            +- Scan ExistingRDD[name#1307,year#1308,amount#1309]




23/12/30 14:52:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:52:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:52:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


The order is different from the query! Let's use the filter first and check what happens!

In [131]:
df.filter(F.col('year') == 2018).select('name','year','amount',cume_dist).explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Window [cume_dist() windowspecdefinition(amount#1309 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS ECDF#1847], [amount#1309 ASC NULLS FIRST]
   +- Sort [amount#1309 ASC NULLS FIRST], false, 0
      +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=8060]
         +- Filter (isnotnull(year#1308) AND (year#1308 = 2018))
            +- Scan ExistingRDD[name#1307,year#1308,amount#1309]




23/12/30 14:53:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:53:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:53:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Now the Plan its equal to the SQL method. Let's do a final check

In [132]:
df.filter(F.col('year') == 2018).select('name','year','amount',cume_dist).show()

23/12/30 14:54:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:54:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:54:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------------+----+------+----+
|          name|year|amount|ECDF|
+--------------+----+------+----+
|      Yin Yang|2018| 30000| 0.2|
|      Jane Doe|2018|110000| 0.4|
|      John Doe|2018|120000| 0.6|
|   Jack Daniel|2018|150000| 0.8|
|Stephane Heady|2018|200000| 1.0|
+--------------+----+------+----+



23/12/30 14:54:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 14:54:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


## With partition

In [110]:
windowSpec =  Window.partitionBy('year').orderBy('amount')#.rangeBetween(Window.unboundedPreceding, Window.currentRow)
cume_dist = F.cume_dist().over(windowSpec).alias('ECDF')
df.select('name','year','amount',cume_dist).show()

+--------------+----+------+----+
|          name|year|amount|ECDF|
+--------------+----+------+----+
|      Yin Yang|2018| 30000| 0.2|
|      Jane Doe|2018|110000| 0.4|
|      John Doe|2018|120000| 0.6|
|   Jack Daniel|2018|150000| 0.8|
|Stephane Heady|2018|200000| 1.0|
|      Yin Yang|2019| 25000| 0.2|
|      Jane Doe|2019|130000| 0.4|
|      John Doe|2019|150000| 0.6|
|   Jack Daniel|2019|180000| 0.8|
|Stephane Heady|2019|270000| 1.0|
+--------------+----+------+----+



# NTH_VALUE

## Without partition (BAD PRACTICE, without a good reason, of course)

In [144]:
windowSpec =  Window.orderBy(F.desc('price')).rangeBetween(Window.unboundedPreceding,Window.unboundedFollowing)
nth_value = F.nth_value('product_name',2).over(windowSpec).alias('nth_value_call')
df1.select('product_id',
           'product_name',
           'price',
            nth_value).show()

23/12/30 17:28:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 17:28:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 17:28:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------------+------+--------------+
|product_id|      product_name| price|nth_value_call|
+----------+------------------+------+--------------+
|         5|          HP Elite|1200.0|        iPhone|
|         4|            iPhone| 900.0|        iPhone|
|         8|       Dell Vostro| 800.0|        iPhone|
|         6|   Lenovo Thinkpad| 700.0|        iPhone|
|         7|         Sony VAIO| 700.0|        iPhone|
|         9|              iPad| 700.0|        iPhone|
|         3|             Nexus| 500.0|        iPhone|
|         2|           HTC One| 400.0|        iPhone|
|         1|   Microsoft Lumia| 200.0|        iPhone|
|        11|Samsung Galaxy Tab| 200.0|        iPhone|
|        10|       Kindle Fire| 150.0|        iPhone|
+----------+------------------+------+--------------+



23/12/30 17:28:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 17:28:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [147]:
windowSpec =  Window.partitionBy('group_id').orderBy(F.desc('price')).rangeBetween(Window.unboundedPreceding,Window.unboundedFollowing)
nth_value = F.nth_value('product_name',2).over(windowSpec).alias('nth_value_call')
df1.select('product_id',
           'product_name',
           'group_id',
           'price',
            nth_value).show()

+----------+------------------+--------+------+------------------+
|product_id|      product_name|group_id| price|    nth_value_call|
+----------+------------------+--------+------+------------------+
|         4|            iPhone|       1| 900.0|             Nexus|
|         3|             Nexus|       1| 500.0|             Nexus|
|         2|           HTC One|       1| 400.0|             Nexus|
|         1|   Microsoft Lumia|       1| 200.0|             Nexus|
|         5|          HP Elite|       2|1200.0|       Dell Vostro|
|         8|       Dell Vostro|       2| 800.0|       Dell Vostro|
|         6|   Lenovo Thinkpad|       2| 700.0|       Dell Vostro|
|         7|         Sony VAIO|       2| 700.0|       Dell Vostro|
|         9|              iPad|       3| 700.0|Samsung Galaxy Tab|
|        11|Samsung Galaxy Tab|       3| 200.0|Samsung Galaxy Tab|
|        10|       Kindle Fire|       3| 150.0|Samsung Galaxy Tab|
+----------+------------------+--------+------+---------------

In [149]:
windowSpec = Window.partitionBy('year').orderBy('amount')
perc_rank = F.percent_rank().over(windowSpec).alias('perc_rank_call')
df.select('name','amount',perc_rank).show()

+--------------+------+--------------+
|          name|amount|perc_rank_call|
+--------------+------+--------------+
|      Yin Yang| 30000|           0.0|
|      Jane Doe|110000|          0.25|
|      John Doe|120000|           0.5|
|   Jack Daniel|150000|          0.75|
|Stephane Heady|200000|           1.0|
|      Yin Yang| 25000|           0.0|
|      Jane Doe|130000|          0.25|
|      John Doe|150000|           0.5|
|   Jack Daniel|180000|          0.75|
|Stephane Heady|270000|           1.0|
+--------------+------+--------------+



# ROW_NUMBER

## without partition

In [152]:
windowSpec = Window.orderBy('product_id')
row_number = F.row_number().over(windowSpec).alias('row_number_call')
df1.select('product_id','product_name','group_id',row_number).show()

23/12/30 17:42:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 17:42:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 17:42:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------------+--------+---------------+
|product_id|      product_name|group_id|row_number_call|
+----------+------------------+--------+---------------+
|         1|   Microsoft Lumia|       1|              1|
|         2|           HTC One|       1|              2|
|         3|             Nexus|       1|              3|
|         4|            iPhone|       1|              4|
|         5|          HP Elite|       2|              5|
|         6|   Lenovo Thinkpad|       2|              6|
|         7|         Sony VAIO|       2|              7|
|         8|       Dell Vostro|       2|              8|
|         9|              iPad|       3|              9|
|        10|       Kindle Fire|       3|             10|
|        11|Samsung Galaxy Tab|       3|             11|
+----------+------------------+--------+---------------+



In [153]:
windowSpec = Window.orderBy('product_name')
row_number = F.row_number().over(windowSpec).alias('row_number_call')
df1.select('product_id','product_name','group_id',row_number).show()

23/12/30 17:44:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 17:44:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 17:44:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------------+--------+---------------+
|product_id|      product_name|group_id|row_number_call|
+----------+------------------+--------+---------------+
|         8|       Dell Vostro|       2|              1|
|         5|          HP Elite|       2|              2|
|         2|           HTC One|       1|              3|
|        10|       Kindle Fire|       3|              4|
|         6|   Lenovo Thinkpad|       2|              5|
|         1|   Microsoft Lumia|       1|              6|
|         3|             Nexus|       1|              7|
|        11|Samsung Galaxy Tab|       3|              8|
|         7|         Sony VAIO|       2|              9|
|         9|              iPad|       3|             10|
|         4|            iPhone|       1|             11|
+----------+------------------+--------+---------------+



## with partition

In [157]:
windowSpec = Window.partitionBy('group_id').orderBy('product_id')
row_number = F.row_number().over(windowSpec).alias('row_number_call')
df1.select('product_id','product_name','group_id',row_number).show()

+----------+------------------+--------+---------------+
|product_id|      product_name|group_id|row_number_call|
+----------+------------------+--------+---------------+
|         1|   Microsoft Lumia|       1|              1|
|         2|           HTC One|       1|              2|
|         3|             Nexus|       1|              3|
|         4|            iPhone|       1|              4|
|         5|          HP Elite|       2|              1|
|         6|   Lenovo Thinkpad|       2|              2|
|         7|         Sony VAIO|       2|              3|
|         8|       Dell Vostro|       2|              4|
|         9|              iPad|       3|              1|
|        10|       Kindle Fire|       3|              2|
|        11|Samsung Galaxy Tab|       3|              3|
+----------+------------------+--------+---------------+



## Row number and distinct

In [158]:
windowSpec = Window.orderBy('price')
row_number = F.row_number().over(windowSpec).alias('row_number_call')
df1.distinct().select('price',row_number).show()

23/12/30 18:35:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:35:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:35:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------+---------------+
| price|row_number_call|
+------+---------------+
| 150.0|              1|
| 200.0|              2|
| 200.0|              3|
| 400.0|              4|
| 500.0|              5|
| 700.0|              6|
| 700.0|              7|
| 700.0|              8|
| 800.0|              9|
| 900.0|             10|
|1200.0|             11|
+------+---------------+



23/12/30 18:35:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:35:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:35:50 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


## Removing duplicates

In [160]:
df_aux = df1.distinct().select('price')
row_number = F.row_number().over(windowSpec).alias('row_number_call')
df_aux.distinct().select('price',row_number).show()

23/12/30 18:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------+---------------+
| price|row_number_call|
+------+---------------+
| 150.0|              1|
| 200.0|              2|
| 400.0|              3|
| 500.0|              4|
| 700.0|              5|
| 800.0|              6|
| 900.0|              7|
|1200.0|              8|
+------+---------------+



23/12/30 18:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


# ROW_NUMBER as pagination

In [166]:
windowSpec = Window.orderBy('product_name')
row_number = F.row_number().over(windowSpec).alias('row_number')
df_sub = df1.select('product_id',
        			'product_name',
        			'price',
                     row_number)
df_final = df_sub.filter(F.col('row_number').between(6,10))
df_final.show()

23/12/30 18:50:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:50:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:50:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------------+-----+----------+
|product_id|      product_name|price|row_number|
+----------+------------------+-----+----------+
|         1|   Microsoft Lumia|200.0|         6|
|         3|             Nexus|500.0|         7|
|        11|Samsung Galaxy Tab|200.0|         8|
|         7|         Sony VAIO|700.0|         9|
|         9|              iPad|700.0|        10|
+----------+------------------+-----+----------+



23/12/30 18:50:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 18:50:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [193]:
windowSpec = Window.orderBy(F.col('price').desc())
row_number = F.row_number().over(windowSpec).alias('nth')
df_sub1 = df1.distinct().select('price')
df_sub2 = df_sub1.select('price',row_number).filter(F.col('nth') == 3)
df_sub3 = df_sub2.select('price')
df_sub4 = df1.filter(F.col('price') == F.lit(df_sub3.collect()[0][0]))
df_sub4.show()

23/12/30 19:02:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 19:02:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 19:02:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 19:02:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 19:02:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 19:02:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/30 1

+----------+------------+--------+-----+
|product_id|product_name|group_id|price|
+----------+------------+--------+-----+
|         8| Dell Vostro|       2|800.0|
+----------+------------+--------+-----+



200.0