In [45]:
from db_tools.setup import setup
from pyspark.sql import functions as F
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,FloatType,DecimalType
import pandas as pd

In [4]:
spark = setup()

23/12/27 19:17:51 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/12/27 19:17:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/12/27 19:17:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SparkSession available as "spark"


In [5]:
def show_query(query,SparkSession = spark,n = 20):
    return SparkSession.sql(query).show()

## Creating the table example

In [6]:
product_group_schema = StructType([ \
    StructField("group_id",IntegerType(),True), \
    StructField("group_name",StringType(),True)
  ])

product_schema = StructType([ \
    StructField("product_id",IntegerType(),True), \
    StructField("product_name",StringType(),True),\
    StructField("group_id",IntegerType(),True),\
    StructField("price",FloatType(),True), \
  ])

product_groups =   [(1,'Smartphone'),
                    (2,'Laptop'),
                    (3,'Tablet')]
products = [(1,'Microsoft Lumia', 1, 200.00),
            (2,'HTC One', 1, 400.00),
        	(3,'Nexus', 1, 500.00),
        	(4,'iPhone', 1, 900.00),
        	(5,'HP Elite', 2, 1200.00),
        	(6,'Lenovo Thinkpad', 2, 700.00),
        	(7,'Sony VAIO', 2, 700.00),
        	(8,'Dell Vostro', 2, 800.00),
        	(9,'iPad', 3, 700.00),
        	(10,'Kindle Fire', 3, 150.00),
        	(11,'Samsung Galaxy Tab', 3, 200.00)]
rdd = spark.sparkContext.parallelize(product_groups)
spark.createDataFrame(rdd, schema = product_group_schema).createOrReplaceTempView('product_groups')
rdd = spark.sparkContext.parallelize(products)
spark.createDataFrame(rdd, schema = product_schema).createOrReplaceTempView('products')

## Aggregate functions review

In [7]:
query = """SELECT
	AVG (price)
FROM
	products;
"""
show_query(query)



+-----------------+
|       avg(price)|
+-----------------+
|586.3636363636364|
+-----------------+



                                                                                

In [8]:
query = """
SELECT
	group_name,
	AVG (price)
FROM
	products
INNER JOIN product_groups USING (group_id)
GROUP BY
	group_name;"""
show_query(query)



+----------+----------+
|group_name|avg(price)|
+----------+----------+
|    Laptop|     850.0|
|    Tablet|     350.0|
|Smartphone|     500.0|
+----------+----------+



                                                                                

## Basic window example: mean

In [9]:
query = """
SELECT
	product_name,
	price,
	group_name,
	AVG (price) OVER (
	   PARTITION BY group_name
	) AS avg_price
FROM
	products
	INNER JOIN 
		product_groups USING (group_id);
"""
show_query(query)



+------------------+------+----------+---------+
|      product_name| price|group_name|avg_price|
+------------------+------+----------+---------+
|          HP Elite|1200.0|    Laptop|    850.0|
|   Lenovo Thinkpad| 700.0|    Laptop|    850.0|
|         Sony VAIO| 700.0|    Laptop|    850.0|
|       Dell Vostro| 800.0|    Laptop|    850.0|
|   Microsoft Lumia| 200.0|Smartphone|    500.0|
|           HTC One| 400.0|Smartphone|    500.0|
|             Nexus| 500.0|Smartphone|    500.0|
|            iPhone| 900.0|Smartphone|    500.0|
|              iPad| 700.0|    Tablet|    350.0|
|       Kindle Fire| 150.0|    Tablet|    350.0|
|Samsung Galaxy Tab| 200.0|    Tablet|    350.0|
+------------------+------+----------+---------+



                                                                                

## Window functions: ROW_NUMBER

In [10]:
query = """
SELECT
	product_name,
	group_name,
	price,
	ROW_NUMBER () OVER (
		PARTITION BY group_name
		ORDER BY
			price
	)
FROM
	products
INNER JOIN product_groups USING (group_id);
"""
show_query(query)

+------------------+----------+------+---------------------------------------------------------------------------------------------------------------------------+
|      product_name|group_name| price|row_number() OVER (PARTITION BY group_name ORDER BY price ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)|
+------------------+----------+------+---------------------------------------------------------------------------------------------------------------------------+
|   Lenovo Thinkpad|    Laptop| 700.0|                                                                                                                          1|
|         Sony VAIO|    Laptop| 700.0|                                                                                                                          2|
|       Dell Vostro|    Laptop| 800.0|                                                                                                                          3|
|          HP Elite|  

## Window functions: RANK

In [11]:
query = '''
SELECT
	product_name,
	group_name,
  price,
	RANK () OVER (
		PARTITION BY group_name
		ORDER BY
			price
	) AS price_rank
FROM
	products
INNER JOIN product_groups USING (group_id)
ORDER BY price_rank DESC;
'''
show_query(query)


+------------------+----------+------+----------+
|      product_name|group_name| price|price_rank|
+------------------+----------+------+----------+
|          HP Elite|    Laptop|1200.0|         4|
|            iPhone|Smartphone| 900.0|         4|
|       Dell Vostro|    Laptop| 800.0|         3|
|             Nexus|Smartphone| 500.0|         3|
|              iPad|    Tablet| 700.0|         3|
|           HTC One|Smartphone| 400.0|         2|
|Samsung Galaxy Tab|    Tablet| 200.0|         2|
|   Lenovo Thinkpad|    Laptop| 700.0|         1|
|         Sony VAIO|    Laptop| 700.0|         1|
|   Microsoft Lumia|Smartphone| 200.0|         1|
|       Kindle Fire|    Tablet| 150.0|         1|
+------------------+----------+------+----------+



## Window functions:DENSE_RANK

In [12]:
query = '''
SELECT
	product_name,
	group_name,
  price,
	DENSE_RANK () OVER (
		PARTITION BY group_name
		ORDER BY
			price
	) AS price_rank
FROM
	products
INNER JOIN product_groups USING (group_id)
'''
show_query(query)


+------------------+----------+------+----------+
|      product_name|group_name| price|price_rank|
+------------------+----------+------+----------+
|   Lenovo Thinkpad|    Laptop| 700.0|         1|
|         Sony VAIO|    Laptop| 700.0|         1|
|       Dell Vostro|    Laptop| 800.0|         2|
|          HP Elite|    Laptop|1200.0|         3|
|   Microsoft Lumia|Smartphone| 200.0|         1|
|           HTC One|Smartphone| 400.0|         2|
|             Nexus|Smartphone| 500.0|         3|
|            iPhone|Smartphone| 900.0|         4|
|       Kindle Fire|    Tablet| 150.0|         1|
|Samsung Galaxy Tab|    Tablet| 200.0|         2|
|              iPad|    Tablet| 700.0|         3|
+------------------+----------+------+----------+



## Window functions: FIRST_VALUE

In [13]:
query = '''
SELECT
	product_name,
	group_name,
	price,
	FIRST_VALUE (price) OVER (
		PARTITION BY group_name
		ORDER BY
			price
	) AS lowest_price_per_group
FROM
	products
INNER JOIN product_groups USING (group_id);
'''
show_query(query)

+------------------+----------+------+----------------------+
|      product_name|group_name| price|lowest_price_per_group|
+------------------+----------+------+----------------------+
|   Lenovo Thinkpad|    Laptop| 700.0|                 700.0|
|         Sony VAIO|    Laptop| 700.0|                 700.0|
|       Dell Vostro|    Laptop| 800.0|                 700.0|
|          HP Elite|    Laptop|1200.0|                 700.0|
|   Microsoft Lumia|Smartphone| 200.0|                 200.0|
|           HTC One|Smartphone| 400.0|                 200.0|
|             Nexus|Smartphone| 500.0|                 200.0|
|            iPhone|Smartphone| 900.0|                 200.0|
|       Kindle Fire|    Tablet| 150.0|                 150.0|
|Samsung Galaxy Tab|    Tablet| 200.0|                 150.0|
|              iPad|    Tablet| 700.0|                 150.0|
+------------------+----------+------+----------------------+



## Window functions: LAST_VALUE

In [14]:
query = '''
SELECT
	product_name,
	group_name,
	price,
	LAST_VALUE (price) OVER (
		PARTITION BY group_name
		ORDER BY
			price
	) AS lowest_price_per_group
FROM
	products
INNER JOIN product_groups USING (group_id);
'''
show_query(query)

+------------------+----------+------+----------------------+
|      product_name|group_name| price|lowest_price_per_group|
+------------------+----------+------+----------------------+
|   Lenovo Thinkpad|    Laptop| 700.0|                 700.0|
|         Sony VAIO|    Laptop| 700.0|                 700.0|
|       Dell Vostro|    Laptop| 800.0|                 800.0|
|          HP Elite|    Laptop|1200.0|                1200.0|
|   Microsoft Lumia|Smartphone| 200.0|                 200.0|
|           HTC One|Smartphone| 400.0|                 400.0|
|             Nexus|Smartphone| 500.0|                 500.0|
|            iPhone|Smartphone| 900.0|                 900.0|
|       Kindle Fire|    Tablet| 150.0|                 150.0|
|Samsung Galaxy Tab|    Tablet| 200.0|                 200.0|
|              iPad|    Tablet| 700.0|                 700.0|
+------------------+----------+------+----------------------+



## Window functions: LAG

In [15]:
query = '''
SELECT
	product_name,
	group_name,
	price,
	LAG (price, 1) OVER (
		PARTITION BY group_name
		ORDER BY
			price
	) AS prev_price,
	price - LAG (price, 1) OVER (
		PARTITION BY group_name
		ORDER BY
			price
	) AS cur_prev_diff
FROM
	products
INNER JOIN product_groups USING (group_id);
'''
show_query(query)

+------------------+----------+------+----------+-------------+
|      product_name|group_name| price|prev_price|cur_prev_diff|
+------------------+----------+------+----------+-------------+
|   Lenovo Thinkpad|    Laptop| 700.0|      null|         null|
|         Sony VAIO|    Laptop| 700.0|     700.0|          0.0|
|       Dell Vostro|    Laptop| 800.0|     700.0|        100.0|
|          HP Elite|    Laptop|1200.0|     800.0|        400.0|
|   Microsoft Lumia|Smartphone| 200.0|      null|         null|
|           HTC One|Smartphone| 400.0|     200.0|        200.0|
|             Nexus|Smartphone| 500.0|     400.0|        100.0|
|            iPhone|Smartphone| 900.0|     500.0|        400.0|
|       Kindle Fire|    Tablet| 150.0|      null|         null|
|Samsung Galaxy Tab|    Tablet| 200.0|     150.0|         50.0|
|              iPad|    Tablet| 700.0|     200.0|        500.0|
+------------------+----------+------+----------+-------------+



## Window functions: LAG + Filling NULLs with COALESCE

In [16]:
query = '''
SELECT
	product_name,
	group_name,
	price,
	COALESCE( LAG (price, 1) OVER (
		PARTITION BY group_name
		ORDER BY
			price
	),0) AS prev_price,
	price - COALESCE(
                     (LAG (price, 1) OVER (
                     PARTITION BY group_name
            		ORDER BY price)),0
	)  AS cur_prev_diff
FROM
	products
INNER JOIN product_groups USING (group_id);
'''
show_query(query)

+------------------+----------+------+----------+-------------+
|      product_name|group_name| price|prev_price|cur_prev_diff|
+------------------+----------+------+----------+-------------+
|   Lenovo Thinkpad|    Laptop| 700.0|       0.0|        700.0|
|         Sony VAIO|    Laptop| 700.0|     700.0|          0.0|
|       Dell Vostro|    Laptop| 800.0|     700.0|        100.0|
|          HP Elite|    Laptop|1200.0|     800.0|        400.0|
|   Microsoft Lumia|Smartphone| 200.0|       0.0|        200.0|
|           HTC One|Smartphone| 400.0|     200.0|        200.0|
|             Nexus|Smartphone| 500.0|     400.0|        100.0|
|            iPhone|Smartphone| 900.0|     500.0|        400.0|
|       Kindle Fire|    Tablet| 150.0|       0.0|        150.0|
|Samsung Galaxy Tab|    Tablet| 200.0|     150.0|         50.0|
|              iPad|    Tablet| 700.0|     200.0|        500.0|
+------------------+----------+------+----------+-------------+



                                                                                

## Window functions: LEAD

In [17]:
query = '''
SELECT
	product_name,
	group_name,
	price,
	LAG (price, 1) OVER (
		PARTITION BY group_name
		ORDER BY
			price
	) AS prev_price,
	price - LEAD (price, 1) OVER (
		PARTITION BY group_name
		ORDER BY
			price
	) AS cur_prev_diff
FROM
	products
INNER JOIN product_groups USING (group_id);
'''
show_query(query)

+------------------+----------+------+----------+-------------+
|      product_name|group_name| price|prev_price|cur_prev_diff|
+------------------+----------+------+----------+-------------+
|   Lenovo Thinkpad|    Laptop| 700.0|      null|          0.0|
|         Sony VAIO|    Laptop| 700.0|     700.0|       -100.0|
|       Dell Vostro|    Laptop| 800.0|     700.0|       -400.0|
|          HP Elite|    Laptop|1200.0|     800.0|         null|
|   Microsoft Lumia|Smartphone| 200.0|      null|       -200.0|
|           HTC One|Smartphone| 400.0|     200.0|       -100.0|
|             Nexus|Smartphone| 500.0|     400.0|       -400.0|
|            iPhone|Smartphone| 900.0|     500.0|         null|
|       Kindle Fire|    Tablet| 150.0|      null|        -50.0|
|Samsung Galaxy Tab|    Tablet| 200.0|     150.0|       -500.0|
|              iPad|    Tablet| 700.0|     200.0|         null|
+------------------+----------+------+----------+-------------+



## Window functions: LEAD + Filling NULLs with COALESCE

In [18]:
query = '''
SELECT
	product_name,
	group_name,
	price,
	COALESCE( LEAD (price, 1) OVER (
		PARTITION BY group_name
		ORDER BY
			price
	),0) AS prev_price,
	price - COALESCE(
                     (LEAD (price, 1) OVER (
                     PARTITION BY group_name
            		ORDER BY price)),0
	)  AS cur_prev_diff
FROM
	products
INNER JOIN product_groups USING (group_id);
'''
show_query(query)

+------------------+----------+------+----------+-------------+
|      product_name|group_name| price|prev_price|cur_prev_diff|
+------------------+----------+------+----------+-------------+
|   Lenovo Thinkpad|    Laptop| 700.0|     700.0|          0.0|
|         Sony VAIO|    Laptop| 700.0|     800.0|       -100.0|
|       Dell Vostro|    Laptop| 800.0|    1200.0|       -400.0|
|          HP Elite|    Laptop|1200.0|       0.0|       1200.0|
|   Microsoft Lumia|Smartphone| 200.0|     400.0|       -200.0|
|           HTC One|Smartphone| 400.0|     500.0|       -100.0|
|             Nexus|Smartphone| 500.0|     900.0|       -400.0|
|            iPhone|Smartphone| 900.0|       0.0|        900.0|
|       Kindle Fire|    Tablet| 150.0|     200.0|        -50.0|
|Samsung Galaxy Tab|    Tablet| 200.0|     700.0|       -500.0|
|              iPad|    Tablet| 700.0|       0.0|        700.0|
+------------------+----------+------+----------+-------------+



# CUME_DIST: works basically like an ECDF

## Creating the example

In [24]:
sale_stats_schema = StructType([ \
    StructField("name",StringType(),True), \
    StructField("year",IntegerType(),True),\
    StructField("amount",IntegerType(),True),\
  ])

sale_stats_data =   [('John Doe',2018,120000),
                    ('Jane Doe',2018,110000),
                    ('Jack Daniel',2018,150000),
                    ('Yin Yang',2018,30000),
                    ('Stephane Heady',2018,200000),
                    ('John Doe',2019,150000),
                    ('Jane Doe',2019,130000),
                    ('Jack Daniel',2019,180000),
                    ('Yin Yang',2019,25000),
                    ('Stephane Heady',2019,270000)]

rdd = spark.sparkContext.parallelize(sale_stats_data)
spark.createDataFrame(rdd, schema = sale_stats_schema).createOrReplaceTempView('sales_stats')

## Without partition

In [26]:
query = """SELECT 
    name,
    year, 
    amount,
    CUME_DIST() OVER (
        ORDER BY amount
    ) AS ECDF
FROM 
    sales_stats
WHERE 
    year = 2018;
"""
show_query(query)

23/12/27 19:23:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 19:23:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 19:23:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------------+----+------+----+
|          name|year|amount|ECDF|
+--------------+----+------+----+
|      Yin Yang|2018| 30000| 0.2|
|      Jane Doe|2018|110000| 0.4|
|      John Doe|2018|120000| 0.6|
|   Jack Daniel|2018|150000| 0.8|
|Stephane Heady|2018|200000| 1.0|
+--------------+----+------+----+



23/12/27 19:23:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 19:23:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


## With partition

In [31]:
query = """ SELECT 
    name,
	year,
	amount,
    CUME_DIST() OVER (
		PARTITION BY year
        ORDER BY amount
    ) AS ECDF
FROM 
    sales_stats
"""
show_query(query)

+--------------+----+------+----+
|          name|year|amount|ECDF|
+--------------+----+------+----+
|      Yin Yang|2018| 30000| 0.2|
|      Jane Doe|2018|110000| 0.4|
|      John Doe|2018|120000| 0.6|
|   Jack Daniel|2018|150000| 0.8|
|Stephane Heady|2018|200000| 1.0|
|      Yin Yang|2019| 25000| 0.2|
|      Jane Doe|2019|130000| 0.4|
|      John Doe|2019|150000| 0.6|
|   Jack Daniel|2019|180000| 0.8|
|Stephane Heady|2019|270000| 1.0|
+--------------+----+------+----+



## NTILE: works like qcut

In [33]:
query = '''
SELECT 
	name,
	amount,
	NTILE(3) OVER(
		ORDER BY amount
	) n_tile
FROM
	sales_stats
WHERE
	year = 2019
 '''
show_query(query)

23/12/27 19:31:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 19:31:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 19:31:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------------+------+------+
|          name|amount|n_tile|
+--------------+------+------+
|      Yin Yang| 25000|     1|
|      Jane Doe|130000|     1|
|      John Doe|150000|     2|
|   Jack Daniel|180000|     2|
|Stephane Heady|270000|     3|
+--------------+------+------+



23/12/27 19:31:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 19:31:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


# Pandas Comparison

In [42]:
df = spark.sql('SELECT name,amount FROM sales_stats WHERE year = 2019').toPandas()

In [54]:
df['ntile'] = pd.qcut(df['amount'],q = 3,labels = range(1,4))
df.sort_values(by = ['amount','ntile'])

Unnamed: 0,name,amount,ntile
3,Yin Yang,25000,1
1,Jane Doe,130000,1
0,John Doe,150000,2
2,Jack Daniel,180000,3
4,Stephane Heady,270000,3


# NTH_VALUE

## Without partition (BAD PRACTICE!)

In [56]:
query = """
SELECT 
    product_id,
    product_name,
    price,
    NTH_VALUE(product_name, 2) 
    OVER(
        ORDER BY price DESC
        RANGE BETWEEN 
            UNBOUNDED PRECEDING AND 
            UNBOUNDED FOLLOWING
    ) AS nth_value_call
FROM 
    products;
"""
show_query(query)

23/12/27 19:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 19:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 19:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------------+------+--------------+
|product_id|      product_name| price|nth_value_call|
+----------+------------------+------+--------------+
|         5|          HP Elite|1200.0|        iPhone|
|         4|            iPhone| 900.0|        iPhone|
|         8|       Dell Vostro| 800.0|        iPhone|
|         6|   Lenovo Thinkpad| 700.0|        iPhone|
|         7|         Sony VAIO| 700.0|        iPhone|
|         9|              iPad| 700.0|        iPhone|
|         3|             Nexus| 500.0|        iPhone|
|         2|           HTC One| 400.0|        iPhone|
|         1|   Microsoft Lumia| 200.0|        iPhone|
|        11|Samsung Galaxy Tab| 200.0|        iPhone|
|        10|       Kindle Fire| 150.0|        iPhone|
+----------+------------------+------+--------------+



23/12/27 19:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 19:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


## With partition

In [60]:
query = """
SELECT 
    product_id,
    product_name,
    price,
    group_id,
    NTH_VALUE(product_name, 2) 
    OVER(
        PARTITION BY group_id
        ORDER BY price DESC
        RANGE BETWEEN 
            UNBOUNDED PRECEDING AND 
            UNBOUNDED FOLLOWING
    ) AS nth_value_call
FROM 
    products;
"""
show_query(query)

+----------+------------------+------+--------+------------------+
|product_id|      product_name| price|group_id|    nth_value_call|
+----------+------------------+------+--------+------------------+
|         4|            iPhone| 900.0|       1|             Nexus|
|         3|             Nexus| 500.0|       1|             Nexus|
|         2|           HTC One| 400.0|       1|             Nexus|
|         1|   Microsoft Lumia| 200.0|       1|             Nexus|
|         5|          HP Elite|1200.0|       2|       Dell Vostro|
|         8|       Dell Vostro| 800.0|       2|       Dell Vostro|
|         6|   Lenovo Thinkpad| 700.0|       2|       Dell Vostro|
|         7|         Sony VAIO| 700.0|       2|       Dell Vostro|
|         9|              iPad| 700.0|       3|Samsung Galaxy Tab|
|        11|Samsung Galaxy Tab| 200.0|       3|Samsung Galaxy Tab|
|        10|       Kindle Fire| 150.0|       3|Samsung Galaxy Tab|
+----------+------------------+------+--------+---------------

## If NTH_VALUE parameter greater than the number of occurencies, null appears

In [61]:
query = """
SELECT 
    product_id,
    product_name,
    price,
    group_id,
    NTH_VALUE(product_name, 4) 
    OVER(
        PARTITION BY group_id
        ORDER BY price DESC
        RANGE BETWEEN 
            UNBOUNDED PRECEDING AND 
            UNBOUNDED FOLLOWING
    ) AS nth_value_call
FROM 
    products;
"""
show_query(query)

+----------+------------------+------+--------+---------------+
|product_id|      product_name| price|group_id| nth_value_call|
+----------+------------------+------+--------+---------------+
|         4|            iPhone| 900.0|       1|Microsoft Lumia|
|         3|             Nexus| 500.0|       1|Microsoft Lumia|
|         2|           HTC One| 400.0|       1|Microsoft Lumia|
|         1|   Microsoft Lumia| 200.0|       1|Microsoft Lumia|
|         5|          HP Elite|1200.0|       2|      Sony VAIO|
|         8|       Dell Vostro| 800.0|       2|      Sony VAIO|
|         6|   Lenovo Thinkpad| 700.0|       2|      Sony VAIO|
|         7|         Sony VAIO| 700.0|       2|      Sony VAIO|
|         9|              iPad| 700.0|       3|           null|
|        11|Samsung Galaxy Tab| 200.0|       3|           null|
|        10|       Kindle Fire| 150.0|       3|           null|
+----------+------------------+------+--------+---------------+



# PERCENT_RANK

In [63]:
query = """
SELECT 
    name,
	amount,
    PERCENT_RANK() OVER (
		PARTITION BY year
        ORDER BY amount
    ) AS percent_rank_call
FROM 
    sales_stats;"""
show_query(query)

+--------------+------+-----------------+
|          name|amount|percent_rank_call|
+--------------+------+-----------------+
|      Yin Yang| 30000|              0.0|
|      Jane Doe|110000|             0.25|
|      John Doe|120000|              0.5|
|   Jack Daniel|150000|             0.75|
|Stephane Heady|200000|              1.0|
|      Yin Yang| 25000|              0.0|
|      Jane Doe|130000|             0.25|
|      John Doe|150000|              0.5|
|   Jack Daniel|180000|             0.75|
|Stephane Heady|270000|              1.0|
+--------------+------+-----------------+



# ROW NUMBER

## Basic Usages

In [66]:
query = """
SELECT
	product_id,
	product_name,
	group_id,
	ROW_NUMBER () OVER (ORDER BY product_id) AS row_number_call
FROM
	products;
"""
show_query(query)

23/12/27 20:05:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:05:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:05:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------------+--------+---------------+
|product_id|      product_name|group_id|row_number_call|
+----------+------------------+--------+---------------+
|         1|   Microsoft Lumia|       1|              1|
|         2|           HTC One|       1|              2|
|         3|             Nexus|       1|              3|
|         4|            iPhone|       1|              4|
|         5|          HP Elite|       2|              5|
|         6|   Lenovo Thinkpad|       2|              6|
|         7|         Sony VAIO|       2|              7|
|         8|       Dell Vostro|       2|              8|
|         9|              iPad|       3|              9|
|        10|       Kindle Fire|       3|             10|
|        11|Samsung Galaxy Tab|       3|             11|
+----------+------------------+--------+---------------+



In [69]:
query = """
SELECT
	product_id,
	product_name,
	group_id,
	ROW_NUMBER () OVER (
           ORDER BY product_name
        ) AS row_number_call
FROM
	products;
"""
show_query(query)

23/12/27 20:06:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:06:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:06:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------------+--------+---------------+
|product_id|      product_name|group_id|row_number_call|
+----------+------------------+--------+---------------+
|         8|       Dell Vostro|       2|              1|
|         5|          HP Elite|       2|              2|
|         2|           HTC One|       1|              3|
|        10|       Kindle Fire|       3|              4|
|         6|   Lenovo Thinkpad|       2|              5|
|         1|   Microsoft Lumia|       1|              6|
|         3|             Nexus|       1|              7|
|        11|Samsung Galaxy Tab|       3|              8|
|         7|         Sony VAIO|       2|              9|
|         9|              iPad|       3|             10|
|         4|            iPhone|       1|             11|
+----------+------------------+--------+---------------+



## With partition (Good Practice)

In [71]:
query = """
SELECT
	product_id,
	product_name,
	group_id,
	ROW_NUMBER () OVER (
		PARTITION BY group_id
		ORDER BY
			product_name
	) AS row_number_call
FROM
	products;
"""
show_query(query)

+----------+------------------+--------+---------------+
|product_id|      product_name|group_id|row_number_call|
+----------+------------------+--------+---------------+
|         2|           HTC One|       1|              1|
|         1|   Microsoft Lumia|       1|              2|
|         3|             Nexus|       1|              3|
|         4|            iPhone|       1|              4|
|         8|       Dell Vostro|       2|              1|
|         5|          HP Elite|       2|              2|
|         6|   Lenovo Thinkpad|       2|              3|
|         7|         Sony VAIO|       2|              4|
|        10|       Kindle Fire|       3|              1|
|        11|Samsung Galaxy Tab|       3|              2|
|         9|              iPad|       3|              3|
+----------+------------------+--------+---------------+



## ROW_NUMBER and DISTINCT

In [73]:
query = """SELECT DISTINCT
	price,
	ROW_NUMBER () OVER (ORDER BY price) AS row_number_call
FROM
	products
ORDER BY
	price;"""
show_query(query)

23/12/27 20:08:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:08:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------+---------------+
| price|row_number_call|
+------+---------------+
| 150.0|              1|
| 200.0|              2|
| 200.0|              3|
| 400.0|              4|
| 500.0|              5|
| 700.0|              6|
| 700.0|              7|
| 700.0|              8|
| 800.0|              9|
| 900.0|             10|
|1200.0|             11|
+------+---------------+



23/12/27 20:08:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:08:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


## Removing duplicate price with a CTE

In [75]:
query = """WITH prices AS (
	SELECT DISTINCT
		price
	FROM
		products
) SELECT
	price,
	ROW_NUMBER () OVER (ORDER BY price) AS row_number_call
FROM
	prices;"""
show_query(query)

23/12/27 20:09:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:09:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:09:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------+---------------+
| price|row_number_call|
+------+---------------+
| 150.0|              1|
| 200.0|              2|
| 400.0|              3|
| 500.0|              4|
| 700.0|              5|
| 800.0|              6|
| 900.0|              7|
|1200.0|              8|
+------+---------------+



23/12/27 20:09:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:09:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:09:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


## Subquery alternative

In [77]:
query = """
SELECT
	price,
	ROW_NUMBER () OVER (ORDER BY price) AS row_number_call
FROM
	(
		SELECT DISTINCT
			price
		FROM
			products
	) prices;
"""
show_query(query)

23/12/27 20:10:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:10:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:10:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------+---------------+
| price|row_number_call|
+------+---------------+
| 150.0|              1|
| 200.0|              2|
| 400.0|              3|
| 500.0|              4|
| 700.0|              5|
| 800.0|              6|
| 900.0|              7|
|1200.0|              8|
+------+---------------+



23/12/27 20:10:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:10:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:10:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


# ROW_NUMBER as a Pagination

In [84]:
query = """
SELECT
	*
FROM
	(
		SELECT
			product_id,
			product_name,
			price,
			ROW_NUMBER () OVER (ORDER BY product_name) AS ROW_NUMBER
		FROM
			products
	)
 WHERE
	ROW_NUMBER BETWEEN 6 AND 10;;"""
show_query(query)

23/12/27 20:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:13:38 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------------+-----+----------+
|product_id|      product_name|price|ROW_NUMBER|
+----------+------------------+-----+----------+
|         1|   Microsoft Lumia|200.0|         6|
|         3|             Nexus|500.0|         7|
|        11|Samsung Galaxy Tab|200.0|         8|
|         7|         Sony VAIO|700.0|         9|
|         9|              iPad|700.0|        10|
+----------+------------------+-----+----------+



23/12/27 20:13:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:13:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [85]:
query = """
SELECT
	*
FROM
	products
WHERE
	price = (
		SELECT
			price
		FROM
			(
				SELECT
					price,
					ROW_NUMBER () OVER (
						ORDER BY price DESC
					) nth
				FROM
					(
						SELECT DISTINCT
							(price)
						FROM
							products
					) prices
			) sorted_prices
		WHERE
			nth = 3
	);
"""
show_query(query)

23/12/27 20:14:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:14:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:14:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:14:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:14:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/27 20:14:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------------+--------+-----+
|product_id|product_name|group_id|price|
+----------+------------+--------+-----+
|         8| Dell Vostro|       2|800.0|
+----------+------------+--------+-----+



#  Exercise: redo all the above using pyspark when appliable