In [1]:
from db_tools.setup import setup

In [5]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [3]:
spark = setup()

23/12/20 18:38:01 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/12/20 18:38:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/12/20 18:38:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SparkSession available as "spark"


In [4]:
def show_query(query,SparkSession = spark,n = 20):
    return SparkSession.sql(query).show()

# GROUPING SETS

## Creating the example table

In [10]:
brand_schema = StructType([ \
    StructField("brand",StringType(),True), \
    StructField("segment",StringType(),True), \
    StructField("quantity",StringType(),True)
  ])
brands =   [('ABC','Premium',100),
            ('ABC','Basic',200),
            ('XYZ','Premium',100),
            ('XYZ','Basic',300)]
columns = ['brand','segment','quantity']
rdd = spark.sparkContext.parallelize(brands)
spark.createDataFrame(rdd, schema = brand_schema).createOrReplaceTempView('sales')

## GROUP BY recap

In [11]:
query = '''SELECT
    brand,
    segment,
    SUM (quantity)
FROM
    sales
GROUP BY
    brand,
    segment;'''
show_query(query)

+-----+-------+-------------+
|brand|segment|sum(quantity)|
+-----+-------+-------------+
|  ABC|Premium|        100.0|
|  ABC|  Basic|        200.0|
|  XYZ|Premium|        100.0|
|  XYZ|  Basic|        300.0|
+-----+-------+-------------+



In [12]:
query = '''SELECT
    brand,
    SUM (quantity)
FROM
    sales
GROUP BY
    brand;'''
show_query(query)

+-----+-------------+
|brand|sum(quantity)|
+-----+-------------+
|  ABC|        300.0|
|  XYZ|        400.0|
+-----+-------------+



## Getting the totals

In [13]:
show_query('SELECT SUM (quantity) FROM sales;')

+-------------+
|sum(quantity)|
+-------------+
|        700.0|
+-------------+



## Grouping Sets motivation

In [14]:
query = """SELECT
    brand,
    segment,
    SUM (quantity)
FROM
    sales
GROUP BY
    brand,
    segment

UNION ALL

SELECT
    brand,
    NULL,
    SUM (quantity)
FROM
    sales
GROUP BY
    brand

UNION ALL

SELECT
    NULL,
    segment,
    SUM (quantity)
FROM
    sales
GROUP BY
    segment

UNION ALL

SELECT
    NULL,
    NULL,
    SUM (quantity)
FROM
    sales;"""
show_query(query)

+-----+-------+-------------+
|brand|segment|sum(quantity)|
+-----+-------+-------------+
|  ABC|Premium|        100.0|
|  ABC|  Basic|        200.0|
|  XYZ|Premium|        100.0|
|  XYZ|  Basic|        300.0|
|  ABC|   null|        300.0|
|  XYZ|   null|        400.0|
| null|Premium|        200.0|
| null|  Basic|        500.0|
| null|   null|        700.0|
+-----+-------+-------------+



## Grouping Sets syntax

```
SELECT
    c1,
    c2,
    aggregate_function(c3)
FROM
    table_name
GROUP BY
    GROUPING SETS (
        (c1, c2),
        (c1),
        (c2),
        ()
);
```

## Grouping Sets example

In [18]:
query = """
SELECT
    brand,
    segment,
    SUM (quantity)
FROM
    sales
GROUP BY
    GROUPING SETS (
        (brand, segment),
        (brand),
        (segment),
        ()
    );
"""
show_query(query)

+-----+-------+-------------+
|brand|segment|sum(quantity)|
+-----+-------+-------------+
| null|Premium|        200.0|
|  ABC|Premium|        100.0|
| null|   null|        700.0|
|  ABC|   null|        300.0|
| null|  Basic|        500.0|
|  ABC|  Basic|        200.0|
|  XYZ|Premium|        100.0|
|  XYZ|   null|        400.0|
|  XYZ|  Basic|        300.0|
+-----+-------+-------------+



## Grouping function

In [19]:
query = """
SELECT
	GROUPING(brand) grouping_brand,
	GROUPING(segment) grouping_segment,
	brand,
	segment,
	SUM (quantity)
FROM
	sales
GROUP BY
	GROUPING SETS (
		(brand),
		(segment),
		()
	)
ORDER BY
	brand,
	segment;

"""

show_query(query)

+--------------+----------------+-----+-------+-------------+
|grouping_brand|grouping_segment|brand|segment|sum(quantity)|
+--------------+----------------+-----+-------+-------------+
|             1|               1| null|   null|        700.0|
|             1|               0| null|  Basic|        500.0|
|             1|               0| null|Premium|        200.0|
|             0|               1|  ABC|   null|        300.0|
|             0|               1|  XYZ|   null|        400.0|
+--------------+----------------+-----+-------+-------------+



## Finding subtotals: HAVING GROUPING

In [20]:
query = """
SELECT
	GROUPING(brand) grouping_brand,
	GROUPING(segment) grouping_segment,
	brand,
	segment,
	SUM (quantity)
FROM
	sales
GROUP BY
	GROUPING SETS (
		(brand),
		(segment),
		()
	)
HAVING GROUPING(brand) = 0	
ORDER BY
	brand,
	segment;
"""

show_query(query)

+--------------+----------------+-----+-------+-------------+
|grouping_brand|grouping_segment|brand|segment|sum(quantity)|
+--------------+----------------+-----+-------+-------------+
|             0|               1|  ABC|   null|        300.0|
|             0|               1|  XYZ|   null|        400.0|
+--------------+----------------+-----+-------+-------------+



# ROLLUP

## Syntax A

```
SELECT
    c1,
    c2,
    c3,
    aggregate(c4)
FROM
    table_name
GROUP BY
    ROLLUP (c1, c2, c3);
```

## Syntax B

```
SELECT
    c1,
    c2,
    c3,
    aggregate(c4)
FROM
    table_name
GROUP BY
    c1, 
    ROLLUP (c2, c3);
```

## Examples

In [23]:
query = '''
SELECT
    brand,
    segment,
    SUM (quantity)
FROM
    sales
GROUP BY
    ROLLUP (brand, segment)
ORDER BY
    brand,
    segment;
'''
show_query(query)

+-----+-------+-------------+
|brand|segment|sum(quantity)|
+-----+-------+-------------+
| null|   null|        700.0|
|  ABC|   null|        300.0|
|  ABC|  Basic|        200.0|
|  ABC|Premium|        100.0|
|  XYZ|   null|        400.0|
|  XYZ|  Basic|        300.0|
|  XYZ|Premium|        100.0|
+-----+-------+-------------+



In [24]:
query = '''
SELECT
    segment,
    brand,
    SUM (quantity)
FROM
    sales
GROUP BY
    ROLLUP (segment, brand)
ORDER BY
    segment,
    brand;

'''
show_query(query)

+-------+-----+-------------+
|segment|brand|sum(quantity)|
+-------+-----+-------------+
|   null| null|        700.0|
|  Basic| null|        500.0|
|  Basic|  ABC|        200.0|
|  Basic|  XYZ|        300.0|
|Premium| null|        200.0|
|Premium|  ABC|        100.0|
|Premium|  XYZ|        100.0|
+-------+-----+-------------+



## Partial Rollup

In [25]:
query = '''
SELECT
    segment,
    brand,
    SUM (quantity)
FROM
    sales
GROUP BY
    segment,
    ROLLUP (brand)
ORDER BY
    segment,
    brand;

'''
show_query(query)

+-------+-----+-------------+
|segment|brand|sum(quantity)|
+-------+-----+-------------+
|  Basic| null|        500.0|
|  Basic|  ABC|        200.0|
|  Basic|  XYZ|        300.0|
|Premium| null|        200.0|
|Premium|  ABC|        100.0|
|Premium|  XYZ|        100.0|
+-------+-----+-------------+



# Example with dvdrental database

In [30]:
query = '''
SELECT
    EXTRACT (YEAR FROM rental_date) AS y,
    EXTRACT (MONTH FROM rental_date) AS M,
    EXTRACT (DAY FROM rental_date) AS d,
    COUNT (rental_id)
FROM
    rental
GROUP BY
    ROLLUP (
        EXTRACT (YEAR FROM rental_date),
        EXTRACT (MONTH FROM rental_date),
        EXTRACT (DAY FROM rental_date)
    );

'''
show_query(query)

+----+----+----+----------------+
|   y|   M|   d|count(rental_id)|
+----+----+----+----------------+
|2005|   7|  27|             649|
|2005|   5|  26|             174|
|2005|   8|  22|             626|
|2005|   7|   9|             513|
|null|null|null|           16044|
|2005|   5|  24|               8|
|2005|   7|   6|             504|
|2005|   5|  28|             196|
|2005|   8|  20|             624|
|2005|   8|  17|             593|
|2006|   2|  14|             182|
|2005|   7|  26|              33|
|2005|   6|  14|              16|
|2005|   8|  23|             598|
|2005|   5|  25|             137|
|2005|   7|  10|             480|
|2005|   8|null|            5686|
|2005|   5|  29|             154|
|2005|   6|null|            2311|
|2005|   8|   1|             671|
+----+----+----+----------------+
only showing top 20 rows



# CUBE

## Syntax

```
SELECT
    c1,
    c2,
    c3,
    aggregate (c4)
FROM
    table_name
GROUP BY
    CUBE (c1, c2, c3);
```

## Equivalence

```
CUBE(c1,c2,c3) 

GROUPING SETS (
    (c1,c2,c3), 
    (c1,c2),
    (c1,c3),
    (c2,c3),
    (c1),
    (c2),
    (c3), 
    ()
 ) 

```

## Example 1

In [34]:
query = '''
SELECT
    brand,
    segment,
    SUM (quantity)
FROM
    sales
GROUP BY
    CUBE (brand, segment)
ORDER BY
    brand,
    segment;
'''
show_query(query)

+-----+-------+-------------+
|brand|segment|sum(quantity)|
+-----+-------+-------------+
| null|   null|        700.0|
| null|  Basic|        500.0|
| null|Premium|        200.0|
|  ABC|   null|        300.0|
|  ABC|  Basic|        200.0|
|  ABC|Premium|        100.0|
|  XYZ|   null|        400.0|
|  XYZ|  Basic|        300.0|
|  XYZ|Premium|        100.0|
+-----+-------+-------------+



## Example 2

In [35]:
query = """
SELECT
    brand,
    segment,
    SUM (quantity)
FROM
    sales
GROUP BY
    brand,
    CUBE (segment)
ORDER BY
    brand,
    segment;
"""
show_query(query)

+-----+-------+-------------+
|brand|segment|sum(quantity)|
+-----+-------+-------------+
|  ABC|   null|        300.0|
|  ABC|  Basic|        200.0|
|  ABC|Premium|        100.0|
|  XYZ|   null|        400.0|
|  XYZ|  Basic|        300.0|
|  XYZ|Premium|        100.0|
+-----+-------+-------------+



# Exercise: repeat all the above using the pyspark API