## **[MySQL ROLLUP](https://www.mysqltutorial.org/mysql-rollup/)**

Use the MySQL ROLLUP clause to generate subtotals and grand totals.

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from IPython.display import Image, SVG

from sqlalchemy_utils import database_exists, create_database
from sqlalchemy import create_engine, inspect, MetaData, text
from sqlalchemy_schemadisplay import create_schema_graph
import pymysql

pd.set_option(
    'display.max_columns', None,
    'expand_frame_repr', True,
    'display.max_colwidth', None,
    'display.max_rows', 10,
)

pd.set_option('display.width', 65)

In [2]:
# connect to the classicmodels database
connect_args={'ssl':{'fake_flag_to_enable_tls': True}}

engine = create_engine('mysql+pymysql://namlq:abc123@localhost/classicmodels',
                       connect_args=connect_args, echo=False
                         )
inspector = inspect(engine)

### Setting up a sample table

Creates a new table named `table` that stores the order values summarized by product lines and years.

In [3]:
engine.execute('DROP TABLE IF EXISTS sales')

string = '''
CREATE TABLE sales
SELECT
    productLine,
    YEAR(orderDate) orderYear,
    SUM(quantityOrdered * priceEach) orderValue
FROM orderdetails
INNER JOIN orders USING (orderNumber)
INNER JOIN products USING (productCode)
GROUP BY productLine, YEAR(orderDate)
;'''

engine.execute(string)

  engine.execute('DROP TABLE IF EXISTS sales')


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f3d1c4200a0>

In [4]:
pd.read_sql('SELECT * FROM sales', engine).head()

Unnamed: 0,productLine,orderYear,orderValue
0,Vintage Cars,2003,619161.48
1,Classic Cars,2003,1374832.22
2,Trucks and Buses,2003,376657.12
3,Trains,2003,65822.05
4,Ships,2003,222182.08


### MySQL ROLL UP Overview

In [5]:
# create a grouping set denoted by productLine
string = '''
SELECT productLine, SUM(orderValue) totalOrderValue
FROM sales
GROUP BY productLine
ORDER BY productLine ASC
;'''

df1 = pd.read_sql(string, engine)

In [6]:
df2 = (
    pd.read_sql_table('sales', engine)
    .groupby('productLine', as_index=False)
    .agg(totalOrderValue = pd.NamedAgg(
        column='orderValue', aggfunc='sum'))
)

In [7]:
df1.equals(df2)

False

In [8]:
(df1 != df2).sum()

productLine        0
totalOrderValue    2
dtype: int64

In [9]:
(df1.totalOrderValue - df2.totalOrderValue).abs().sum()

3.4924596548080444e-10

In [10]:
df1

Unnamed: 0,productLine,totalOrderValue
0,Classic Cars,3853922.49
1,Motorcycles,1121426.12
2,Planes,954637.54
3,Ships,663998.34
4,Trains,188532.92
5,Trucks and Buses,1024113.57
6,Vintage Cars,1797559.63


In [11]:
# create an empty grouping set
string = '''
SELECT
    SUM(orderValue) totalOrderValue
FROM
    sales
;'''

pd.read_sql(string, engine)

Unnamed: 0,totalOrderValue
0,9604190.61


In [12]:
pd.DataFrame(
    pd.read_sql_table('sales', engine)
    [['orderValue']]
    .sum()
    .rename('totalOrderValue')
    .reset_index(drop=True)
)

Unnamed: 0,totalOrderValue
0,9604190.61


- Generate two or more grouping sets together in one query using UNION ALL clause

In [13]:
string = '''
SELECT
    productLine,
    SUM(orderValue) totalOrderValue
FROM sales
GROUP BY productLine
UNION ALL
SELECT
    NULL, # UNION ALL requires all queries to have the same number of columns
    SUM(orderValue) totalOrderValue
FROM sales
;'''

pd.read_sql(string, engine)

Unnamed: 0,productLine,totalOrderValue
0,Vintage Cars,1797559.63
1,Classic Cars,3853922.49
2,Trucks and Buses,1024113.57
3,Trains,188532.92
4,Ships,663998.34
5,Planes,954637.54
6,Motorcycles,1121426.12
7,,9604190.61


- Generate two or more grouping sets together in one query using ROLLUP clause:

In [14]:
string = '''
SELECT productLine, SUM(orderValue) totalOrderValue
FROM sales
GROUP BY productLine WITH ROLLUP
;'''

df1 = pd.read_sql(string, engine)

In [15]:
df2 = (
    pd.read_sql_table('sales', engine)
    .groupby('productLine', as_index=False)
    .agg(totalOrderValue=pd.NamedAgg(
        column='orderValue', aggfunc='sum'))
)

agg_row = pd.DataFrame(
    df2[['totalOrderValue']]
    .sum()
).T

df2 = pd.concat([df2, agg_row], ignore_index=True)

In [16]:
df1.equals(df2)

False

In [17]:
(df1 != df2).sum()

productLine        1
totalOrderValue    2
dtype: int64

In [18]:
(df1.totalOrderValue - df2.totalOrderValue).abs().sum()

3.4924596548080444e-10

In [19]:
df1

Unnamed: 0,productLine,totalOrderValue
0,Classic Cars,3853922.49
1,Motorcycles,1121426.12
2,Planes,954637.54
3,Ships,663998.34
4,Trains,188532.92
5,Trucks and Buses,1024113.57
6,Vintage Cars,1797559.63
7,,9604190.61


- Two columns specified in the `GROUP BY` clause:

In [20]:
pd.set_option('display.max_rows', 50)

In [21]:
string = '''
SELECT
    productLine,
    orderYear,
    SUM(orderValue) totalOrderValue
FROM sales
GROUP BY productLine, orderYear WITH ROLLUP
ORDER BY
    # https://stackoverflow.com/a/41237266/2757266
    productLine IS NULL, # NULL last
    productLine,
    orderYear IS NULL,
    orderYear
;'''

df1 = pd.read_sql(string, engine)

In [22]:
df2 = (
    pd.read_sql_table('sales', engine)
    .groupby(['productLine', 'orderYear'], as_index=False)
    .agg(totalOrderValue=pd.NamedAgg(
        column='orderValue', aggfunc='sum'))
)

agg_pl = (
    df2
    .groupby('productLine', as_index=False)
    ['totalOrderValue']
    .sum()
)

agg_all = pd.DataFrame(
    df2
    [['totalOrderValue']]
    .sum()
).T

df2 = (pd.concat([df2, agg_pl, agg_all])
 .sort_values(['productLine', 'orderYear'], na_position='last')
 .reset_index(drop=True)
)

In [23]:
df1.equals(df2)

False

In [24]:
(df1 != df2).sum()

productLine        1
orderYear          8
totalOrderValue    3
dtype: int64

In [25]:
df1.isna().sum()

productLine        1
orderYear          8
totalOrderValue    0
dtype: int64

In [26]:
(df1.totalOrderValue - df2.totalOrderValue).abs().sum()

2.2118911147117615e-09

In [27]:
df1

Unnamed: 0,productLine,orderYear,totalOrderValue
0,Classic Cars,2003.0,1374832.22
1,Classic Cars,2004.0,1763136.73
2,Classic Cars,2005.0,715953.54
3,Classic Cars,,3853922.49
4,Motorcycles,2003.0,348909.24
5,Motorcycles,2004.0,527243.84
6,Motorcycles,2005.0,245273.04
7,Motorcycles,,1121426.12
8,Planes,2003.0,309784.2
9,Planes,2004.0,471971.46


Reverse the herarchy:

In [28]:
string = '''
SELECT
    orderYear,
    productLine,
    SUM(orderValue) totalOrderValue
FROM sales
GROUP BY orderYear, productLine
WITH ROLLUP
;'''

df1 = pd.read_sql(string, engine)

In [29]:
df2 = (
    pd.read_sql_table('sales', engine)
    .groupby(['orderYear', 'productLine'], as_index=False)
    .agg(totalOrderValue = pd.NamedAgg(
        column='orderValue', aggfunc='sum'))
    [['orderYear', 'productLine', 'totalOrderValue']]
    .sort_values(by=['orderYear', 'productLine'])
)

agg_year = (
    df2
    .groupby('orderYear', as_index=False)
    .agg({'totalOrderValue': 'sum'})
    [['orderYear', 'totalOrderValue']]
)
agg_all = pd.DataFrame(
    agg_year[['totalOrderValue']].sum()
).T


df2 = (
    pd.concat([df2, agg_year, agg_all], axis=0)
    .sort_values(['orderYear', 'productLine'])
    .reset_index(drop=True)   
)

In [30]:
(df1 != df2).sum()

orderYear          1
productLine        4
totalOrderValue    1
dtype: int64

In [31]:
df1.isna().sum()

orderYear          1
productLine        4
totalOrderValue    0
dtype: int64

In [32]:
(df1.totalOrderValue - df2.totalOrderValue).abs().sum()

4.656612873077393e-10

In [33]:
df1

Unnamed: 0,orderYear,productLine,totalOrderValue
0,2003.0,Classic Cars,1374832.22
1,2003.0,Motorcycles,348909.24
2,2003.0,Planes,309784.2
3,2003.0,Ships,222182.08
4,2003.0,Trains,65822.05
5,2003.0,Trucks and Buses,376657.12
6,2003.0,Vintage Cars,619161.48
7,2003.0,,3317348.39
8,2004.0,Classic Cars,1763136.73
9,2004.0,Motorcycles,527243.84


### The GROUPING() function

To check whether `NULL` in the result set represents the subtotals or grandtotals.

In [34]:
# GROUPING(orderYear) returns 1 when NULL in the orderYear, 0 otherwise
string = '''
SELECT
    orderYear,
    productLine,
    SUM(orderValue) totalOrderValue,
    GROUPING(orderYear) AS GROUPING_orderYear,
    GROUPING(productLine) AS GROUPING_productLine
FROM sales
GROUP BY orderYear, productLine WITH ROLLUP
;'''

df1 = pd.read_sql_query(string, engine)

In [35]:
df2 = (
    pd.read_sql_table('sales', engine)
    .groupby(['orderYear', 'productLine'], as_index=False)
    .agg(totalOrderValue = pd.NamedAgg(
        column='orderValue', aggfunc='sum'))
    [['orderYear', 'productLine', 'totalOrderValue']]
)

agg_pl = (
    df2
    .groupby('orderYear', as_index=False)
    [['totalOrderValue']]
    .sum()
)

agg_all = pd.DataFrame(
    agg_pl[['totalOrderValue']].sum()
).T

df2 = (
    pd.concat([df2, agg_pl, agg_all], axis=0)
    .assign(GROUPING_orderYear = lambda df: df.orderYear.isna().astype(int),
            GROUPING_productLine = lambda df: df.productLine.isna().astype(int))
    .sort_values(['orderYear', 'productLine'], ignore_index=True)
)

In [36]:
df1.equals(df2)

False

In [37]:
(df1 != df2).sum()

orderYear               1
productLine             4
totalOrderValue         1
GROUPING_orderYear      0
GROUPING_productLine    0
dtype: int64

In [38]:
df1.isna().sum()

orderYear               1
productLine             4
totalOrderValue         0
GROUPING_orderYear      0
GROUPING_productLine    0
dtype: int64

In [39]:
(df1.totalOrderValue - df2.totalOrderValue).abs().sum()

4.656612873077393e-10

In [40]:
df1

Unnamed: 0,orderYear,productLine,totalOrderValue,GROUPING_orderYear,GROUPING_productLine
0,2003.0,Classic Cars,1374832.22,0,0
1,2003.0,Motorcycles,348909.24,0,0
2,2003.0,Planes,309784.2,0,0
3,2003.0,Ships,222182.08,0,0
4,2003.0,Trains,65822.05,0,0
5,2003.0,Trucks and Buses,376657.12,0,0
6,2003.0,Vintage Cars,619161.48,0,0
7,2003.0,,3317348.39,0,1
8,2004.0,Classic Cars,1763136.73,0,0
9,2004.0,Motorcycles,527243.84,0,0


### combine IF() function with GROUPING() function

In [41]:
# substitute labels for the super-aggregate NULL values
# in orderYear and productLine columns
string = '''
SELECT
    IF(GROUPING(orderYear), 'ALL Years', orderYear) orderYear,
    IF(GROUPING(productLine), 'ALL Product Lines', productLine) productLine,
    SUM(orderValue) totalOrderValue
FROM sales
GROUP BY orderYear, productLine WITH ROLLUP
;'''

df1 = pd.read_sql(string, engine)

In [42]:
df2 = (
    pd.read_sql_table('sales', engine)
    .groupby(['orderYear', 'productLine'], as_index=False)
    .agg(totalOrderValue = pd.NamedAgg(
        column='orderValue', aggfunc='sum'))
    [['orderYear', 'productLine', 'totalOrderValue']]
)

agg_year = (
    df2
    .groupby('orderYear', as_index=False)
    .agg({'totalOrderValue': 'sum'})
    .assign(productLine = 'ALL Product Lines')
)

agg_all = pd.DataFrame(
    agg_year[['totalOrderValue']].sum()
).T

agg_all = agg_all.assign(
    productLine = 'ALL Product Lines',
    orderYear = 'ALL Years')

df2 = pd.concat([df2, agg_year, agg_all], axis=0)

categories = ['Classic Cars', 'Motorcycles', 'Planes', 'Ships', 'Trains',
              'Trucks and Buses', 'Vintage Cars', 'ALL Product Lines']
df2 = (
    df2
    .assign(productLine = pd.Categorical(
        df2.productLine, categories=categories, ordered=True),
            orderYear = df2.orderYear.astype(str))
    .sort_values(['orderYear', 'productLine'], ignore_index=True)
)

In [43]:
df1 = (
    df1
    .assign(productLine = pd.Categorical(
        df1.productLine, categories=categories, ordered=True),
            orderYear = df1.orderYear.astype(str))
)

In [44]:
df1.equals(df2)

False

In [45]:
(df1 != df2).sum()

orderYear          0
productLine        0
totalOrderValue    1
dtype: int64

In [46]:
(df1.totalOrderValue - df2.totalOrderValue).abs().sum()

4.656612873077393e-10

In [47]:
engine.execute('DROP TABLE IF EXISTS sales;')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f3ca79f59a0>

In [48]:
pd.set_option('display.max_rows', 10) 