In [34]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from itertools import product
from pyspark.sql.window import Window

In [35]:
spark = SparkSession.builder \
    .appName("basic_app") \
    .getOrCreate()

# Loading the Data

In [36]:
df = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("../../../spark_data_examples/retail-data/all/*.csv")\
.coalesce(5)
df.cache()
df.createOrReplaceTempView("dfTable")

24/01/01 14:17:25 WARN CacheManager: Asked to cache already cached data.


# Checking the Fields

In [37]:
df.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



# Your first aggregation: counting

In [38]:
df.count()

541909

# Aggregation Functions

## count

In [39]:
df.select(F.count("StockCode").alias('counter')).show()

+-------+
|counter|
+-------+
| 541909|
+-------+



In [40]:
spark.sql("SELECT COUNT(*) AS counter FROM dfTable").show()

+-------+
|counter|
+-------+
| 541909|
+-------+



## Warning

There are a number of gotchas when it comes to null values and
counting. For instance, when performing a count(*) , Spark will
count null values (including rows containing all nulls). However,
when counting an individual column, Spark will not count the null
values.

## countDistinct

In [41]:
df.select(F.countDistinct("StockCode")).show()

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



In [42]:
spark.sql("SELECT COUNT(DISTINCT StockCode) FROM dfTable").show()

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



## approx_count_distinct

In [43]:
df.select(F.approx_count_distinct("StockCode", 0.1)).show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



In [44]:
spark.sql("SELECT approx_count_distinct(StockCode,0.1) FROM dfTable").show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



## First and last

In [45]:
df.select(F.first("StockCode"), F.last("StockCode")).show()

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|          85123A|          22138|
+----------------+---------------+



In [46]:
spark.sql("SELECT first(StockCode), last(StockCode) FROM dfTable").show()

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|          85123A|          22138|
+----------------+---------------+



## min and max

In [47]:
df.select(F.min("Quantity"), F.max("Quantity")).show()

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [48]:
spark.sql("SELECT min(Quantity), max(Quantity) FROM dfTable").show()

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



## sum

In [49]:
df.select(F.sum("Quantity")).show()

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



In [50]:
spark.sql("SELECT sum(Quantity) FROM dfTable").show()

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



### sumDistinct

In [51]:
df.select(F.sumDistinct("Quantity")).show()

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



In [52]:
spark.sql("SELECT SUM(DISTINCT Quantity) FROM dfTable").show()

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



## avg

In [53]:
df.select(
F.count("Quantity").alias("total_transactions"),
F.sum("Quantity").alias("total_purchases"),
F.avg("Quantity").alias("avg_purchases"),
F.expr("mean(Quantity)").alias("mean_purchases"))\
.selectExpr(
"total_purchases/total_transactions",
    "avg_purchases",
"mean_purchases").show()

+--------------------------------------+----------------+----------------+
|(total_purchases / total_transactions)|   avg_purchases|  mean_purchases|
+--------------------------------------+----------------+----------------+
|                      9.55224954743324|9.55224954743324|9.55224954743324|
+--------------------------------------+----------------+----------------+



In [54]:
spark.sql("""SELECT total_purchases/total_transactions,
                    avg_purchases,
                    mean_purchases 
             FROM (
                    SELECT count(Quantity) AS total_transactions,
                    SUM(Quantity) AS total_purchases,
                    AVG(Quantity) AS avg_purchases,
                    mean(Quantity) AS mean_purchases 
                    FROM dfTable
                  )""").show()

+--------------------------------------+----------------+----------------+
|(total_purchases / total_transactions)|   avg_purchases|  mean_purchases|
+--------------------------------------+----------------+----------------+
|                      9.55224954743324|9.55224954743324|9.55224954743324|
+--------------------------------------+----------------+----------------+



## Variance and Standard Deviation

In [55]:
df.select(F.var_pop("Quantity").alias('populational_variance'), F.var_samp("Quantity").alias('sample_variance'),
F.stddev_pop("Quantity").alias('populational_stddev'), F.stddev_samp("Quantity").alias('sample_stddev')).show()

+---------------------+----------------+-------------------+------------------+
|populational_variance| sample_variance|populational_stddev|     sample_stddev|
+---------------------+----------------+-------------------+------------------+
|     47559.3036466091|47559.3914092988| 218.08095663447807|218.08115785023426|
+---------------------+----------------+-------------------+------------------+



In [56]:
spark.sql("""SELECT var_pop(Quantity) AS populational_variance, 
             var_samp(Quantity) AS sample_variance,
             stddev_pop(Quantity) AS populational_stddev,
             stddev_samp(Quantity) AS sample_stddev
             FROM dfTable
""").show()

+---------------------+----------------+-------------------+------------------+
|populational_variance| sample_variance|populational_stddev|     sample_stddev|
+---------------------+----------------+-------------------+------------------+
|     47559.3036466091|47559.3914092988| 218.08095663447807|218.08115785023426|
+---------------------+----------------+-------------------+------------------+



## skewness and kurtosis

In [57]:
df.select(F.skewness("Quantity"), F.kurtosis("Quantity")).show()

+--------------------+------------------+
|  skewness(Quantity)|kurtosis(Quantity)|
+--------------------+------------------+
|-0.26407557610529564|119768.05495534712|
+--------------------+------------------+



In [58]:
spark.sql("""SELECT skewness(Quantity),
                    kurtosis(Quantity) 
            FROM dfTable""").show()

+--------------------+------------------+
|  skewness(Quantity)|kurtosis(Quantity)|
+--------------------+------------------+
|-0.26407557610529564|119768.05495534712|
+--------------------+------------------+



## Covariance and correlation

In [59]:
df.select(F.corr("InvoiceNo", "Quantity"), F.covar_samp("InvoiceNo", "Quantity"),
F.covar_pop("InvoiceNo", "Quantity")).show()

+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085642775E-4|             1052.7280543915654|            1052.7260778754612|
+-------------------------+-------------------------------+------------------------------+



### Covariance matrix

In [60]:
cols = df.columns
df.select(*[F.covar_samp(col1,col2) for col1,col2 in product(cols,cols)]).show()



+--------------------------------+--------------------------------+----------------------------------+-------------------------------+----------------------------------+--------------------------------+---------------------------------+------------------------------+--------------------------------+--------------------------------+----------------------------------+-------------------------------+----------------------------------+--------------------------------+---------------------------------+------------------------------+----------------------------------+----------------------------------+------------------------------------+---------------------------------+------------------------------------+----------------------------------+-----------------------------------+--------------------------------+-------------------------------+-------------------------------+---------------------------------+------------------------------+---------------------------------+----------------------

                                                                                

# Aggregating Complex Types

In [61]:
df.agg(F.collect_set("Country"), F.collect_list("Country")).show()

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+



In [62]:
spark.sql('SELECT collect_set(Country), collect_set(Country) FROM dfTable').show()

+--------------------+--------------------+
|collect_set(Country)|collect_set(Country)|
+--------------------+--------------------+
|[Portugal, Italy,...|[Portugal, Italy,...|
+--------------------+--------------------+



# Grouping

In [63]:
df.groupBy("InvoiceNo", "CustomerId").count().show(5)

+---------+----------+-----+
|InvoiceNo|CustomerId|count|
+---------+----------+-----+
|   536846|     14573|   76|
|   537026|     12395|   12|
|   537883|     14437|    5|
|   538068|     17978|   12|
|   538279|     14952|    7|
+---------+----------+-----+
only showing top 5 rows



In [64]:
spark.sql('SELECT InvoiceNo,CustomerId,count(*) AS count FROM dfTable GROUP BY InvoiceNo, CustomerId').show(5)

+---------+----------+-----+
|InvoiceNo|CustomerId|count|
+---------+----------+-----+
|   536846|     14573|   76|
|   537026|     12395|   12|
|   537883|     14437|    5|
|   538068|     17978|   12|
|   538279|     14952|    7|
+---------+----------+-----+
only showing top 5 rows



## Grouping with Expressions

In [65]:
df.groupBy("InvoiceNo").agg(
F.count("Quantity").alias("quan"),
F.expr("count(Quantity)")).show(5)

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   536596|   6|              6|
|   536938|  14|             14|
|   537252|   1|              1|
|   537691|  20|             20|
|   538041|   1|              1|
+---------+----+---------------+
only showing top 5 rows



## Grouping with Maps

In [66]:
df.groupBy("InvoiceNo").agg(F.expr("avg(Quantity)"),F.expr("stddev_pop(Quantity)"))\
.show(5)

+---------+------------------+--------------------+
|InvoiceNo|     avg(Quantity)|stddev_pop(Quantity)|
+---------+------------------+--------------------+
|   536596|               1.5|  1.1180339887498947|
|   536938|33.142857142857146|  20.698023172885524|
|   537252|              31.0|                 0.0|
|   537691|              8.15|   5.597097462078001|
|   538041|              30.0|                 0.0|
+---------+------------------+--------------------+
only showing top 5 rows



In [67]:
spark.sql("SELECT avg(Quantity), stddev_pop(Quantity), InvoiceNo FROM dfTable GROUP BY InvoiceNo").show(5)

+------------------+--------------------+---------+
|     avg(Quantity)|stddev_pop(Quantity)|InvoiceNo|
+------------------+--------------------+---------+
|               1.5|  1.1180339887498947|   536596|
|33.142857142857146|  20.698023172885524|   536938|
|              31.0|                 0.0|   537252|
|              8.15|   5.597097462078001|   537691|
|              30.0|                 0.0|   538041|
+------------------+--------------------+---------+
only showing top 5 rows



# Window Functions

In [68]:
dfWithDate = df.withColumn("date", F.to_date(F.col("InvoiceDate"), "MM/d/yyyy H:mm"))
dfWithDate.createOrReplaceTempView("dfWithDate")

In [69]:
windowSpec = Window\
.partitionBy("CustomerId", "date")\
.orderBy(F.desc("Quantity"))\
.rowsBetween(Window.unboundedPreceding, Window.currentRow)

In [70]:
maxPurchaseQuantity = F.max(F.col("Quantity")).over(windowSpec)

In [71]:
type(maxPurchaseQuantity)

pyspark.sql.column.Column

In [72]:
purchaseDenseRank = F.dense_rank().over(windowSpec)
purchaseRank = F.rank().over(windowSpec)

In [73]:
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
#spark.sql("set spark.sql.legacy.timeParserPolicy=CORRECTED")

DataFrame[key: string, value: string]

In [74]:
dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
.select(
F.col("CustomerId"),
F.col("date"),
F.col("Quantity"),
purchaseRank.alias("quantityRank"),
purchaseDenseRank.alias("quantityDenseRank"),
maxPurchaseQuantity.alias("maxPurchaseQuantity")).show()

                                                                                

+----------+----------+--------+------------+-----------------+-------------------+
|CustomerId|      date|Quantity|quantityRank|quantityDenseRank|maxPurchaseQuantity|
+----------+----------+--------+------------+-----------------+-------------------+
|     12346|2011-01-18|   74215|           1|                1|              74215|
|     12346|2011-01-18|  -74215|           2|                2|              74215|
|     12347|2010-12-07|      36|           1|                1|                 36|
|     12347|2010-12-07|      30|           2|                2|                 36|
|     12347|2010-12-07|      24|           3|                3|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|                 36|
|     12347|2010-12-07|      12|           4|                4|             

In [75]:
spark.sql("""
                SELECT CustomerId, date, Quantity,
                rank(Quantity) OVER (PARTITION BY CustomerId, date
                ORDER BY Quantity DESC NULLS LAST
                ROWS BETWEEN
                UNBOUNDED PRECEDING AND
                CURRENT ROW) as rank,
                dense_rank(Quantity) OVER (PARTITION BY CustomerId, date
                ORDER BY Quantity DESC NULLS LAST
                ROWS BETWEEN
                UNBOUNDED PRECEDING AND
                CURRENT ROW) as dRank,
                max(Quantity) OVER (PARTITION BY CustomerId, date
                ORDER BY Quantity DESC NULLS LAST
                ROWS BETWEEN
                UNBOUNDED PRECEDING AND
                CURRENT ROW) as maxPurchase
                FROM dfWithDate WHERE CustomerId IS NOT NULL ORDER BY CustomerId
""").show()

+----------+----------+--------+----+-----+-----------+
|CustomerId|      date|Quantity|rank|dRank|maxPurchase|
+----------+----------+--------+----+-----+-----------+
|     12346|2011-01-18|   74215|   1|    1|      74215|
|     12346|2011-01-18|  -74215|   2|    2|      74215|
|     12347|2010-12-07|      36|   1|    1|         36|
|     12347|2010-12-07|      30|   2|    2|         36|
|     12347|2010-12-07|      24|   3|    3|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|     

# Grouping Sets

In [76]:
dfNoNull = dfWithDate.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")

In [77]:
spark.sql("""
    SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull
    GROUP BY customerId, stockCode
    ORDER BY CustomerId DESC, stockCode DESC
""").show(5)



+----------+---------+-------------+
|CustomerId|stockCode|sum(Quantity)|
+----------+---------+-------------+
|     18287|    85173|           48|
|     18287|   85040A|           48|
|     18287|   85039B|          120|
|     18287|   85039A|           96|
|     18287|    84920|            4|
+----------+---------+-------------+
only showing top 5 rows



                                                                                

In [88]:
spark.sql("""
    SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull
    GROUP BY customerId, stockCode GROUPING SETS((customerId, stockCode))
    ORDER BY CustomerId DESC, stockCode DESC
""").show(5)

+----------+---------+-------------+
|customerId|stockCode|sum(Quantity)|
+----------+---------+-------------+
|     18287|    85173|           48|
|     18287|   85040A|           48|
|     18287|   85039B|          120|
|     18287|   85039A|           96|
|     18287|    84920|            4|
+----------+---------+-------------+
only showing top 5 rows



## There is no need to decalre customerId and stockCode outside the grouping sets, lets look why

In [96]:
df1 = spark.sql("""
    SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull
    GROUP BY GROUPING SETS((customerId, stockCode))
    ORDER BY CustomerId DESC, stockCode DESC
""")
df2 = spark.sql("""
    SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull
    GROUP BY customerId, stockCode
    ORDER BY CustomerId DESC, stockCode DESC
""")
assert df1.subtract(df2).count() == df2.subtract(df1).count(), 'Different queries'

Note that both queries have produced the same results. This happens because the grouping set is exactly equal to
the columns declared outside it

Obs: Grouping sets depend on null values for aggregation levels. If you
do not filter-out null values, you will get incorrect results. This
applies to cubes, rollups, and grouping sets.

In [47]:
spark.sql("""
SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull
GROUP BY customerId, stockCode GROUPING SETS((customerId, stockCode),())
ORDER BY CustomerId DESC, stockCode DESC
""").show(5)

+----------+---------+-------------+
|customerId|stockCode|sum(Quantity)|
+----------+---------+-------------+
|     18287|    85173|           48|
|     18287|   85040A|           48|
|     18287|   85039B|          120|
|     18287|   85039A|           96|
|     18287|    84920|            4|
+----------+---------+-------------+
only showing top 5 rows



## Checking the difference again

In [97]:
df1 = spark.sql("""
SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull
GROUP BY customerId, stockCode GROUPING SETS((customerId, stockCode),())
ORDER BY CustomerId DESC, stockCode DESC
""")

df2 = spark.sql("""
SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull
GROUP BY GROUPING SETS((customerId, stockCode),())
ORDER BY CustomerId DESC, stockCode DESC
""")
assert df1.subtract(df2).count() == df2.subtract(df1).count()

Tip: whenever possible, always choose the less verbose query

# Rollup

In [49]:
rolledUpDF = dfNoNull.rollup("Date", "Country").agg(F.sum("Quantity"))\
.selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity")\
.orderBy("Date")
rolledUpDF.show()

+----------+--------------+--------------+
|      Date|       Country|total_quantity|
+----------+--------------+--------------+
|      null|          null|       5176450|
|2010-12-01|United Kingdom|         23949|
|2010-12-01|        France|           449|
|2010-12-01|     Australia|           107|
|2010-12-01|          null|         26814|
|2010-12-01|        Norway|          1852|
|2010-12-01|       Germany|           117|
|2010-12-01|          EIRE|           243|
|2010-12-01|   Netherlands|            97|
|2010-12-02|          EIRE|             4|
|2010-12-02|          null|         21023|
|2010-12-02|United Kingdom|         20873|
|2010-12-02|       Germany|           146|
|2010-12-03|      Portugal|            65|
|2010-12-03|        Poland|           140|
|2010-12-03|       Belgium|           528|
|2010-12-03|United Kingdom|         10439|
|2010-12-03|        France|           239|
|2010-12-03|         Italy|           164|
|2010-12-03|          null|         14830|
+----------

In [53]:
spark.sql(""" SELECT
    Date,
    Country,
    SUM(Quantity) AS total_quantity
FROM
    dfNoNull
GROUP BY
    ROLLUP (Date, Country)
ORDER BY
    Date;
""").show()

+----------+--------------+--------------+
|      Date|       Country|total_quantity|
+----------+--------------+--------------+
|      null|          null|       5176450|
|2010-12-01|United Kingdom|         23949|
|2010-12-01|        France|           449|
|2010-12-01|     Australia|           107|
|2010-12-01|          null|         26814|
|2010-12-01|        Norway|          1852|
|2010-12-01|       Germany|           117|
|2010-12-01|          EIRE|           243|
|2010-12-01|   Netherlands|            97|
|2010-12-02|          EIRE|             4|
|2010-12-02|          null|         21023|
|2010-12-02|United Kingdom|         20873|
|2010-12-02|       Germany|           146|
|2010-12-03|      Portugal|            65|
|2010-12-03|        Poland|           140|
|2010-12-03|       Belgium|           528|
|2010-12-03|United Kingdom|         10439|
|2010-12-03|        France|           239|
|2010-12-03|         Italy|           164|
|2010-12-03|          null|         14830|
+----------

## Getting the grand total

In [51]:
rolledUpDF.where("Date IS NULL").show()

+----+-------+--------------+
|Date|Country|total_quantity|
+----+-------+--------------+
|null|   null|       5176450|
+----+-------+--------------+



# Cube

In [55]:
dfNoNull.cube("Date", "Country").agg(F.sum(F.col("Quantity")))\
.select("Date", "Country", "sum(Quantity)").orderBy("Date").show()



+----+--------------------+-------------+
|Date|             Country|sum(Quantity)|
+----+--------------------+-------------+
|null|               Italy|         7999|
|null|               Spain|        26824|
|null|      United Kingdom|      4263829|
|null|              Sweden|        35637|
|null|                EIRE|       142637|
|null|             Lebanon|          386|
|null|             Iceland|         2458|
|null|             Denmark|         8188|
|null|           Singapore|         5234|
|null|                null|      5176450|
|null|             Germany|       117448|
|null|              Cyprus|         6317|
|null|              France|       110480|
|null|             Austria|         4827|
|null|             Finland|        10666|
|null|        Saudi Arabia|           75|
|null|              Poland|         3653|
|null|           Australia|        83653|
|null|United Arab Emirates|          982|
|null|              Norway|        19247|
+----+--------------------+-------

                                                                                

In [56]:
spark.sql("""
SELECT
    Date,
    Country,
    SUM(Quantity) AS total_quantity
FROM
    dfNoNull
GROUP BY
    CUBE (Date, Country)
ORDER BY
    Date;
""").show()

+----+--------------------+--------------+
|Date|             Country|total_quantity|
+----+--------------------+--------------+
|null|               Italy|          7999|
|null|               Spain|         26824|
|null|      United Kingdom|       4263829|
|null|              Sweden|         35637|
|null|                EIRE|        142637|
|null|             Lebanon|           386|
|null|             Iceland|          2458|
|null|             Denmark|          8188|
|null|           Singapore|          5234|
|null|                null|       5176450|
|null|             Germany|        117448|
|null|              Cyprus|          6317|
|null|              France|        110480|
|null|             Austria|          4827|
|null|             Finland|         10666|
|null|        Saudi Arabia|            75|
|null|              Poland|          3653|
|null|           Australia|         83653|
|null|United Arab Emirates|           982|
|null|              Norway|         19247|
+----+-----

# Grouping Metadata

In [111]:
df_grouped = dfNoNull.cube("customerId", "stockCode").agg(F.grouping_id().alias('grouping_id'), F.sum("Quantity"))\
.orderBy(F.col("grouping_id").desc())

## Getting the grand total by its grouping_id

In [113]:
df_grouped.filter('grouping_id = 3').show()

+----------+---------+-----------+-------------+
|customerId|stockCode|grouping_id|sum(Quantity)|
+----------+---------+-----------+-------------+
|      null|     null|          3|      5176450|
+----------+---------+-----------+-------------+



## Checking the other grouping_id

In [114]:
df_grouped.filter('grouping_id = 2').show(1)

+----------+---------+-----------+-------------+
|customerId|stockCode|grouping_id|sum(Quantity)|
+----------+---------+-----------+-------------+
|      null|    21756|          2|          821|
+----------+---------+-----------+-------------+
only showing top 1 row



In [115]:
df_grouped.filter('grouping_id = 1').show(1)

+----------+---------+-----------+-------------+
|customerId|stockCode|grouping_id|sum(Quantity)|
+----------+---------+-----------+-------------+
|     15574|     null|          1|          349|
+----------+---------+-----------+-------------+
only showing top 1 row



In [116]:
df_grouped.filter('grouping_id = 0').show(1)

[Stage 395:>                                                        (0 + 5) / 5]

+----------+---------+-----------+-------------+
|customerId|stockCode|grouping_id|sum(Quantity)|
+----------+---------+-----------+-------------+
|     12431|   35004C|          0|            6|
+----------+---------+-----------+-------------+
only showing top 1 row



                                                                                

The grouping id logic is as it follows: 
| grouping_id | customerId | stockCode |
| --- | --- | --- |
| 3| 0 | 0 |
| 2| 0 | 1 |
| 1| 1 | 0 |
| 0| 1 | 1 |

# Pivot

In [123]:
pivoted = dfWithDate.groupBy("date").pivot("Country").sum().fillna(0)

In [124]:
pivoted.toPandas()

Unnamed: 0,date,Australia_sum(Quantity),Australia_sum(UnitPrice),Australia_sum(CustomerID),Austria_sum(Quantity),Austria_sum(UnitPrice),Austria_sum(CustomerID),Bahrain_sum(Quantity),Bahrain_sum(UnitPrice),Bahrain_sum(CustomerID),...,USA_sum(CustomerID),United Arab Emirates_sum(Quantity),United Arab Emirates_sum(UnitPrice),United Arab Emirates_sum(CustomerID),United Kingdom_sum(Quantity),United Kingdom_sum(UnitPrice),United Kingdom_sum(CustomerID),Unspecified_sum(Quantity),Unspecified_sum(UnitPrice),Unspecified_sum(CustomerID)
0,2011-01-30,0,0.00,0,0,0.00,0,0,0.0,0,...,0,0,0.0,0,3367,2321.72,11334037,0,0.0,0
1,2011-05-06,0,0.00,0,42,58.95,74484,0,0.0,0,...,0,0,0.0,0,17404,6952.63,18722445,0,0.0,0
2,2011-10-07,0,0.00,0,0,0.00,0,0,0.0,0,...,0,0,0.0,0,25657,12425.41,27566889,0,0.0,0
3,2011-01-23,0,0.00,0,0,0.00,0,0,0.0,0,...,0,0,0.0,0,5068,2551.56,13409810,0,0.0,0
4,2011-07-18,0,0.00,0,0,0.00,0,0,0.0,0,...,0,0,0.0,0,9908,26946.80,16266983,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,2011-04-05,0,0.00,0,0,0.00,0,0,0.0,0,...,0,0,0.0,0,9747,3566.91,13171907,0,0.0,0
301,2011-01-10,96,0.85,12415,0,0.00,0,0,0.0,0,...,0,0,0.0,0,6911,7129.17,9992267,0,0.0,0
302,2011-01-19,113,36.60,161486,0,0.00,0,0,0.0,0,...,0,0,0.0,0,7802,4142.04,11588370,0,0.0,0
303,2011-09-27,0,0.00,0,0,0.00,0,0,0.0,0,...,0,0,0.0,0,20279,6573.45,20291612,0,0.0,0
