In [1]:
from db_tools.setup import setup

In [2]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [3]:
spark = setup()

23/12/30 00:12:00 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/12/30 00:12:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/12/30 00:12:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SparkSession available as "spark"


In [4]:
def show_query(query,SparkSession = spark,n = 20):
    return SparkSession.sql(query).show()

In [15]:
brand_schema = StructType([ \
    StructField("brand",StringType(),True), \
    StructField("segment",StringType(),True), \
    StructField("quantity",StringType(),True)
  ])
brands =   [('ABC','Premium',100),
            ('ABC','Basic',200),
            ('XYZ','Premium',100),
            ('XYZ','Basic',300)]
columns = ['brand','segment','quantity']
rdd = spark.sparkContext.parallelize(brands)
spark.createDataFrame(rdd, schema = brand_schema).createOrReplaceTempView('sales')

In [17]:
df = spark.sql("SELECT * FROM sales;")
df.show()

+-----+-------+--------+
|brand|segment|quantity|
+-----+-------+--------+
|  ABC|Premium|     100|
|  ABC|  Basic|     200|
|  XYZ|Premium|     100|
|  XYZ|  Basic|     300|
+-----+-------+--------+



# ROLLUP with pyspark API

In [18]:
df.select(['brand','segment','quantity']).\
rollup(['brand','segment']).\
agg({'quantity':'sum'}).\
orderBy(['brand','segment']).show()

+-----+-------+-------------+
|brand|segment|sum(quantity)|
+-----+-------+-------------+
| null|   null|        700.0|
|  ABC|   null|        300.0|
|  ABC|  Basic|        200.0|
|  ABC|Premium|        100.0|
|  XYZ|   null|        400.0|
|  XYZ|  Basic|        300.0|
|  XYZ|Premium|        100.0|
+-----+-------+-------------+



In [19]:
df.select(['brand','segment','quantity']).\
rollup(['segment','brand']).\
agg({'quantity':'sum'}).\
orderBy(['brand','segment']).show()

+-------+-----+-------------+
|segment|brand|sum(quantity)|
+-------+-----+-------------+
|   null| null|        700.0|
|  Basic| null|        500.0|
|Premium| null|        200.0|
|  Basic|  ABC|        200.0|
|Premium|  ABC|        100.0|
|  Basic|  XYZ|        300.0|
|Premium|  XYZ|        100.0|
+-------+-----+-------------+



In [41]:
df.select(['brand','segment','quantity']).\
rollup(['segment','brand']).\
agg({'quantity':'sum'}).\
dropna(subset = 'segment').orderBy(['brand','segment']).show()

+-------+-----+-------------+
|segment|brand|sum(quantity)|
+-------+-----+-------------+
|  Basic| null|        500.0|
|Premium| null|        200.0|
|  Basic|  ABC|        200.0|
|Premium|  ABC|        100.0|
|  Basic|  XYZ|        300.0|
|Premium|  XYZ|        100.0|
+-------+-----+-------------+



In [23]:
query = '''
SELECT
    EXTRACT (YEAR FROM rental_date) AS y,
    EXTRACT (MONTH FROM rental_date) AS M,
    EXTRACT (DAY FROM rental_date) AS d,
    COUNT (rental_id)
FROM
    rental
GROUP BY
    ROLLUP (
        EXTRACT (YEAR FROM rental_date),
        EXTRACT (MONTH FROM rental_date),
        EXTRACT (DAY FROM rental_date)
    );

'''
show_query(query)

['brand', 'segment', 'quantity']

In [33]:
spark.sql('SELECT * FROM rental').select(
          F.year(F.col('rental_date')).alias('y'),
          F.month(F.col('rental_date')).alias('M'),
          F.dayofyear(F.col('rental_date')).alias('d'),
          F.col('rental_id')
        ).rollup(['y','M','d']).agg({'rental_id':'count'}).show()


+----+----+----+----------------+
|   y|   M|   d|count(rental_id)|
+----+----+----+----------------+
|2005|   5| 148|             196|
|null|null|null|           16044|
|2005|   6| 169|             344|
|2005|   7| 193|             495|
|2005|   8| 231|             628|
|2005|   5| 147|             166|
|2005|   6| 167|             324|
|2005|   7| 186|              27|
|2005|   6| 172|             275|
|2005|   8| 233|             659|
|2005|   7| 190|             513|
|2005|   7| 188|             461|
|2005|   7| 192|             461|
|2005|   7| 210|             641|
|2005|   7| 207|              33|
|2005|   7| 208|             649|
|2005|   8| 214|             643|
|2005|   8| 228|              23|
|2005|   7| 212|             679|
|2005|   8|null|            5686|
+----+----+----+----------------+
only showing top 20 rows



# CUBE

In [37]:
df.select(*['brand','segment','quantity'])\
.cube(['brand','segment'])\
.agg({'quantity':'sum'})\
.orderBy(['brand','segment']).show()

+-----+-------+-------------+
|brand|segment|sum(quantity)|
+-----+-------+-------------+
| null|   null|        700.0|
| null|  Basic|        500.0|
| null|Premium|        200.0|
|  ABC|   null|        300.0|
|  ABC|  Basic|        200.0|
|  ABC|Premium|        100.0|
|  XYZ|   null|        400.0|
|  XYZ|  Basic|        300.0|
|  XYZ|Premium|        100.0|
+-----+-------+-------------+



# Partial cube

In [42]:
df.select(*['brand','segment','quantity'])\
.cube(['brand','segment'])\
.agg({'quantity':'sum'})\
.dropna(subset = 'brand').orderBy(['brand','segment']).show()

+-----+-------+-------------+
|brand|segment|sum(quantity)|
+-----+-------+-------------+
|  ABC|   null|        300.0|
|  ABC|  Basic|        200.0|
|  ABC|Premium|        100.0|
|  XYZ|   null|        400.0|
|  XYZ|  Basic|        300.0|
|  XYZ|Premium|        100.0|
+-----+-------+-------------+

