In [32]:
import getpass

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Look up the current OS user; it is used to build a per-user warehouse path.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs against YARN.
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [33]:
# Read the order data CSV: the first row supplies the column names and
# Spark infers each column's type from the file contents.
orders_df = spark.read.csv(
    "/public/trendytech/datasets/order_data.csv",
    header=True,
    inferSchema=True,
)

In [34]:
# Preview the first 20 rows, truncating long cell values (the show() defaults).
orders_df.show(n=20, truncate=True)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|
|   536378|   85071B|RED CHARLIE+LOLA ...|      96|01-12-2010 9.37|     0.38|     14688|United Kingdom|
|   536378|    21931|JUMBO STORAGE BAG...|      10|01-12-2010 9.

In [35]:
from pyspark.sql.functions import *

In [36]:
## programmatic approach
# Per (country, invoice): total units sold and total invoice value
# (quantity * unit price), ordered by invoice number.
# The aggregates are referenced through the explicit `F` namespace so that
# `sum` is unambiguously Spark's column aggregate rather than Python's
# built-in `sum` (which a wildcard import would silently shadow).
summary_df = (
    orders_df
    .groupBy("country", "InvoiceNo")
    .agg(
        F.sum("quantity").alias("total_quantity"),
        F.sum(F.expr("quantity * unitprice")).alias("invoice_value"),
    )
    .sort("InvoiceNo")
)

In [37]:
# Display the per-invoice summary as this cell's output.
summary_df

country,InvoiceNo,total_quantity,invoice_value
United Kingdom,536378,242,192.78000000000003
United Kingdom,536380,24,34.8
United Kingdom,536381,198,449.98
United Kingdom,536382,134,430.6
United Kingdom,536384,190,489.6
United Kingdom,536385,53,130.85
United Kingdom,536386,236,508.2000000000001
United Kingdom,536387,1440,3193.92
United Kingdom,536388,108,226.14
Australia,536389,107,358.25


In [38]:
## semi-programmatic approach: DataFrame API calls with SQL expression strings

# Same grouping as the programmatic version, but each aggregate (and its
# output column name) is written as a SQL snippet passed to expr().
summary_df1 = (
    orders_df
    .groupBy("country", "invoiceno")
    .agg(
        expr("sum(quantity) as total_quantity"),
        expr("sum(quantity * unitprice) as invoice_value"),
    )
    .orderBy("invoiceno")
)

In [39]:
# Display the expression-string version of the summary as this cell's output.
summary_df1

country,invoiceno,total_quantity,invoice_value
United Kingdom,536378,242,192.78000000000003
United Kingdom,536380,24,34.8
United Kingdom,536381,198,449.98
United Kingdom,536382,134,430.6
United Kingdom,536384,190,489.6
United Kingdom,536385,53,130.85
United Kingdom,536386,236,508.2000000000001
United Kingdom,536387,1440,3193.92
United Kingdom,536388,108,226.14
Australia,536389,107,358.25


In [40]:
# Register the DataFrame as a temporary view called "orders" so it can be
# queried with spark.sql(); an existing view of that name is replaced.
orders_df.createOrReplaceTempView(name="orders")

In [47]:

# Pure Spark SQL approach: the same per-invoice aggregation, run as a query
# against the "orders" temporary view and printed with show().
invoice_summary_sql = """
select country, invoiceno, sum(quantity) as total_quantity, sum(quantity * unitprice) as invoice_value from orders group by country, invoiceno order by invoiceno
"""
spark.sql(invoice_summary_sql).show()

+--------------+---------+--------------+------------------+
|       country|invoiceno|total_quantity|     invoice_value|
+--------------+---------+--------------+------------------+
|United Kingdom|   536378|           242|192.78000000000003|
|United Kingdom|   536380|            24|              34.8|
|United Kingdom|   536381|           198|449.97999999999996|
|United Kingdom|   536382|           134|430.59999999999997|
|United Kingdom|   536384|           190|             489.6|
|United Kingdom|   536385|            53|            130.85|
|United Kingdom|   536386|           236|508.20000000000005|
|United Kingdom|   536387|          1440|           3193.92|
|United Kingdom|   536388|           108|            226.14|
|     Australia|   536389|           107|            358.25|
|United Kingdom|   536390|          1568|           1825.74|
|United Kingdom|   536392|           103|318.14000000000004|
|United Kingdom|   536393|             8|              79.6|
|United Kingdom|   53639

In [28]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()