In [3]:
from IPython.display import Image
from IPython.core.display import HTML

# Connecting to the database

In [1]:
from db_tools.setup import setup

# Getting a Spark session

In [2]:
# Session object that show_query() uses by default to run SQL.
# NOTE(review): setup() comes from the project's db_tools package; presumably it
# also makes the database tables (payment, customer, ...) queryable — confirm.
spark = setup()

23/10/25 20:34:27 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/10/25 20:34:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/10/25 20:34:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SparkSession available as "spark"


In [5]:
def show_query(query, SparkSession=spark, n=20):
    """Run a SQL query and print the first ``n`` rows of its result.

    Parameters
    ----------
    query : str
        SQL text passed to ``SparkSession.sql``.
    SparkSession : optional
        Object exposing ``.sql(query)``. Defaults to the notebook-level
        ``spark`` session. The parameter name is kept for backward
        compatibility even though it shadows the PySpark class name.
    n : int, default 20
        Maximum number of rows to print. The original accepted this argument
        but never forwarded it, so it had no effect.

    Returns
    -------
    The value returned by ``DataFrame.show`` (it prints the table; PySpark
    documents its return value as ``None``).
    """
    result = SparkSession.sql(query)
    # Forward n so callers can actually control how many rows are printed.
    return result.show(n)

# Reminder: SQL execution order

<center><img width=750 src="https://miro.medium.com/v2/resize:fit:749/1*DN0iewN5WFWgrXs5s5cLjQ.jpeg"/></center>

# Example 1: GROUP BY without any aggregate function - works like a SELECT DISTINCT

In [6]:
# With no aggregate in the SELECT list, GROUP BY only collapses duplicate keys:
# the result has one row per distinct customer_id found in `payment`.
query = """
SELECT
   customer_id
FROM
   payment
GROUP BY
   customer_id;
"""

In [7]:
# Print the result of the query defined in the previous cell.
show_query(query)

+-----------+
|customer_id|
+-----------+
|        463|
|        471|
|        496|
|        148|
|        392|
|        540|
|        243|
|        516|
|         31|
|        451|
|        580|
|         85|
|        137|
|        251|
|        458|
|         65|
|        481|
|        588|
|         53|
|        255|
+-----------+
only showing top 20 rows



### Comparing the row count of the above query with and without GROUP BY

In [13]:
# Count the rows that remain after grouping by customer_id, i.e. the number of
# distinct customer_id values in `payment` (the grouped query is the subquery).
query_with = """
SELECT COUNT(customer_id)
FROM (
SELECT
   customer_id
FROM
   payment
GROUP BY
   customer_id)
"""
show_query(query_with)

+------------------+
|count(customer_id)|
+------------------+
|               599|
+------------------+



In [15]:
# Same count written directly with COUNT(DISTINCT ...), no subquery needed.
query_equivalent = """
SELECT COUNT (DISTINCT customer_id) FROM payment;
"""
show_query(query_equivalent)

+---------------------------+
|count(DISTINCT customer_id)|
+---------------------------+
|                        599|
+---------------------------+



In [14]:
# Without GROUP BY / DISTINCT, COUNT(customer_id) counts every row of `payment`
# whose customer_id is not NULL, so repeat customers are counted once per row.
query_without = """
SELECT COUNT(customer_id)
FROM payment
"""
show_query(query_without)

+------------------+
|count(customer_id)|
+------------------+
|             14596|
+------------------+



# Example 2: Applying Aggregate functions

In [20]:
# Sum of `amount` per customer_id, sorted so the largest totals come first.
query = """
SELECT
	customer_id,
	SUM(amount) AS total
FROM
	payment
GROUP BY
	customer_id
ORDER BY total DESC
"""
show_query(query)

+-----------+------+
|customer_id| total|
+-----------+------+
|        148|211.55|
|        526|208.58|
|        178|194.61|
|        137|191.62|
|        144|189.60|
|        459|183.63|
|        181|167.67|
|        410|167.62|
|        236|166.61|
|        403|162.67|
|        522|161.68|
|        469|158.65|
|        470|157.69|
|        373|156.66|
|        259|154.70|
|        468|154.66|
|        462|152.69|
|        372|152.68|
|        187|151.73|
|        550|151.69|
+-----------+------+
only showing top 20 rows



In [21]:
# Sum of `amount` per customer name: `payment` is joined to `customer` on
# customer_id and the rows are grouped by the concatenated "first last" name.
# The GROUP BY refers to the SELECT alias `full_name`.
# NOTE(review): grouping by the name alone merges customers who share the same
# first and last name; add customer_id to the GROUP BY if they must stay apart.
query = """
SELECT
	first_name || ' ' || last_name full_name,
	SUM (amount) amount
FROM
	payment
INNER JOIN customer USING (customer_id)    	
GROUP BY
	full_name
ORDER BY amount DESC;
"""
show_query(query)

+---------------+------+
|      full_name|amount|
+---------------+------+
|   Eleanor Hunt|211.55|
|      Karl Seal|208.58|
|  Marion Snyder|194.61|
| Rhonda Kennedy|191.62|
|     Clara Shaw|189.60|
|  Tommy Collazo|183.63|
|    Ana Bradley|167.67|
|    Curtis Irby|167.62|
|    Marcia Dean|166.61|
|       Mike Way|162.67|
|  Arnold Havens|161.68|
|    Wesley Bull|158.65|
|  Gordon Allard|157.69|
|    Louis Leone|156.66|
|    Lena Jensen|154.70|
|       Tim Cary|154.66|
| Warren Sherrod|152.69|
|Steve Mackenzie|152.68|
| Brittany Riley|151.73|
|   Guy Brownlee|151.69|
+---------------+------+
only showing top 20 rows



In [23]:
# Number of payment rows (non-NULL payment_id) for each staff_id.
query = """
SELECT
	staff_id,
	COUNT (payment_id)
FROM
	payment
GROUP BY
	staff_id;
"""
show_query(query)

+--------+-----------------+
|staff_id|count(payment_id)|
+--------+-----------------+
|       1|             7292|
|       2|             7304|
+--------+-----------------+



In [24]:
# Grouping by two columns: one sum of `amount` per (customer_id, staff_id) pair.
# The result is ordered by customer_id only, so the order of the staff rows
# within a single customer is not fixed.
query = """
SELECT 
	customer_id, 
	staff_id, 
	SUM(amount) 
FROM 
	payment
GROUP BY 
	staff_id, 
	customer_id
ORDER BY 
    customer_id;
"""
show_query(query)

+-----------+--------+-----------+
|customer_id|staff_id|sum(amount)|
+-----------+--------+-----------+
|          1|       1|      60.85|
|          1|       2|      53.85|
|          2|       2|      67.88|
|          2|       1|      55.86|
|          3|       2|      70.88|
|          3|       1|      59.88|
|          4|       1|      49.88|
|          4|       2|      31.90|
|          5|       2|      70.79|
|          5|       1|      63.86|
|          6|       1|      53.85|
|          6|       2|      30.90|
|          7|       1|      69.84|
|          7|       2|      60.88|
|          8|       1|      57.86|
|          8|       2|      27.91|
|          9|       1|      39.88|
|          9|       2|      38.92|
|         10|       2|      53.88|
|         10|       1|      40.88|
+-----------+--------+-----------+
only showing top 20 rows



### Applying GROUP BY with DATE

In [26]:
# Sum of `amount` per calendar day: DATE() converts payment_date to a date, so
# all payments made on the same day fall into one group.
# There is no ORDER BY, so the days are returned in no particular order.
query = """
SELECT 
	DATE(payment_date) paid_date, 
	SUM(amount) sum
FROM 
	payment
GROUP BY
	DATE(payment_date);
"""
show_query(query)

+----------+-------+
| paid_date|    sum|
+----------+-------+
|2007-03-22|2586.79|
|2007-03-17|2442.16|
|2007-04-27|2673.57|
|2007-03-02|2550.05|
|2007-04-30|5723.89|
|2007-04-26| 347.21|
|2007-04-12|1930.48|
|2007-02-18|1275.98|
|2007-02-14| 116.73|
|2007-03-19|2617.69|
|2007-02-20|1219.09|
|2007-02-19|1290.90|
|2007-04-11|1940.32|
|2007-04-07|1984.28|
|2007-02-17|1188.17|
|2007-04-29|2717.60|
|2007-03-18|2701.76|
|2007-05-14| 514.18|
|2007-03-16| 299.28|
|2007-04-28|2622.73|
+----------+-------+
only showing top 20 rows



# Example 3: Filtering grouped results? Use the HAVING clause

In [28]:
# HAVING filters the groups after aggregation: keep only the customers whose
# summed `amount` is greater than 200.
query = """
SELECT
	customer_id,
	SUM (amount)
FROM
	payment
GROUP BY
	customer_id
HAVING
	SUM (amount) > 200;
"""
show_query(query)

+-----------+-----------+
|customer_id|sum(amount)|
+-----------+-----------+
|        148|     211.55|
|        526|     208.58|
+-----------+-----------+



In [29]:
# Customers counted per store_id; HAVING keeps only the stores with more than
# 300 customers.
query = """SELECT
	store_id,
	COUNT (customer_id)
FROM
	customer
GROUP BY
	store_id
HAVING
	COUNT (customer_id) > 300;"""
show_query(query)

+--------+------------------+
|store_id|count(customer_id)|
+--------+------------------+
|       1|               326|
+--------+------------------+



# Exercise: redo all of the above with the PySpark DataFrame API