In [2]:
from db_tools.setup import setup
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [3]:
# Build the session object that every query below runs against.
# NOTE(review): setup() comes from the project's db_tools package; it presumably
# returns a SparkSession with the film/rental tables already registered — confirm.
spark = setup()

23/10/29 12:49:14 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/10/29 12:49:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/10/29 12:49:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SparkSession available as "spark"


In [4]:
def show_query(query,SparkSession = spark,n = 20):
    """Run a SQL statement and print the first ``n`` rows of its result.

    Parameters
    ----------
    query : str
        SQL text passed to ``SparkSession.sql``.
    SparkSession : object, optional
        Session used to run the query; defaults to the notebook-level ``spark``.
        (The parameter name shadows the PySpark class name but is kept so that
        existing keyword callers continue to work.)
    n : int, optional
        Maximum number of rows to print. Defaults to 20.

    Returns
    -------
    None
        ``DataFrame.show`` prints to stdout and returns nothing; its result is
        returned unchanged.
    """
    # Forward ``n`` so the row limit requested by the caller is honoured;
    # previously the argument was accepted but silently ignored.
    return SparkSession.sql(query).show(n)

# CASE

## General example

In [5]:
# Searched CASE: label each film 'Short' / 'Medium' / 'Long' from its length.
# There is no ELSE branch, so a length that matches none of the WHEN
# conditions (NULL, or <= 0) produces NULL in the duration column.
query = """
SELECT title,
       length,
       CASE
           WHEN length> 0
                AND length <= 50 THEN 'Short'
           WHEN length > 50
                AND length <= 120 THEN 'Medium'
           WHEN length> 120 THEN 'Long'
       END duration
FROM film
ORDER BY title;
"""
show_query(query)

+-------------------+------+--------+
|              title|length|duration|
+-------------------+------+--------+
|   Academy Dinosaur|    86|  Medium|
|     Ace Goldfinger|    48|   Short|
|   Adaptation Holes|    50|   Short|
|   Affair Prejudice|   117|  Medium|
|        African Egg|   130|    Long|
|       Agent Truman|   169|    Long|
|    Airplane Sierra|    62|  Medium|
|    Airport Pollock|    54|  Medium|
|      Alabama Devil|   114|  Medium|
|   Aladdin Calendar|    63|  Medium|
|    Alamo Videotape|   126|    Long|
|     Alaska Phantom|   136|    Long|
|        Ali Forever|   150|    Long|
|     Alice Fantasia|    94|  Medium|
|       Alien Center|    46|   Short|
|    Alley Evolution|   180|    Long|
|         Alone Trip|    82|  Medium|
|      Alter Victory|    57|  Medium|
|       Amadeus Holy|   113|  Medium|
|Amelie Hellfighters|    79|  Medium|
+-------------------+------+--------+
only showing top 20 rows



## CASE WHEN with aggregates

In [13]:
# Conditional aggregation: each SUM(CASE ... THEN 1 ELSE 0 END) counts the films
# at one rental_rate, so the three price points come back as the columns
# Economy / Mass / Premium of a single result row.
query = """SELECT
	SUM (CASE 
         WHEN rental_rate = 0.99 THEN 1 
         ELSE 0
         END
         ) AS Economy,
	SUM (
		CASE
		WHEN rental_rate = 2.99 THEN 1
		ELSE 0
		END
	) AS Mass,
	SUM (
		CASE
		WHEN rental_rate = 4.99 THEN 1
		ELSE 0
		END
	) AS Premium
FROM
	film;"""
show_query(query)

+-------+----+-------+
|Economy|Mass|Premium|
+-------+----+-------+
|    341| 323|    336|
+-------+----+-------+



## More examples

In [14]:
# Simple CASE (CASE <expr> WHEN <value> ...): map each rating code to a
# human-readable description. No ELSE branch, so an unlisted rating gives NULL.
# The printed table shortens long descriptions with "..." because the output is
# rendered by DataFrame.show with its default truncation.
query = """
SELECT title,
       rating,
       CASE rating
           WHEN 'G' THEN 'General Audiences'
           WHEN 'PG' THEN 'Parental Guidance Suggested'
           WHEN 'PG-13' THEN 'Parents Strongly Cautioned'
           WHEN 'R' THEN 'Restricted'
           WHEN 'NC-17' THEN 'Adults Only'
       END rating_description
FROM film
ORDER BY title;
"""
show_query(query)

+-------------------+------+--------------------+
|              title|rating|  rating_description|
+-------------------+------+--------------------+
|   Academy Dinosaur|    PG|Parental Guidance...|
|     Ace Goldfinger|     G|   General Audiences|
|   Adaptation Holes| NC-17|         Adults Only|
|   Affair Prejudice|     G|   General Audiences|
|        African Egg|     G|   General Audiences|
|       Agent Truman|    PG|Parental Guidance...|
|    Airplane Sierra| PG-13|Parents Strongly ...|
|    Airport Pollock|     R|          Restricted|
|      Alabama Devil| PG-13|Parents Strongly ...|
|   Aladdin Calendar| NC-17|         Adults Only|
|    Alamo Videotape|     G|   General Audiences|
|     Alaska Phantom|    PG|Parental Guidance...|
|        Ali Forever|    PG|Parental Guidance...|
|     Alice Fantasia| NC-17|         Adults Only|
|       Alien Center| NC-17|         Adults Only|
|    Alley Evolution| NC-17|         Adults Only|
|         Alone Trip|     R|          Restricted|


In [19]:
# Count films per rating in one row: each SUM wraps a simple CASE that emits 1
# for the matching rating code and 0 otherwise, giving one column per rating.
query = """
SELECT
       SUM(CASE rating
             WHEN 'G' THEN 1 
		     ELSE 0 
		   END) AS General_Audiences,
       SUM(CASE rating
             WHEN 'PG' THEN 1 
		     ELSE 0 
		   END) AS Parental_Guidance_Suggested,
       SUM(CASE rating
             WHEN 'PG-13' THEN 1 
		     ELSE 0 
		   END) AS Parents_Strongly_Cautioned,
       SUM(CASE rating
             WHEN 'R' THEN 1 
		     ELSE 0 
		   END) AS Restricted,
       SUM(CASE rating
             WHEN 'NC-17' THEN 1 
		     ELSE 0 
		   END) AS Adults_Only
FROM film;
"""
show_query(query)

+-----------------+---------------------------+--------------------------+----------+-----------+
|General_Audiences|Parental_Guidance_Suggested|Parents_Strongly_Cautioned|Restricted|Adults_Only|
+-----------------+---------------------------+--------------------------+----------+-----------+
|              178|                        194|                       223|       195|        210|
+-----------------+---------------------------+--------------------------+----------+-----------+



# COALESCE

In [20]:
# COALESCE returns its first non-NULL argument; both arguments are non-NULL
# here, so the result is the first one (1).
query = """SELECT
	COALESCE (1, 2);"""
show_query(query)

+--------------+
|coalesce(1, 2)|
+--------------+
|             1|
+--------------+



In [21]:
# With a leading NULL, COALESCE skips it and returns the next non-NULL
# argument (2); the trailing 1 is never reached.
query = """SELECT
	COALESCE (NULL, 2 , 1);"""
show_query(query)

+--------------------+
|coalesce(NULL, 2, 1)|
+--------------------+
|                   2|
+--------------------+



# IFNULL

In [26]:
# IFNULL(a, b) returns b when a is NULL; the fallback here is a one-element
# array, showing that the replacement value need not be a scalar.
query = """SELECT IFNULL(NULL, array('2'));"""
show_query(query)

+----------------------+
|ifnull(NULL, array(2))|
+----------------------+
|                   [2]|
+----------------------+



In [34]:
# When the first argument is not NULL, IFNULL returns it unchanged (1).
query = """SELECT IFNULL(1, 2);"""
show_query(query)

+------------+
|ifnull(1, 2)|
+------------+
|           1|
+------------+



# NULLIF

In [32]:
# NULLIF(a, b) returns NULL when the two arguments are equal.
query = """SELECT nullif(2, 2);"""
show_query(query)

+------------+
|nullif(2, 2)|
+------------+
|        null|
+------------+



In [35]:
# When the arguments differ, NULLIF returns the first one (3).
query = """SELECT nullif(3, 2);"""
show_query(query)

+------------+
|nullif(3, 2)|
+------------+
|           3|
+------------+



# CAST

In [29]:
# CAST the same rental_date column to DATE and to TIMESTAMP side by side, to
# compare the date-only value with the full date-and-time value.
query = """SELECT rental_id,
                  CAST(rental_date AS DATE) AS rental_date_as_date,
                  CAST(rental_date AS TIMESTAMP) AS rental_date_as_timestamp
                  FROM rental"""
show_query(query)

+---------+-------------------+------------------------+
|rental_id|rental_date_as_date|rental_date_as_timestamp|
+---------+-------------------+------------------------+
|        2|         2005-05-24|     2005-05-24 22:54:33|
|        3|         2005-05-24|     2005-05-24 23:03:39|
|        4|         2005-05-24|     2005-05-24 23:04:41|
|        5|         2005-05-24|     2005-05-24 23:05:21|
|        6|         2005-05-24|     2005-05-24 23:08:07|
|        7|         2005-05-24|     2005-05-24 23:11:53|
|        8|         2005-05-24|     2005-05-24 23:31:46|
|        9|         2005-05-25|     2005-05-25 00:00:40|
|       10|         2005-05-25|     2005-05-25 00:02:21|
|       11|         2005-05-25|     2005-05-25 00:09:02|
|       12|         2005-05-25|     2005-05-25 00:19:27|
|       13|         2005-05-25|     2005-05-25 00:22:55|
|       14|         2005-05-25|     2005-05-25 00:31:15|
|       15|         2005-05-25|     2005-05-25 00:39:22|
|       16|         2005-05-25|

# Exercise: redo all of the above with the PySpark DataFrame API (`pyspark.sql`)