# Connecting to the database

In [147]:
from db_tools.setup import setup

# Getting a SparkSession

In [3]:
spark = setup()

23/10/22 13:13:09 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/10/22 13:13:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/10/22 13:13:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SparkSession available as "spark"


In [4]:
def show_query(query, SparkSession=None, n=20):
    """Run a SQL query and print the first ``n`` rows of its result.

    Parameters
    ----------
    query : str
        SQL text handed to the session's ``sql`` method.
    SparkSession : optional
        Session object used to run the query. When omitted, the notebook-level
        ``spark`` session is looked up at call time, so a session created
        (or re-created) after this function was defined is still picked up.
    n : int, default 20
        Maximum number of rows to print.

    Returns
    -------
    None
        The result of ``DataFrame.show``, which prints the table instead of
        returning it.
    """
    # Resolve the default lazily: a ``SparkSession=spark`` default would be
    # frozen at definition time and fail if ``spark`` did not exist yet.
    session = spark if SparkSession is None else SparkSession
    # Forward ``n`` so the row limit requested by the caller is honoured.
    return session.sql(query).show(n)

# Example 1: Filtering names from the Database

In [5]:
query1 = """SELECT
            	last_name,
            	first_name
            FROM
            	customer
            WHERE
            	first_name = 'Jamie';"""

In [6]:
show_query(query1)

+---------+----------+
|last_name|first_name|
+---------+----------+
|     Rice|     Jamie|
|    Waugh|     Jamie|
+---------+----------+



# Example 2: Filtering names from the Database with logical operators

In [7]:
query2 = """SELECT
                last_name,
            	first_name
            FROM customer
            WHERE first_name = 'Jamie' AND 
            last_name = 'Rice';"""

In [8]:
show_query(query2)

+---------+----------+
|last_name|first_name|
+---------+----------+
|     Rice|     Jamie|
+---------+----------+



In [9]:
query3 = """SELECT
            	first_name,
            	last_name
            FROM
            	customer
            WHERE
            	last_name = 'Rodriguez' OR 
            	first_name = 'Adam';"""

In [10]:
show_query(query3)

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Laura|Rodriguez|
|      Adam|    Gooch|
+----------+---------+



# Example 3: Filtering names from the Database with the IN operator

In [11]:
query4 = """SELECT
	first_name,
	last_name
FROM
	customer
WHERE 
	first_name IN ('Ann','Anne','Annie');"""

In [12]:
show_query(query4)

+----------+---------+
|first_name|last_name|
+----------+---------+
|       Ann|    Evans|
|      Anne|   Powell|
|     Annie|  Russell|
+----------+---------+



In [13]:
query5 = """SELECT
	first_name,
	last_name
FROM
	customer
WHERE 
	first_name LIKE 'Ann%'"""

In [14]:
show_query(query5)

+----------+---------+
|first_name|last_name|
+----------+---------+
|      Anna|     Hill|
|       Ann|    Evans|
|      Anne|   Powell|
|     Annie|  Russell|
|   Annette|    Olson|
+----------+---------+



# Example 4: combining wildcards, string functions and BETWEEN clause

In [15]:
query6 = """SELECT
            	first_name,
            	LENGTH(first_name) name_length
            FROM
            	customer
            WHERE 
            	first_name LIKE 'A%' AND
            	LENGTH(first_name) BETWEEN 3 AND 5
            ORDER BY
            	name_length;
            """
show_query(query6)

+----------+-----------+
|first_name|name_length|
+----------+-----------+
|       Amy|          3|
|       Ann|          3|
|       Ana|          3|
|      Anna|          4|
|      Anne|          4|
|      Alma|          4|
|      Adam|          4|
|      Alan|          4|
|      Alex|          4|
|      Andy|          4|
|     Alice|          5|
|     Annie|          5|
|     Anita|          5|
|     Amber|          5|
|     April|          5|
|     Agnes|          5|
|     Aaron|          5|
|     Allen|          5|
|     Alvin|          5|
|     Angel|          5|
+----------+-----------+
only showing top 20 rows



# Example 5: searching with the <> operator

In [18]:
query7a = """SELECT 
                  first_name, 
                  last_name
            FROM customer 
            WHERE first_name LIKE 'Bra%' AND 
        	last_name <> 'Motley';
         """

In [19]:
show_query(query7a)

+----------+---------+
|first_name|last_name|
+----------+---------+
|    Brandy|   Graves|
|   Brandon|     Huey|
|      Brad|  Mccurdy|
+----------+---------+



## != is also accepted

In [21]:
query7b = """SELECT 
                  first_name, 
                  last_name
            FROM customer 
            WHERE first_name LIKE 'Bra%' AND 
        	last_name != 'Motley';
         """

In [22]:
show_query(query7b)

+----------+---------+
|first_name|last_name|
+----------+---------+
|    Brandy|   Graves|
|   Brandon|     Huey|
|      Brad|  Mccurdy|
+----------+---------+



# Example 6: Using LIMIT to constrain the number of results

In [23]:
query8 = """SELECT
                	film_id,
                	title,
                	release_year
            FROM film
            ORDER BY film_id
            LIMIT 5;
        """
show_query(query8)

+-------+----------------+------------+
|film_id|           title|release_year|
+-------+----------------+------------+
|      1|Academy Dinosaur|        2006|
|      2|  Ace Goldfinger|        2006|
|      3|Adaptation Holes|        2006|
|      4|Affair Prejudice|        2006|
|      5|     African Egg|        2006|
+-------+----------------+------------+



# Example 7: Using LIMIT and OFFSET to constrain the results and retrieve them from a specific position

In [24]:
# Uses its own name instead of rebinding `query8` from Example 6, so that
# re-running the Example 6 cell still shows the plain LIMIT result.
# OFFSET 3 skips the first three rows of the ordered result and LIMIT 4 keeps
# the next four.
query8_offset = """SELECT
            	film_id,
            	title,
            	release_year
            FROM
            	film
            ORDER BY
            	film_id
            LIMIT 4
            OFFSET 3;
                    """
show_query(query8_offset)

+-------+----------------+------------+
|film_id|           title|release_year|
+-------+----------------+------------+
|      4|Affair Prejudice|        2006|
|      5|     African Egg|        2006|
|      6|    Agent Truman|        2006|
|      7| Airplane Sierra|        2006|
+-------+----------------+------------+



# Example 8: Using ORDER BY ... DESC with LIMIT to get the tail of the sorted table

In [30]:
query9 = """SELECT
                film_id,
                title,
                rental_rate
            FROM film
            ORDER BY rental_rate DESC
            LIMIT 10;"""
show_query(query9)

+-------+-------------------+-----------+
|film_id|              title|rental_rate|
+-------+-------------------+-----------+
|      7|    Airplane Sierra|       4.99|
|    384|   Grosse Wonderful|       4.99|
|      8|    Airport Pollock|       4.99|
|     98|  Bright Encounters|       4.99|
|      2|     Ace Goldfinger|       4.99|
|    133|    Chamber Italian|       4.99|
|     10|   Aladdin Calendar|       4.99|
|     13|        Ali Forever|       4.99|
|     20|Amelie Hellfighters|       4.99|
|     21|    American Circus|       4.99|
+-------+-------------------+-----------+



# More IN operator examples

### Most common case

In [32]:
query10 = """SELECT customer_id,
                	rental_id,
                	return_date
                FROM
                	rental
                WHERE
                	customer_id IN (1, 2)
                ORDER BY
                	return_date DESC;
                 """

In [33]:
show_query(query10)

+-----------+---------+-------------------+
|customer_id|rental_id|        return_date|
+-----------+---------+-------------------+
|          2|    15145|2005-08-31 15:51:04|
|          1|    15315|2005-08-30 01:51:46|
|          2|    14743|2005-08-29 00:18:56|
|          1|    15298|2005-08-28 22:49:37|
|          2|    14475|2005-08-27 08:59:32|
|          1|    14825|2005-08-27 07:01:57|
|          2|    15907|2005-08-25 23:23:35|
|          2|    12963|2005-08-23 11:37:04|
|          1|    13176|2005-08-23 08:50:54|
|          1|    14762|2005-08-23 01:30:57|
|          1|    12250|2005-08-22 23:05:29|
|          1|    13068|2005-08-20 14:44:16|
|          2|    11614|2005-08-20 07:04:18|
|          1|    11824|2005-08-19 10:11:54|
|          1|    11299|2005-08-10 16:40:52|
|          1|    10437|2005-08-10 12:12:04|
|          2|    11177|2005-08-10 10:55:48|
|          2|    11087|2005-08-10 10:37:41|
|          2|     9236|2005-08-08 18:52:43|
|          2|     9296|2005-08-0

### What IN does in the background

In [36]:
query11 = """SELECT
                	rental_id,
                	customer_id,
                	return_date
              FROM
                	rental
              WHERE
                	customer_id = 1 OR customer_id = 2
              ORDER BY
                	return_date DESC;"""

In [37]:
show_query(query11)

+---------+-----------+-------------------+
|rental_id|customer_id|        return_date|
+---------+-----------+-------------------+
|    15145|          2|2005-08-31 15:51:04|
|    15315|          1|2005-08-30 01:51:46|
|    14743|          2|2005-08-29 00:18:56|
|    15298|          1|2005-08-28 22:49:37|
|    14475|          2|2005-08-27 08:59:32|
|    14825|          1|2005-08-27 07:01:57|
|    15907|          2|2005-08-25 23:23:35|
|    12963|          2|2005-08-23 11:37:04|
|    13176|          1|2005-08-23 08:50:54|
|    14762|          1|2005-08-23 01:30:57|
|    12250|          1|2005-08-22 23:05:29|
|    13068|          1|2005-08-20 14:44:16|
|    11614|          2|2005-08-20 07:04:18|
|    11824|          1|2005-08-19 10:11:54|
|    11299|          1|2005-08-10 16:40:52|
|    10437|          1|2005-08-10 12:12:04|
|    11177|          2|2005-08-10 10:55:48|
|    11087|          2|2005-08-10 10:37:41|
|     9236|          2|2005-08-08 18:52:43|
|     9296|          2|2005-08-0

### IN is compatible with NOT

In [38]:
query12 =  """SELECT
                	customer_id,
                	rental_id,
                	return_date
              FROM
                	rental
              WHERE
                	customer_id NOT IN (1, 2);
"""
show_query(query12)

+-----------+---------+-------------------+
|customer_id|rental_id|        return_date|
+-----------+---------+-------------------+
|        459|        2|2005-05-28 19:40:33|
|        408|        3|2005-06-01 22:12:39|
|        333|        4|2005-06-03 01:43:41|
|        222|        5|2005-06-02 04:33:21|
|        549|        6|2005-05-27 01:32:07|
|        269|        7|2005-05-29 20:34:53|
|        239|        8|2005-05-27 23:33:46|
|        126|        9|2005-05-28 00:22:40|
|        399|       10|2005-05-31 22:44:21|
|        142|       11|2005-06-02 20:56:02|
|        261|       12|2005-05-30 05:44:27|
|        334|       13|2005-05-30 04:28:55|
|        446|       14|2005-05-26 02:56:15|
|        319|       15|2005-06-03 03:30:22|
|        316|       16|2005-05-26 04:42:11|
|        575|       17|2005-05-27 00:43:36|
|         19|       18|2005-05-31 06:35:47|
|        456|       19|2005-05-31 06:00:24|
|        185|       20|2005-05-27 02:20:41|
|        388|       21|2005-05-2

### Equivalent of NOT IN written with <> and AND. This form scales poorly as the list of values grows

In [39]:
query13 = """SELECT
            	customer_id,
            	rental_id,
            	return_date
            FROM
            	rental
            WHERE
            	customer_id <> 1
            AND customer_id <> 2;"""
show_query(query13)

+-----------+---------+-------------------+
|customer_id|rental_id|        return_date|
+-----------+---------+-------------------+
|        459|        2|2005-05-28 19:40:33|
|        408|        3|2005-06-01 22:12:39|
|        333|        4|2005-06-03 01:43:41|
|        222|        5|2005-06-02 04:33:21|
|        549|        6|2005-05-27 01:32:07|
|        269|        7|2005-05-29 20:34:53|
|        239|        8|2005-05-27 23:33:46|
|        126|        9|2005-05-28 00:22:40|
|        399|       10|2005-05-31 22:44:21|
|        142|       11|2005-06-02 20:56:02|
|        261|       12|2005-05-30 05:44:27|
|        334|       13|2005-05-30 04:28:55|
|        446|       14|2005-05-26 02:56:15|
|        319|       15|2005-06-03 03:30:22|
|        316|       16|2005-05-26 04:42:11|
|        575|       17|2005-05-27 00:43:36|
|         19|       18|2005-05-31 06:35:47|
|        456|       19|2005-05-31 06:00:24|
|        185|       20|2005-05-27 02:20:41|
|        388|       21|2005-05-2

### Extra: using IN with a subquery

In [41]:
query14 = """SELECT
                	customer_id,
                	first_name,
                	last_name
             FROM
                	customer
             WHERE
                	customer_id IN (
                		SELECT customer_id
                		FROM rental
                		WHERE CAST (return_date AS DATE) = '2005-05-27'
                	)
             ORDER BY customer_id;"""
show_query(query14)

+-----------+----------+----------+
|customer_id|first_name| last_name|
+-----------+----------+----------+
|         37|    Pamela|     Baker|
|         47|   Frances|    Parker|
|         48|       Ann|     Evans|
|         65|      Rose|    Howard|
|         73|   Beverly|    Brooks|
|         75|     Tammy|   Sanders|
|         93|   Phyllis|    Foster|
|        114|     Grace|     Ellis|
|        119|    Sherry|  Marshall|
|        131|    Monica|     Hicks|
|        158|  Veronica|     Stone|
|        167|     Sally|    Pierce|
|        182|     Renee|      Lane|
|        184|    Vivian|      Ruiz|
|        185|   Roberta|    Harper|
|        211|    Stacey|Montgomery|
|        239|    Minnie|    Romero|
|        247|    Stella|    Moreno|
|        251|    Vickie|    Brewer|
|        256|     Mabel|   Holland|
+-----------+----------+----------+
only showing top 20 rows



# More BETWEEN operator examples

### Classic use

In [42]:
query15 = """SELECT
                	customer_id,
                	payment_id,
                	amount
             FROM
                	payment
             WHERE
                	amount BETWEEN 8 AND 9;"""
show_query(query15)

+-----------+----------+------+
|customer_id|payment_id|amount|
+-----------+----------+------+
|        343|     17517|  8.99|
|        347|     17529|  8.99|
|        347|     17532|  8.99|
|        348|     17535|  8.99|
|        349|     17540|  8.99|
|        379|     17648|  8.99|
|        403|     17747|  8.99|
|        409|     17775|  8.99|
|        423|     17817|  8.99|
|        431|     17853|  8.99|
|        442|     17886|  8.99|
|        465|     17990|  8.99|
|        466|     17993|  8.99|
|        467|     17997|  8.99|
|        468|     18002|  8.99|
|        474|     18027|  8.99|
|        478|     18040|  8.99|
|        483|     18059|  8.99|
|        485|     18065|  8.99|
|        497|     18099|  8.99|
+-----------+----------+------+
only showing top 20 rows



### BETWEEN is also compatible with NOT

In [43]:
# Uses its own name instead of rebinding `query15` from the "Classic use"
# cell, so both queries stay available regardless of cell execution order.
# NOT BETWEEN keeps the rows whose amount falls outside the 8-9 range.
query15_not_between = """SELECT
                	customer_id,
                	payment_id,
                	amount
             FROM
                	payment
             WHERE
                	amount NOT BETWEEN 8 AND 9;"""
show_query(query15_not_between)

+-----------+----------+------+
|customer_id|payment_id|amount|
+-----------+----------+------+
|        341|     17503|  7.99|
|        341|     17504|  1.99|
|        341|     17505|  7.99|
|        341|     17506|  2.99|
|        341|     17507|  7.99|
|        341|     17508|  5.99|
|        342|     17509|  5.99|
|        342|     17510|  5.99|
|        342|     17511|  2.99|
|        343|     17512|  4.99|
|        343|     17513|  6.99|
|        343|     17514|  0.99|
|        343|     17515|  0.99|
|        343|     17516|  6.99|
|        343|     17518|  0.99|
|        344|     17519|  3.99|
|        344|     17520|  4.99|
|        344|     17521|  0.99|
|        345|     17522|  0.99|
|        345|     17523|  4.99|
+-----------+----------+------+
only showing top 20 rows



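### Using BETWEEN to query dates. Date literals must follow the ISO 8601 format, i.e., YYYY-MM-DD.

Note: if `payment_date` also stores a time of day (as the output below suggests), the upper bound `'2007-02-15'` stands for the very start of that day, so payments made later on 2007-02-15 are not returned.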

In [44]:
query16 =  """SELECT
                	customer_id,
                	payment_id,
                	amount,
                    payment_date
              FROM payment
              WHERE payment_date BETWEEN '2007-02-07' AND '2007-02-15';
              """
show_query(query16)

+-----------+----------+------+--------------------+
|customer_id|payment_id|amount|        payment_date|
+-----------+----------+------+--------------------+
|        368|     17610|  0.99|2007-02-14 23:25:...|
|        370|     17617|  6.99|2007-02-14 23:33:...|
|        402|     17743|  4.99|2007-02-14 23:53:...|
|        416|     17793|  2.99|2007-02-14 21:21:...|
|        432|     17854|  5.99|2007-02-14 23:07:...|
|        481|     18051|  2.99|2007-02-14 22:03:...|
|        512|     18155|  6.99|2007-02-14 22:57:...|
|        516|     18173|  4.99|2007-02-14 21:23:...|
|        546|     18276|  1.99|2007-02-14 23:10:...|
|        561|     18322|  2.99|2007-02-14 23:52:...|
|        592|     18441|  6.99|2007-02-14 21:41:...|
|        595|     18456|  2.99|2007-02-14 22:16:...|
|          1|     18495|  5.99|2007-02-14 23:22:...|
|         46|     18686|  4.99|2007-02-14 21:45:...|
|         49|     18698|  0.99|2007-02-14 21:44:...|
|         95|     18870|  2.99|2007-02-14 22:4

# More LIKE examples

### Like wildcards

```
SELECT
	'foo' LIKE 'foo', -- true
	'foo' LIKE 'f%', -- true
	'foo' LIKE '_o_', -- true
	'bar' LIKE 'b_'; -- false
```

In [45]:
query17 =  """SELECT
                	first_name,
                    last_name
              FROM
                	customer
              WHERE
                	first_name LIKE '%er%'
              ORDER BY 
                    first_name;
              """
show_query(query17)

+-----------+---------+
| first_name|last_name|
+-----------+---------+
|     Albert|   Crouse|
|    Alberto|  Henning|
|  Alexander|  Fennell|
|      Amber|    Dixon|
|    Bernard|    Colby|
|    Bernice|   Willis|
|     Bertha| Ferguson|
|    Beverly|   Brooks|
|  Catherine| Campbell|
|     Cheryl|   Murphy|
|    Chester|   Benner|
|Christopher|    Greco|
|      Derek|  Blakely|
|    Derrick|  Bourque|
|      Elmer|      Noe|
|     Esther| Crawford|
|    Everett|    Banda|
|   Fernando|Churchill|
|  Frederick|   Isbell|
|     Gerald|    Fultz|
+-----------+---------+
only showing top 20 rows



### Valid use cases

In [46]:
# NOTE(review): this query is identical to query17 (same '%er%' pattern and
# same ORDER BY) — presumably a different LIKE pattern was intended for this
# section; confirm and replace.
# '%er%' matches first names that contain "er" at any position.
query18 =  """SELECT
                	first_name,
                    last_name
              FROM
                	customer
              WHERE
                	first_name LIKE '%er%'
              ORDER BY 
                    first_name;
              """
show_query(query18)

+-----------+---------+
| first_name|last_name|
+-----------+---------+
|     Albert|   Crouse|
|    Alberto|  Henning|
|  Alexander|  Fennell|
|      Amber|    Dixon|
|    Bernard|    Colby|
|    Bernice|   Willis|
|     Bertha| Ferguson|
|    Beverly|   Brooks|
|  Catherine| Campbell|
|     Cheryl|   Murphy|
|    Chester|   Benner|
|Christopher|    Greco|
|      Derek|  Blakely|
|    Derrick|  Bourque|
|      Elmer|      Noe|
|     Esther| Crawford|
|    Everett|    Banda|
|   Fernando|Churchill|
|  Frederick|   Isbell|
|     Gerald|    Fultz|
+-----------+---------+
only showing top 20 rows



In [47]:
query19 =  """SELECT
            	first_name,
            	last_name
              FROM
            	customer
              WHERE
            	first_name LIKE '_her%'
              ORDER BY 
                first_name;
              """
show_query(query19)

+----------+---------+
|first_name|last_name|
+----------+---------+
|    Cheryl|   Murphy|
|    Sherri|   Rhodes|
|    Sherry| Marshall|
|   Theresa|   Watson|
+----------+---------+



### ILIKE: case Insensitive LIKE

In [48]:
query20 =  """SELECT
	                first_name,
	                last_name
              FROM customer
              WHERE first_name
              ILIKE 'BAR%';
           """
show_query(query20)

+----------+---------+
|first_name|last_name|
+----------+---------+
|   Barbara|    Jones|
|     Barry| Lovelace|
+----------+---------+



# Example 9: IS NULL

### Why use IS NULL? Because comparing anything with NULL using `=` yields NULL (unknown), never true — even `NULL = NULL`!

#### Creating mock data

In [110]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [115]:
# Column name / Spark type pairs for the mock contacts table, in column order.
_contact_fields = [
    ("id", IntegerType()),
    ("first_name", StringType()),
    ("last_name", StringType()),
    ("email", StringType()),
    ("phone", StringType()),
]

# Every field is created with True as the third StructField argument
# (the nullable flag), so the missing phone number below is allowed.
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _contact_fields]
)

# Two sample rows; the first one deliberately has no phone number.
contacts = [
    (1, "John", "Doe", "john.doe@example.com", None),
    (2, "Lily", "Bush", "lily.bush@example.com", "(408-234-2764)"),
]

columns = [field_name for field_name, _ in _contact_fields]

# Distribute the rows as an RDD; a later cell builds the DataFrame from it.
rdd = spark.sparkContext.parallelize(contacts)

In [116]:
df_spark = spark.createDataFrame(rdd, schema = schema)

In [117]:
df_spark.createOrReplaceTempView('contacts')

### Checking if NULL equals NULL

In [119]:
query21 = """SELECT
    id,
    first_name,
    last_name,
    email,
    phone
FROM
    contacts
WHERE
    phone = NULL;
"""

In [120]:
show_query(query21)

+---+----------+---------+-----+-----+
| id|first_name|last_name|email|phone|
+---+----------+---------+-----+-----+
+---+----------+---------+-----+-----+



### Using IS NULL

In [121]:
query22 = """SELECT
    id,
    first_name,
    last_name,
    email,
    phone
FROM
    contacts
WHERE
    phone IS NULL;
"""

In [122]:
show_query(query22)

+---+----------+---------+--------------------+-----+
| id|first_name|last_name|               email|phone|
+---+----------+---------+--------------------+-----+
|  1|      John|      Doe|john.doe@example.com| null|
+---+----------+---------+--------------------+-----+



In [125]:
query23 = """SELECT
                    id,
                    first_name,
                    last_name,
                    email,
                    phone
              FROM
                    contacts
              WHERE
                    phone IS NOT NULL;"""

In [126]:
show_query(query23)

+---+----------+---------+--------------------+--------------+
| id|first_name|last_name|               email|         phone|
+---+----------+---------+--------------------+--------------+
|  2|      Lily|     Bush|lily.bush@example...|(408-234-2764)|
+---+----------+---------+--------------------+--------------+



## Exercises

1) Lord Zedd hired you to find his enemies in this universe. He knows that each target kept either their first name or their last name from the other universe and changed the other one. On his Earth, their names were:
 - Jason Lee Scott
 - Zack Taylor
 - Trini Kwan
 - Kimberly Hart
 - Billy Cranston
 - Tommy Oliver
   
Give Lord Zedd a list of the potential targets. Is any of them deceased?

2) Redo ALL of the above, including exercise 1, using the PySpark SQL API instead of raw SQL strings: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/index.html