# Connecting to the database

### Note: the container hosting the dvdrental database must be running and reachable over the network (or an equivalent connection)

In [1]:
from db_tools.setup import setup

### Getting a SparkSession 

In [2]:
# Create the session object through the project's setup() helper; the cell
# output reports it as a SparkSession available under the name "spark".
spark = setup()

23/10/21 15:04:33 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.10 instead (on interface wlp0s20f3)
23/10/21 15:04:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/10/21 15:04:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SparkSession available as "spark"


In [9]:
def show_query(query, SparkSession=None, n=20):
    """Run a SQL query and print the first ``n`` rows of its result.

    Parameters
    ----------
    query : str
        SQL text handed to ``SparkSession.sql``.
    SparkSession : optional
        Session object used to run the query. When omitted (``None``), the
        notebook-level ``spark`` variable is looked up at call time, so a
        session that was re-created after this cell ran is still picked up.
    n : int, default 20
        Number of rows to print; forwarded to ``.show``.

    Returns
    -------
    The value returned by ``.show(n)``; the table itself is printed as a
    side effect.
    """
    if SparkSession is None:
        # Late lookup: a default of `spark` in the signature would be frozen
        # at definition time and could point at a stale session.
        SparkSession = spark
    return SparkSession.sql(query).show(n)

# Section 1. Querying Data.
### The ERD of the DVD rental database is available in the support_material folder

## Example 1: getting customer first name, last name and email from the database

In [13]:
# Project only the first name, last name and email columns of the customer table.
query1 = """SELECT
               first_name,
               last_name,
               email
            FROM
           customer;"""

In [14]:
# Run query1 and print the resulting table.
show_query(query1)

+----------+---------+--------------------+
|first_name|last_name|               email|
+----------+---------+--------------------+
|     Jared|      Ely|jared.ely@sakilac...|
|      Mary|    Smith|mary.smith@sakila...|
|  Patricia|  Johnson|patricia.johnson@...|
|     Linda| Williams|linda.williams@sa...|
|   Barbara|    Jones|barbara.jones@sak...|
| Elizabeth|    Brown|elizabeth.brown@s...|
|  Jennifer|    Davis|jennifer.davis@sa...|
|     Maria|   Miller|maria.miller@saki...|
|     Susan|   Wilson|susan.wilson@saki...|
|  Margaret|    Moore|margaret.moore@sa...|
|   Dorothy|   Taylor|dorothy.taylor@sa...|
|      Lisa| Anderson|lisa.anderson@sak...|
|     Nancy|   Thomas|nancy.thomas@saki...|
|     Karen|  Jackson|karen.jackson@sak...|
|     Betty|    White|betty.white@sakil...|
|     Helen|   Harris|helen.harris@saki...|
|    Sandra|   Martin|sandra.martin@sak...|
|     Donna| Thompson|donna.thompson@sa...|
|     Carol|   Garcia|carol.garcia@saki...|
|      Ruth| Martinez|ruth.marti

## Example 2: getting customer full name and email from the database. Note the columns!

In [17]:
# `||` concatenates strings, so first and last name come back as one column.
# No alias is given here, so the engine chooses the column name itself.
query2 = """SELECT
               first_name ||' '|| last_name,
               email
            FROM
           customer;"""

In [16]:
# Run query2; the first column's header is the auto-generated concat(...) expression.
show_query(query2)

+----------------------------------------+--------------------+
|concat(concat(first_name,  ), last_name)|               email|
+----------------------------------------+--------------------+
|                               Jared Ely|jared.ely@sakilac...|
|                              Mary Smith|mary.smith@sakila...|
|                        Patricia Johnson|patricia.johnson@...|
|                          Linda Williams|linda.williams@sa...|
|                           Barbara Jones|barbara.jones@sak...|
|                         Elizabeth Brown|elizabeth.brown@s...|
|                          Jennifer Davis|jennifer.davis@sa...|
|                            Maria Miller|maria.miller@saki...|
|                            Susan Wilson|susan.wilson@saki...|
|                          Margaret Moore|margaret.moore@sa...|
|                          Dorothy Taylor|dorothy.taylor@sa...|
|                           Lisa Anderson|lisa.anderson@sak...|
|                            Nancy Thoma

## Example 3: using an alias (AS)

In [22]:
# Concatenate first and last name and give the computed column a name with AS.
# NOTE(review): the alias reuses the existing column name `last_name` although
# the value is the full name; a distinct alias such as `full_name` would be
# less confusing — confirm before changing, the recorded output shows this header.
query3 = """SELECT
               first_name ||' '|| last_name AS last_name,
               email
            FROM
           customer;"""

In [23]:
# Run query3; the computed column now appears under its alias.
show_query(query3)

+----------------+--------------------+
|       last_name|               email|
+----------------+--------------------+
|       Jared Ely|jared.ely@sakilac...|
|      Mary Smith|mary.smith@sakila...|
|Patricia Johnson|patricia.johnson@...|
|  Linda Williams|linda.williams@sa...|
|   Barbara Jones|barbara.jones@sak...|
| Elizabeth Brown|elizabeth.brown@s...|
|  Jennifer Davis|jennifer.davis@sa...|
|    Maria Miller|maria.miller@saki...|
|    Susan Wilson|susan.wilson@saki...|
|  Margaret Moore|margaret.moore@sa...|
|  Dorothy Taylor|dorothy.taylor@sa...|
|   Lisa Anderson|lisa.anderson@sak...|
|    Nancy Thomas|nancy.thomas@saki...|
|   Karen Jackson|karen.jackson@sak...|
|     Betty White|betty.white@sakil...|
|    Helen Harris|helen.harris@saki...|
|   Sandra Martin|sandra.martin@sak...|
|  Donna Thompson|donna.thompson@sa...|
|    Carol Garcia|carol.garcia@saki...|
|   Ruth Martinez|ruth.martinez@sak...|
+----------------+--------------------+
only showing top 20 rows



## Extra: SQL works as a calculator

In [26]:
# A SELECT with no FROM clause: evaluates the arithmetic expression and names
# the resulting column RESULT.
query4 = "SELECT 10*10 AS RESULT"

In [27]:
# Run the arithmetic query; the result is a single row.
show_query(query4)

+------+
|RESULT|
+------+
|   100|
+------+



## Example 4: Sorting rows with ORDER BY

In [47]:
# Customers sorted by first name in ascending order; the SQL is passed inline.
show_query(
    query="""SELECT
            	first_name,
            	last_name
              FROM customer
              ORDER BY
	          first_name ASC;"""
)

+----------+-----------+
|first_name|  last_name|
+----------+-----------+
|     Aaron|      Selby|
|      Adam|      Gooch|
|    Adrian|      Clary|
|     Agnes|     Bishop|
|      Alan|       Kahn|
|    Albert|     Crouse|
|   Alberto|    Henning|
|      Alex|    Gresham|
| Alexander|    Fennell|
|    Alfred|   Casillas|
|   Alfredo|    Mcadams|
|     Alice|    Stewart|
|    Alicia|      Mills|
|     Allan|    Cornish|
|     Allen|Butterfield|
|   Allison|    Stanley|
|      Alma|     Austin|
|     Alvin|    Deloach|
|    Amanda|     Carter|
|     Amber|      Dixon|
+----------+-----------+
only showing top 20 rows



## Example 5: Ordering by last name in descending order

In [50]:
# Sort customers by last name, Z to A (DESC).
query5 = '''SELECT first_name,
                  last_name
           FROM customer
           ORDER BY last_name DESC;'''

In [51]:
# Run query5 and print the sorted table.
show_query(query5)

+----------+------------+
|first_name|   last_name|
+----------+------------+
|   Cynthia|       Young|
|    Marvin|         Yee|
|      Luis|       Yanez|
|     Brian|       Wyman|
|    Brenda|      Wright|
|     Tyler|        Wren|
|  Florence|       Woods|
|      Lori|        Wood|
|    Virgil|     Wofford|
|    Darren|     Windham|
|     Susan|      Wilson|
|   Bernice|      Willis|
|      Gina|  Williamson|
|     Linda|    Williams|
|       Jon|       Wiles|
|       Roy|     Whiting|
|     Betty|       White|
|      Lucy|     Wheeler|
|      Fred|       Wheat|
|  Mitchell|Westmoreland|
+----------+------------+
only showing top 20 rows



## Example 6: ORDER BY with multiple columns

In [56]:
# Two sort keys: first_name ascending, then last_name descending as the
# tie-breaker for rows that share a first name.
query6= """SELECT
            	first_name,
            	last_name
           FROM customer
           ORDER BY
           first_name ASC,
           last_name DESC;
        """

In [57]:
# Run query6 and print the table sorted by both keys.
show_query(query6)

+----------+-----------+
|first_name|  last_name|
+----------+-----------+
|     Aaron|      Selby|
|      Adam|      Gooch|
|    Adrian|      Clary|
|     Agnes|     Bishop|
|      Alan|       Kahn|
|    Albert|     Crouse|
|   Alberto|    Henning|
|      Alex|    Gresham|
| Alexander|    Fennell|
|    Alfred|   Casillas|
|   Alfredo|    Mcadams|
|     Alice|    Stewart|
|    Alicia|      Mills|
|     Allan|    Cornish|
|     Allen|Butterfield|
|   Allison|    Stanley|
|      Alma|     Austin|
|     Alvin|    Deloach|
|    Amanda|     Carter|
|     Amber|      Dixon|
+----------+-----------+
only showing top 20 rows



## Example 7: Ordering names by their length

In [64]:
# LENGTH(first_name) is exposed as first_name_len, and that alias is reused in
# ORDER BY to list the longest first names first.
query7 = """SELECT 
                	first_name,
                    last_name,
                	LENGTH(first_name) AS first_name_len
            FROM customer
            ORDER BY first_name_len DESC;"""
# Defined and executed in the same cell.
show_query(query7)

+-----------+---------+--------------+
| first_name|last_name|first_name_len|
+-----------+---------+--------------+
|Christopher|    Greco|            11|
| Jacqueline|     Long|            10|
|  Charlotte|   Hunter|             9|
|  Stephanie| Mitchell|             9|
|  Christine|  Roberts|             9|
|  Catherine| Campbell|             9|
|  Katherine|   Rivera|             9|
|  Christina|  Ramirez|             9|
|  Elizabeth|    Brown|             9|
|  Josephine|    Gomez|             9|
|  Geraldine|  Perkins|             9|
|  Constance|     Reid|             9|
|  Gwendolyn|      May|             9|
|  Cassandra|  Walters|             9|
|  Priscilla|     Lowe|             9|
|  Frederick|   Isbell|             9|
|  Alexander|  Fennell|             9|
|  Francisco| Skidmore|             9|
|  Nathaniel|     Adam|             9|
|  Christian|     Jung|             9|
+-----------+---------+--------------+
only showing top 20 rows



## Note: depending on your needs, you can control where NULL values appear in the ordering by using NULLS FIRST or NULLS LAST

## Example 8: Select Distinct

In [67]:
# DISTINCT collapses duplicate last names so each value is returned once.
query8 = "SELECT DISTINCT last_name FROM customer"

In [68]:
# Run query8 and print the distinct last names.
show_query(query8)

+-----------+
|  last_name|
+-----------+
|     Easter|
|    Wofford|
|    Currier|
|   Harrison|
|     Porter|
|     Wilson|
|      Scott|
|  Robertson|
|     Fraley|
|      Slone|
|    Malcolm|
|    Griffin|
|     Castro|
|      Lucas|
|       Pena|
|     Gruber|
|     Isbell|
|      Lyman|
|      Abney|
|Christenson|
+-----------+
only showing top 20 rows



In [None]:
# The lookahead (?=@) makes the pattern match everything before the '@'
# without consuming it; group 0 (the whole match) is returned as user_name.
query9 = (
    "SELECT first_name,last_name,"
    "regexp_extract(email,r'.*(?=@)',0) AS user_name "
    "FROM customer"
)

In [None]:
# Run query9 and print each customer with the extracted user_name column.
show_query(query9)

In [None]:
# The lookbehind (?<=@) makes the pattern match everything after the '@';
# group 0 (the whole match) is returned as domain.
query10 = (
    "SELECT first_name,last_name,"
    "regexp_extract(email,r'(?<=@).*',0) AS domain "
    "FROM customer"
)

In [None]:
# Run query10 and print each customer with the extracted domain column.
show_query(query10)

## Exercises

## 1) Which customer has the greatest last name?

## 2) Which customer has the greatest full name?

## Challenge: extract the user name (e.g. `name.last_name`) from the e-mail using _pure_ SQL