In [73]:
from db_tools.setup import setup
from pyspark.sql import functions as F
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [74]:
from pyspark.sql.dataframe import DataFrame

In [2]:
spark = setup()

23/10/22 19:38:11 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/10/22 19:38:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/10/22 19:38:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/22 19:38:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


SparkSession available as "spark"


In [3]:
def show_query(query,SparkSession = spark,n = 20):
    return SparkSession.sql(query).show()

# Exercises Answers

## 1 - pure SQL

In [4]:
query =  """SELECT
                	first_name,
                    last_name
              FROM
                	customer
              WHERE
                	first_name IN ('Jason',
                                   'Zack',
                                   'Trini',
                                   'Kimberly',
                                   'Billy',
                                   'Tommy')
                    OR last_name IN ('Lee','Scott','Kwan','Hart','Cranston','Oliver')
              ORDER BY 
                    first_name;
              """
show_query(query)

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Billy|   Poulin|
|      Dana|     Hart|
|      Ella|   Oliver|
|     Jason|Morrissey|
|  Kimberly|      Lee|
|   Rebecca|    Scott|
|     Tommy|  Collazo|
+----------+---------+



## 2 - redoing all with PysparkSQL API

### Example 1: Filtering names from the Database

### Getting the table

In [5]:
df = spark.sql("SELECT * FROM customer")

### First form

In [6]:
df.select(*['first_name','last_name']).filter("first_name = 'Jamie'").show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Jamie|     Rice|
|     Jamie|    Waugh|
+----------+---------+



### Alternative version

In [7]:
df.select(*['first_name','last_name']).filter(df["first_name"] == 'Jamie').show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Jamie|     Rice|
|     Jamie|    Waugh|
+----------+---------+



### Where is a filter alias

In [8]:
df.select(*['first_name','last_name']).where(df["first_name"] == 'Jamie').show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Jamie|     Rice|
|     Jamie|    Waugh|
+----------+---------+



## Example 2 with variants

In [9]:
df.select(*['first_name','last_name']).filter("first_name = 'Jamie' AND last_name = 'Rice' ").show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Jamie|     Rice|
+----------+---------+



In [10]:
df.select(*['first_name','last_name']).filter("last_name = 'Rodriguez' OR first_name = 'Adam'").show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Laura|Rodriguez|
|      Adam|    Gooch|
+----------+---------+



In [11]:
df.select(*['first_name','last_name']).filter((df["first_name"] == 'Jamie') & (df["last_name"] == 'Rice')).show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Jamie|     Rice|
+----------+---------+



In [12]:
df.select(*['first_name','last_name']).filter((df["first_name"] == 'Adam') | (df["last_name"] == 'Rodriguez')).show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Laura|Rodriguez|
|      Adam|    Gooch|
+----------+---------+



## Example 3 with variants

In [13]:
df.select(*['first_name','last_name']).filter("first_name  IN ('Ann','Anne','Annie')").show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|       Ann|    Evans|
|      Anne|   Powell|
|     Annie|  Russell|
+----------+---------+



In [14]:
df.select(*['first_name','last_name']).filter(df["first_name"].isin(['Ann','Anne','Annie'])).show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|       Ann|    Evans|
|      Anne|   Powell|
|     Annie|  Russell|
+----------+---------+



### Like

In [15]:
df.select(*['first_name','last_name']).filter(df["first_name"].like("Ann%")).show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|      Anna|     Hill|
|       Ann|    Evans|
|      Anne|   Powell|
|     Annie|  Russell|
|   Annette|    Olson|
+----------+---------+



In [16]:
df.select(*['first_name','last_name']).filter("first_name LIKE 'Ann%'").show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|      Anna|     Hill|
|       Ann|    Evans|
|      Anne|   Powell|
|     Annie|  Russell|
|   Annette|    Olson|
+----------+---------+



## Example 4: combining wildcards, string functions and BETWEEN clause

In [17]:
df.select('first_name',F.length('first_name')\
   .alias('name_length'))\
   .filter((F.col("first_name")\
   .like("A%")) & (F.col('name_length')\
   .between(3,5))).orderBy(F.col('name_length')).show()

+----------+-----------+
|first_name|name_length|
+----------+-----------+
|       Amy|          3|
|       Ann|          3|
|       Ana|          3|
|      Anna|          4|
|      Anne|          4|
|      Alma|          4|
|      Adam|          4|
|      Alan|          4|
|      Alex|          4|
|      Andy|          4|
|     Alice|          5|
|     Annie|          5|
|     Anita|          5|
|     Amber|          5|
|     April|          5|
|     Agnes|          5|
|     Aaron|          5|
|     Allen|          5|
|     Alvin|          5|
|     Angel|          5|
+----------+-----------+
only showing top 20 rows



## Example 5

In [18]:
df.select('first_name','last_name')\
   .filter((F.col("first_name")\
   .like("Bra%")) & (~F.col('last_name')\
   .like("Motley"))).show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|    Brandy|   Graves|
|   Brandon|     Huey|
|      Brad|  Mccurdy|
+----------+---------+



## Example 6

In [19]:
df_film = spark.sql("SELECT * FROM film")

In [20]:
df_film.select('film_id','title','release_year').orderBy('film_id').limit(5).show()

+-------+----------------+------------+
|film_id|           title|release_year|
+-------+----------------+------------+
|      1|Academy Dinosaur|        2006|
|      2|  Ace Goldfinger|        2006|
|      3|Adaptation Holes|        2006|
|      4|Affair Prejudice|        2006|
|      5|     African Egg|        2006|
+-------+----------------+------------+



## Example 7

In [21]:
df_film.select('film_id','title','release_year').orderBy('film_id').limit(5).show()

+-------+----------------+------------+
|film_id|           title|release_year|
+-------+----------------+------------+
|      1|Academy Dinosaur|        2006|
|      2|  Ace Goldfinger|        2006|
|      3|Adaptation Holes|        2006|
|      4|Affair Prejudice|        2006|
|      5|     African Egg|        2006|
+-------+----------------+------------+



### Offset is NOT present in pyspark 3.4.1. You can change container parameters to install spark 3.5.0 and pyspark 3.5.0. Here, we will implement offset logic

In [28]:
df_film.select('film_id','title','release_year').orderBy('film_id').limit(5).offset(3)

AttributeError: 'DataFrame' object has no attribute 'offset'

### Implementing OFFSET idea

In [38]:
offset = 10
df_film.select('film_id','title','release_year').orderBy('film_id').filter(f'film_id>{offset}').limit(5).show()

+-------+---------------+------------+
|film_id|          title|release_year|
+-------+---------------+------------+
|     11|Alamo Videotape|        2006|
|     12| Alaska Phantom|        2006|
|     13|    Ali Forever|        2006|
|     14| Alice Fantasia|        2006|
|     15|   Alien Center|        2006|
+-------+---------------+------------+



## Example 8

In [40]:
df_film.select('film_id','title','rental_rate').orderBy('rental_rate',ascending = False).limit(10).show()

+-------+-------------------+-----------+
|film_id|              title|rental_rate|
+-------+-------------------+-----------+
|      7|    Airplane Sierra|       4.99|
|    384|   Grosse Wonderful|       4.99|
|      8|    Airport Pollock|       4.99|
|     98|  Bright Encounters|       4.99|
|      2|     Ace Goldfinger|       4.99|
|    133|    Chamber Italian|       4.99|
|     10|   Aladdin Calendar|       4.99|
|     13|        Ali Forever|       4.99|
|     20|Amelie Hellfighters|       4.99|
|     21|    American Circus|       4.99|
+-------+-------------------+-----------+



## More examples: most are trivial with the above answers

### Non trivial one

In [51]:
df_customer = spark.sql('SELECT * FROM customer')
df_customer.select('customer_id',
                	'first_name',
                	'last_name')\
                    .filter("""customer_id IN (
                		SELECT customer_id
                		FROM rental
                		WHERE CAST (return_date AS DATE) = '2005-05-27')""").orderBy('customer_id').show()

+-----------+----------+----------+
|customer_id|first_name| last_name|
+-----------+----------+----------+
|         37|    Pamela|     Baker|
|         47|   Frances|    Parker|
|         48|       Ann|     Evans|
|         65|      Rose|    Howard|
|         73|   Beverly|    Brooks|
|         75|     Tammy|   Sanders|
|         93|   Phyllis|    Foster|
|        114|     Grace|     Ellis|
|        119|    Sherry|  Marshall|
|        131|    Monica|     Hicks|
|        158|  Veronica|     Stone|
|        167|     Sally|    Pierce|
|        182|     Renee|      Lane|
|        184|    Vivian|      Ruiz|
|        185|   Roberta|    Harper|
|        211|    Stacey|Montgomery|
|        239|    Minnie|    Romero|
|        247|    Stella|    Moreno|
|        251|    Vickie|    Brewer|
|        256|     Mabel|   Holland|
+-----------+----------+----------+
only showing top 20 rows



## Not and IN

In [84]:
df_rental = spark.sql('SELECT * FROM rental')
df_rental.select('customer_id',
                 'rental_id',
                 'return_date').where(~F.col('customer_id').isin([1,2])).show()

+-----------+---------+-------------------+
|customer_id|rental_id|        return_date|
+-----------+---------+-------------------+
|        459|        2|2005-05-28 19:40:33|
|        408|        3|2005-06-01 22:12:39|
|        333|        4|2005-06-03 01:43:41|
|        222|        5|2005-06-02 04:33:21|
|        549|        6|2005-05-27 01:32:07|
|        269|        7|2005-05-29 20:34:53|
|        239|        8|2005-05-27 23:33:46|
|        126|        9|2005-05-28 00:22:40|
|        399|       10|2005-05-31 22:44:21|
|        142|       11|2005-06-02 20:56:02|
|        261|       12|2005-05-30 05:44:27|
|        334|       13|2005-05-30 04:28:55|
|        446|       14|2005-05-26 02:56:15|
|        319|       15|2005-06-03 03:30:22|
|        316|       16|2005-05-26 04:42:11|
|        575|       17|2005-05-27 00:43:36|
|         19|       18|2005-05-31 06:35:47|
|        456|       19|2005-05-31 06:00:24|
|        185|       20|2005-05-27 02:20:41|
|        388|       21|2005-05-2

## Not and BETWEEN

In [86]:
df_payment = spark.sql('SELECT * FROM payment')
df_payment.select('customer_id',
                  'payment_id',
                  'amount').where(~df_payment.amount.between(8,9)).show()

+-----------+----------+------+
|customer_id|payment_id|amount|
+-----------+----------+------+
|        341|     17503|  7.99|
|        341|     17504|  1.99|
|        341|     17505|  7.99|
|        341|     17506|  2.99|
|        341|     17507|  7.99|
|        341|     17508|  5.99|
|        342|     17509|  5.99|
|        342|     17510|  5.99|
|        342|     17511|  2.99|
|        343|     17512|  4.99|
|        343|     17513|  6.99|
|        343|     17514|  0.99|
|        343|     17515|  0.99|
|        343|     17516|  6.99|
|        343|     17518|  0.99|
|        344|     17519|  3.99|
|        344|     17520|  4.99|
|        344|     17521|  0.99|
|        345|     17522|  0.99|
|        345|     17523|  4.99|
+-----------+----------+------+
only showing top 20 rows



## BETWEEN and Dates

In [72]:
spark.sql('SELECT * FROM payment').select(*['customer_id','payment_id','amount','payment_date']).filter(F.col('payment_date').between('2007-02-07','2007-02-15')).show()

+-----------+----------+------+--------------------+
|customer_id|payment_id|amount|        payment_date|
+-----------+----------+------+--------------------+
|        368|     17610|  0.99|2007-02-14 23:25:...|
|        370|     17617|  6.99|2007-02-14 23:33:...|
|        402|     17743|  4.99|2007-02-14 23:53:...|
|        416|     17793|  2.99|2007-02-14 21:21:...|
|        432|     17854|  5.99|2007-02-14 23:07:...|
|        481|     18051|  2.99|2007-02-14 22:03:...|
|        512|     18155|  6.99|2007-02-14 22:57:...|
|        516|     18173|  4.99|2007-02-14 21:23:...|
|        546|     18276|  1.99|2007-02-14 23:10:...|
|        561|     18322|  2.99|2007-02-14 23:52:...|
|        592|     18441|  6.99|2007-02-14 21:41:...|
|        595|     18456|  2.99|2007-02-14 22:16:...|
|          1|     18495|  5.99|2007-02-14 23:22:...|
|         46|     18686|  4.99|2007-02-14 21:45:...|
|         49|     18698|  0.99|2007-02-14 21:44:...|
|         95|     18870|  2.99|2007-02-14 22:4

## IS NULL

In [76]:
schema = StructType([ \
    StructField("id",IntegerType(),True), \
    StructField("first_name",StringType(),True), \
    StructField("last_name",StringType(),True), \
    StructField("email", StringType(), True), \
    StructField("phone", StringType(), True), \
  ])
contacts = [(1,"John","Doe",'john.doe@example.com',None),
            (2,"Lily","Bush",'lily.bush@example.com','(408-234-2764)')]
columns = ['id','first_name','last_name','email','phone']
rdd = spark.sparkContext.parallelize(contacts)
df_spark = spark.createDataFrame(rdd, schema = schema)

### NULL IS NOT equal to NULL

In [81]:
df_spark.select('*').filter(F.col('phone') == None).show()

+---+----------+---------+-----+-----+
| id|first_name|last_name|email|phone|
+---+----------+---------+-----+-----+
+---+----------+---------+-----+-----+



In [79]:
df_spark.select('*').filter(F.col('phone').isNull()).show()

+---+----------+---------+--------------------+-----+
| id|first_name|last_name|               email|phone|
+---+----------+---------+--------------------+-----+
|  1|      John|      Doe|john.doe@example.com| null|
+---+----------+---------+--------------------+-----+



In [80]:
df_spark.select('*').filter(~F.col('phone').isNull()).show()

+---+----------+---------+--------------------+--------------+
| id|first_name|last_name|               email|         phone|
+---+----------+---------+--------------------+--------------+
|  2|      Lily|     Bush|lily.bush@example...|(408-234-2764)|
+---+----------+---------+--------------------+--------------+



## Exercise

In [89]:
df_customer = spark.sql("SELECT * FROM customer")

In [92]:
df_customer.select('first_name','last_name')\
  .filter((F.col('first_name')\
  .isin(['Jason',
         'Zack',
         'Trini',
         'Kimberly',
         'Billy',
         'Tommy'])) |
         (F.col('last_name')\
  .isin(['Lee','Scott','Kwan','Hart','Cranston','Oliver']))).orderBy('first_name').show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     Billy|   Poulin|
|      Dana|     Hart|
|      Ella|   Oliver|
|     Jason|Morrissey|
|  Kimberly|      Lee|
|   Rebecca|    Scott|
|     Tommy|  Collazo|
+----------+---------+

