# Learning Spark - Chapter 4 (Python)
## Spark SQL and DataFrames: Introduction to Built-in Data Sources

In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the SparkSession used by every later cell as `spark`:
# application name "SparkSQLExampleApp", local master, Hive support enabled.
session_builder = SparkSession.builder.appName("SparkSQLExampleApp").master("local")
spark = session_builder.enableHiveSupport().getOrCreate()

## Basic Query Examples

In [2]:
# Relative path to the departure-delay CSV data set read by the cells below
csvFile = "./departuredelays.csv"

In [3]:
# Load the departure-delay CSV into a DataFrame.
# The first line of the file supplies the column names and Spark infers the
# column types (for larger files you may want to specify the schema instead).
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(csvFile)
)

In [4]:
# Register the DataFrame as a temporary view so it can be queried through spark.sql
df.createOrReplaceTempView("us_delay_flights_tbl")

##### Flights whose distance is greater than 1,000 miles

In [5]:
# Flights longer than 1,000 miles, longest first (show() prints the first 20 rows).
long_distance_sql = """SELECT * 
FROM us_delay_flights_tbl 
WHERE distance > 1000 ORDER BY distance DESC"""
spark.sql(long_distance_sql).show()

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011625|   -4|    4330|   HNL|        JFK|
|1211625|  115|    4330|   HNL|        JFK|
|1021625|  110|    4330|   HNL|        JFK|
|1031625|   -1|    4330|   HNL|        JFK|
|1041625|   -7|    4330|   HNL|        JFK|
|1051625|   18|    4330|   HNL|        JFK|
|1061625|    0|    4330|   HNL|        JFK|
|1071625|    0|    4330|   HNL|        JFK|
|1081625|   -5|    4330|   HNL|        JFK|
|1091625|    0|    4330|   HNL|        JFK|
|1101625|   -2|    4330|   HNL|        JFK|
|1111625|  -10|    4330|   HNL|        JFK|
|1121625|    6|    4330|   HNL|        JFK|
|1131625|    0|    4330|   HNL|        JFK|
|1141625|   -1|    4330|   HNL|        JFK|
|1151625|   -9|    4330|   HNL|        JFK|
|1161625|   -8|    4330|   HNL|        JFK|
|1171625|    0|    4330|   HNL|        JFK|
|1181625|   -8|    4330|   HNL|        JFK|
|1191625|  -10|    4330|   HNL| 

##### All flights between San Francisco (SFO) and Chicago (ORD) with at least a two-hour delay

In [6]:
# Flights between SFO and ORD (in either direction) delayed by two hours or more.
# The route test uses the standard SQL `=` operator throughout (the original
# mixed `==` and `=`) and brackets each direction explicitly rather than
# relying on AND binding tighter than OR. The rows selected are unchanged.
spark.sql("""SELECT *
FROM us_delay_flights_tbl
WHERE delay >= 120
AND (
    (origin = 'SFO' AND destination = 'ORD')
    OR (origin = 'ORD' AND destination = 'SFO')
)""").show(50)

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1021430|  132|    1604|   ORD|        SFO|
|1021605|  140|    1604|   ORD|        SFO|
|1021025|  148|    1604|   ORD|        SFO|
|1022015|  122|    1604|   ORD|        SFO|
|1052015|  138|    1604|   ORD|        SFO|
|1061430|  183|    1604|   ORD|        SFO|
|1061900|  155|    1604|   ORD|        SFO|
|1061025|  144|    1604|   ORD|        SFO|
|1301430|  228|    1604|   ORD|        SFO|
|1301605|  187|    1604|   ORD|        SFO|
|1011744|  154|    1604|   ORD|        SFO|
|1011541|  123|    1604|   ORD|        SFO|
|1011310|  130|    1604|   ORD|        SFO|
|1011903|  158|    1604|   ORD|        SFO|
|1020948|  342|    1604|   ORD|        SFO|
|1031744|  167|    1604|   ORD|        SFO|
|1031541|  278|    1604|   ORD|        SFO|
|1041744|  247|    1604|   ORD|        SFO|
|1041310|  159|    1604|   ORD|        SFO|
|1041901|  179|    1604|   ORD| 

##### Label all US flights, regardless of origin and destination, with an indication of the delays they experienced: Very Long Delays (> 6 hours), Long Delays (2–6 hours), etc. We’ll add these human-readable labels in a new column called Flight_Delays

In [7]:
# Label every flight with a human-readable delay category in Flight_Delays.
# CASE branches are tested top to bottom, so each branch only needs its lower
# bound. The previous strict ranges (> 120 AND < 360, > 60 AND < 120,
# > 0 AND < 60) matched none of the delays of exactly 360, 120 or 60 minutes,
# so those rows fell through to ELSE and were labelled 'Early'.
# Buckets now: > 360 very long, 120-360 long, 60-119 short, 1-59 tolerable.
spark.sql("""SELECT delay, origin, destination,
CASE
WHEN delay > 360 THEN 'Very Long Delays'
WHEN delay >= 120 THEN 'Long Delays'
WHEN delay >= 60 THEN 'Short Delays'
WHEN delay > 0 THEN 'Tolerable Delays'
WHEN delay = 0 THEN 'No Delays'
ELSE 'Early'
END AS Flight_Delays
FROM us_delay_flights_tbl
ORDER BY origin, delay DESC""").show(10)

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



##### Flights whose distance is greater than 1,000 miles, using the DataFrame API in Python

In [8]:
from pyspark.sql.functions import col, desc

In [9]:
# DataFrame API version of the "distance > 1000" SQL query: filter, sort by
# distance descending, then print the first 20 rows.
(
    df.where(col("distance") > 1000)
    .orderBy(desc("distance"))
    .show()
)

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011625|   -4|    4330|   HNL|        JFK|
|1211625|  115|    4330|   HNL|        JFK|
|1021625|  110|    4330|   HNL|        JFK|
|1031625|   -1|    4330|   HNL|        JFK|
|1041625|   -7|    4330|   HNL|        JFK|
|1051625|   18|    4330|   HNL|        JFK|
|1061625|    0|    4330|   HNL|        JFK|
|1071625|    0|    4330|   HNL|        JFK|
|1081625|   -5|    4330|   HNL|        JFK|
|1091625|    0|    4330|   HNL|        JFK|
|1101625|   -2|    4330|   HNL|        JFK|
|1111625|  -10|    4330|   HNL|        JFK|
|1121625|    6|    4330|   HNL|        JFK|
|1131625|    0|    4330|   HNL|        JFK|
|1141625|   -1|    4330|   HNL|        JFK|
|1151625|   -9|    4330|   HNL|        JFK|
|1161625|   -8|    4330|   HNL|        JFK|
|1171625|    0|    4330|   HNL|        JFK|
|1181625|   -8|    4330|   HNL|        JFK|
|1191625|  -10|    4330|   HNL| 

In [10]:
# Project three columns, keep flights longer than 1,000 miles, sort by distance
# descending and print the first 10 rows.
long_distance_df = (
    df.select("distance", "origin", "destination")
    .where("distance > 1000")
    .orderBy("distance", ascending=False)
)
long_distance_df.show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



### Creating SQL Databases and Tables

In [11]:
# Create the database if it does not exist yet, then make it the current
# database for the statements that follow.
spark.sql("CREATE DATABASE IF NOT EXISTS learn_spark_db1")
spark.sql("USE learn_spark_db1")

DataFrame[]

In [12]:
# Create a table from an explicit column list; IF NOT EXISTS makes the
# statement a no-op when the table is already present.
spark.sql("""CREATE TABLE IF NOT EXISTS managed_us_delay_flights_tbl2
(date STRING, delay INT,distance INT, origin STRING, destination STRING)""")

DataFrame[]

In [13]:
# Explicit DDL schema for the departure-delay CSV, so no inference pass is needed.
schema = "date STRING, delay INT, distance INT, origin STRING, destination STRING"
# The file starts with a header line (it is loaded with header=true earlier in
# this notebook). Without header=True that line is parsed as a data row, whose
# non-numeric delay/distance values cannot be read as INT.
flights_df = spark.read.csv(csvFile, schema=schema, header=True)

In [15]:
# Save flights_df as a table. saveAsTable's default mode raises when the table
# already exists, which breaks this cell on a second run; overwrite mode makes
# it re-runnable, matching the other writers in this notebook.
flights_df.write.mode("overwrite").saveAsTable("managed_us_delay_flights_tbl4")

In [17]:
# Define a table backed directly by the CSV file.
# IF NOT EXISTS keeps the cell re-runnable (a plain CREATE TABLE fails once the
# table exists). header 'true' stops the file's header line from being read as
# a data row.
spark.sql("""CREATE TABLE IF NOT EXISTS us_delay_flights_tbl_4(date STRING, delay INT, distance INT, origin STRING, destination STRING)
USING csv OPTIONS (PATH './departuredelays.csv', header 'true')""")

DataFrame[]

In [19]:
# Save flights_df as a table whose data files live under ./us_flights_delay.
# Overwrite mode makes the cell re-runnable; the default mode raises when the
# table already exists.
(flights_df
.write
.mode("overwrite")
.option("path", "./us_flights_delay")
.saveAsTable("us_delay_flights_tbl_6"))

In [20]:
from pyspark.sql.functions import col, desc

In [21]:

# Same projection as shown earlier in the notebook: three columns, flights
# longer than 1,000 miles, sorted by distance descending, first 10 rows.
long_distance_df = (
    df.select("distance", "origin", "destination")
    .where("distance > 1000")
    .orderBy("distance", ascending=False)
)
long_distance_df.show(10)


+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [22]:
# Create (or replace) a global temporary view holding only the flights that
# depart from SFO, taken from the us_delay_flights_tbl view.
sfo_view_ddl = """CREATE OR REPLACE GLOBAL TEMP VIEW us_origin_airport_SFO_global_tmp_view 
AS SELECT date, delay, origin, destination 
from us_delay_flights_tbl WHERE origin = 'SFO';"""
spark.sql(sfo_view_ddl)

DataFrame[]

#### Data Sources for DataFrames and SQL Tables

In [23]:
# Root directory of the flight summary data set. Each format below lives in a
# sub-directory of it, so the location only has to be changed in one place.
# NOTE(review): this is an absolute, machine-specific path - adjust it (or make
# it relative) before running on another machine.
summary_dir = "C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data"
file = summary_dir + "/parquet/2010-summary.parquet"
# Use Parquet
# NOTE: this rebinds `df`, which held the departure-delay CSV data until now.
df = spark.read.format("parquet").load(file)
# Use Parquet; you can omit format("parquet") if you wish as it's the default
df2 = spark.read.load(file)
# Use CSV
df3 = (spark.read.format("csv")
       .option("inferSchema", "true")
       .option("header", "true")
       .option("mode", "PERMISSIVE")
       .load(summary_dir + "/csv/*"))
# Use JSON
df4 = spark.read.format("json").load(summary_dir + "/json/*")

In [24]:
# Preview the DataFrame loaded from the JSON files
df4.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [25]:
# Expose the 2010 summary Parquet data to SQL as a temporary view.
# NOTE: CREATE OR REPLACE reuses the name us_delay_flights_tbl, so the temporary
# view registered earlier from the departure-delay DataFrame is replaced.
parquet_view_ddl = """CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
USING parquet
OPTIONS (path "C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/parquet/2010-summary.parquet/" )"""
spark.sql(parquet_view_ddl)

DataFrame[]

In [26]:
# Read the view back through SQL and print the first 20 rows.
view_rows = spark.sql("SELECT * FROM us_delay_flights_tbl")
view_rows.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

#### Writing DataFrames to Parquet files

In [27]:
# Write df as snappy-compressed Parquet, replacing any previous output directory.
parquet_writer = (
    df.write.format("parquet")
    .mode("overwrite")
    .option("compression", "snappy")
)
parquet_writer.save("./df_parquet_flights")

#### Writing DataFrames to JSON files

In [28]:
# Write df as JSON, replacing any previous output directory.
# No compression option is set.
json_writer = df.write.format("json").mode("overwrite")
json_writer.save("./df_json_2")

In [29]:
# Write df as CSV, replacing any previous output directory.
(
    df.write.format("csv")
    .mode("overwrite")
    .save("./df_csv")
)

### Avro

In [30]:
# Load every Avro file of the flight summary data set into `df` (rebinding it)
# and print the first 20 rows without truncating long values.
avro_path = "C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/avro/*"
df = spark.read.format("avro").load(avro_path)

df.show(truncate=False)

+--------------------------------+-------------------+-----+
|DEST_COUNTRY_NAME               |ORIGIN_COUNTRY_NAME|count|
+--------------------------------+-------------------+-----+
|United States                   |Romania            |1    |
|United States                   |Ireland            |264  |
|United States                   |India              |69   |
|Egypt                           |United States      |24   |
|Equatorial Guinea               |United States      |1    |
|United States                   |Singapore          |25   |
|United States                   |Grenada            |54   |
|Costa Rica                      |United States      |477  |
|Senegal                         |United States      |29   |
|United States                   |Marshall Islands   |44   |
|Guyana                          |United States      |17   |
|United States                   |Sint Maarten       |53   |
|Malta                           |United States      |1    |
|Bolivia                

In [31]:
# Expose the Avro flight summary files to SQL as the temporary view episode_tbl.
avro_view_ddl = """CREATE OR REPLACE TEMPORARY VIEW episode_tbl
USING avro
OPTIONS (
path "C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/avro/*"
)"""
spark.sql(avro_view_ddl)

DataFrame[]

#### ORC

In [32]:
# Load every ORC file of the flight summary data set into `df` (rebinding it);
# the location is passed to the reader through the "path" option.
file = "C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/orc/*"
orc_reader = spark.read.format("orc").option("path", file)
df = orc_reader.load()
# Print the first 10 rows without truncating long values.
df.show(10, truncate=False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [33]:
# Write df as snappy-compressed ORC, replacing any previous output directory.
orc_writer = (
    df.write.format("orc")
    .mode("overwrite")
    .option("compression", "snappy")
)
orc_writer.save("./flights_orc")

## Images

In [34]:
from pyspark.ml import image

In [35]:
# Load the training images through Spark's "image" data source and inspect
# the resulting schema.
image_dir = "C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/JM Jupyter/cctvVideos/train_images/"
image_reader = spark.read.format("image")
images_df = image_reader.load(image_dir)
images_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: integer (nullable = true)



In [36]:
# Show the image metadata fields and the label for the first 5 rows,
# without truncating values.
image_summary = images_df.select(
    "image.height",
    "image.width",
    "image.nChannels",
    "image.mode",
    "label",
)
image_summary.show(5, truncate=False)

+------+-----+---------+----+-----+
|height|width|nChannels|mode|label|
+------+-----+---------+----+-----+
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |1    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
+------+-----+---------+----+-----+
only showing top 5 rows



### Binary Files

In [37]:
# Read the .jpg files under `path` with the "binaryFile" data source and show
# the first 5 rows.
path = "./cctvVideos/train_images/"
jpg_reader = spark.read.format("binaryFile").option("pathGlobFilter", "*.jpg")
binary_files_df = jpg_reader.load(path)
binary_files_df.show(5)

+--------------------+-------------------+------+--------------------+-----+
|                path|   modificationTime|length|             content|label|
+--------------------+-------------------+------+--------------------+-----+
|file:/C:/Users/jo...|2021-04-15 02:34:17| 55037|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/jo...|2021-04-15 02:34:17| 54634|[FF D8 FF E0 00 1...|    1|
|file:/C:/Users/jo...|2021-04-15 02:34:17| 54624|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/jo...|2021-04-15 02:34:17| 54505|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/jo...|2021-04-15 02:34:17| 54475|[FF D8 FF E0 00 1...|    0|
+--------------------+-------------------+------+--------------------+-----+
only showing top 5 rows



In [38]:
# Same read as before, but with recursiveFileLookup enabled; the output below
# no longer has the `label` column.
recursive_jpg_reader = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")
    .option("recursiveFileLookup", "true")
)
binary_files_df = recursive_jpg_reader.load(path)
binary_files_df.show(5)

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/C:/Users/jo...|2021-04-15 02:34:17| 55037|[FF D8 FF E0 00 1...|
|file:/C:/Users/jo...|2021-04-15 02:34:17| 54634|[FF D8 FF E0 00 1...|
|file:/C:/Users/jo...|2021-04-15 02:34:17| 54624|[FF D8 FF E0 00 1...|
|file:/C:/Users/jo...|2021-04-15 02:34:17| 54505|[FF D8 FF E0 00 1...|
|file:/C:/Users/jo...|2021-04-15 02:34:17| 54475|[FF D8 FF E0 00 1...|
+--------------------+-------------------+------+--------------------+
only showing top 5 rows

