In [1]:
# DATA: departuredelays.csv

In [2]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import findspark
findspark.init()
import pyspark

In [3]:
# Let's get started by reading the data set into a temporary view.
from pyspark.sql import SparkSession 

# Create a SparkSession
spark = (SparkSession
 .builder
 .appName("SparkSQLExampleApp")
 .getOrCreate()) 

# Path to data set
csv_file = "C:/Users/mariajose.chinchilla/OneDrive - Bosonit/Escritorio/Bosonit/Spark/datarepositorio/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"

# Read and create a temporary view
# Infer schema (note that for larger files you 
# may want to specify the schema)
df = (spark.read.format("csv")
 .option("inferSchema", "true")
 .option("header", "true")
 .load(csv_file))
df.createOrReplaceTempView("us_delay_flights_tbl")

In [4]:
# If you want to specify a schema, you can use a DDL-formatted string
schema = "`date` STRING, `delay` INT, `distance` INT, `origin` STRING, `destination` STRING"

In [5]:
# Now that we have a temporary view, we can issue SQL queries using Spark SQL

In [6]:
# We will find all flights whose distance is greater than 1000 miles.
spark.sql("""SELECT distance, origin, destination 
FROM us_delay_flights_tbl WHERE distance > 1000 
ORDER BY distance DESC""").show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   JFK|        HNL|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   JFK|        HNL|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [7]:
# Find all flights between San Francisco (SFO) and Chicago (ORD) with at least a twho-hour delay
spark.sql("""SELECT date, delay, origin, destination 
FROM us_delay_flights_tbl 
WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD' 
ORDER by delay DESC""").show(10)

+-------+-----+------+-----------+
|   date|delay|origin|destination|
+-------+-----+------+-----------+
|2190925| 1638|   SFO|        ORD|
|1031755|  396|   SFO|        ORD|
|1022330|  326|   SFO|        ORD|
|1051205|  320|   SFO|        ORD|
|1190925|  297|   SFO|        ORD|
|2171115|  296|   SFO|        ORD|
|1071040|  279|   SFO|        ORD|
|1051550|  274|   SFO|        ORD|
|3120730|  266|   SFO|        ORD|
|1261104|  258|   SFO|        ORD|
+-------+-----+------+-----------+
only showing top 10 rows



In [8]:
# As an exercise, convert the date column into a readable format and find the days or months when these delays were most commun. Were the delays related to winter months or holidays? 
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Define a UDF to convert the date format into a legible format.(UDF = user defined function)

def to_date_format_udf(d_str):
  l = [char for char in d_str]
  return "".join(l[0:2]) + "/" +  "".join(l[2:4]) + " " + " " +"".join(l[4:6]) + ":" + "".join(l[6:])

to_date_format_udf("02190925")

# Register the UDF
spark.udf.register("to_date_format_udf", to_date_format_udf, StringType())

# Read US departure flight data
df = (spark.read.format("csv")
      .schema("date STRING, delay INT, distance INT, origin STRING, destination STRING")
      .option("header", "true")
        .option("path", "C:/Users/mariajose.chinchilla/OneDrive - Bosonit/Escritorio/Bosonit/Spark/datarepositorio/databricks-datasets/learning-spark-v2/flights/departuredelays.csv")
      .load())

display(df)

# Test the UDF
df.selectExpr("to_date_format_udf(date) as data_format").show(10, truncate=False)

DataFrame[date: string, delay: int, distance: int, origin: string, destination: string]

+------------+
|data_format |
+------------+
|01/01  12:45|
|01/02  06:00|
|01/02  12:45|
|01/02  06:05|
|01/03  12:45|
|01/03  06:05|
|01/04  12:43|
|01/04  06:05|
|01/05  12:45|
|01/05  06:05|
+------------+
only showing top 10 rows



In [9]:
# Label all US flights, regardless of origin and destination, with an indication of the delays they experienced: Very Long Delays (>6 hours), Long Delays (2-6 hours), etc. We will add these human-readable labels in a new column called Flight_Delays:
spark.sql("""SELECT delay, origin, destination,
 CASE
 WHEN delay > 360 THEN 'Very Long Delays'
 WHEN delay > 120 AND delay < 360 THEN 'Long Delays'
 WHEN delay > 60 AND delay < 120 THEN 'Short Delays'
 WHEN delay > 0 and delay < 60 THEN 'Tolerable Delays'
 WHEN delay = 0 THEN 'No Delays'
 ELSE 'Early'
 END AS Flight_Delays
 FROM us_delay_flights_tbl
 ORDER BY origin, delay DESC""").show(10)

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



In [10]:
# All three of the preceding SQL queries can be expressed with an equivalent DataFrame API query. For example, the first:
from pyspark.sql.functions import col, desc
(df.select("distance", "origin", "destination")
 .where(col("distance") > 1000)
 .orderBy(desc("distance"))).show(10)
spark.sql("""SELECT distance, origin, destination 
FROM us_delay_flights_tbl WHERE distance > 1000 
ORDER BY distance DESC""").show(10)
# Or
(df.select("distance", "origin", "destination")
 .where("distance > 1000")
 .orderBy("distance", ascending=False).show(10))

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   JFK|        HNL|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   JFK|        HNL|
|    4330|   HNL|        JFK|
|    4330|   JFK|        HNL|
|    4330|   HNL|        JFK|
|    4330|   JFK|        HNL|
|    4330|   HNL|        JFK|
|    4330|   JFK|        HNL|
|    4330|   HNL|        JFK|
|    4330|   JFK|        HNL|
+--------+------+-----------+
only showing top 10 rows

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL| 

In [11]:
# EJERCICIO

In [12]:
from pyspark.sql.functions import when, col, desc

# As an exercise, try converting the other two SQL queries to use the DataFrame API.

#The second one
(df.select("date", "delay", "origin", "destination")
 .where("delay > 120 and origin='SFO' and destination ='ORD' ")
 .orderBy("delay", ascending=False).show(10))

# The third one
(df.select("delay", "origin", "destination") 
        .withColumn("Flight_Delays", when(col("delay") > 360, "Very Long Delays")
                    .when((col("delay") > 120) & (col("delay") <360), "Long Delays")
                    .when((col("delay") > 60) & (col("delay") < 120), "Short Delays")
                    .when((col("delay") > 0) & (col("delay") < 60), "Tolerable Delays")
                    .when(col("delay") == 0, "No Delays")
                    .otherwise("Early"))
.orderBy(col("origin"), col("delay").desc()).show(10))


+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|02190925| 1638|   SFO|        ORD|
|01031755|  396|   SFO|        ORD|
|01022330|  326|   SFO|        ORD|
|01051205|  320|   SFO|        ORD|
|01190925|  297|   SFO|        ORD|
|02171115|  296|   SFO|        ORD|
|01071040|  279|   SFO|        ORD|
|01051550|  274|   SFO|        ORD|
|03120730|  266|   SFO|        ORD|
|01261104|  258|   SFO|        ORD|
+--------+-----+------+-----------+
only showing top 10 rows

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|     

In [13]:
#Create both a managed and an unmanaged table.

#To begin, we'll create a database called learn_spark_db and tell Spark we want to use that database
spark.sql("CREATE DATABASE IF NOT EXISTS learn_spark_db")
spark.sql("USE learn_spark_db")

#Create a managed table within the database learn_spark_db
spark.sql("DROP TABLE IF EXISTS managed_us_delay_flights_tbl")
spark.sql("CREATE TABLE IF NOT EXISTS managed_us_delay_flights_tbl (date STRING, delay INT, distance INT, origin STRING, destination STRING)")

#We can do the same thing using the DataFrame API

# Path to our US flight delays CSV file 
csv_file = "C:/Users/mariajose.chinchilla/OneDrive - Bosonit/Escritorio/Bosonit/Spark/datarepositorio/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
# Schema as defined in the preceding example
schema="date STRING, delay INT, distance INT, origin STRING, destination STRING"
flights_df = spark.read.csv(csv_file, schema=schema)
flights_df.write.saveAsTable("managed_us_delay_flights_tbl")

AnalysisException: Hive support is required to CREATE Hive TABLE (AS SELECT).;
'CreateTable `spark_catalog`.`learn_spark_db`.`managed_us_delay_flights_tbl`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, Ignore


In [None]:
#To create an unmanaged table from a data source such as CSV file:
spark.sql("""CREATE TABLE us_delay_flights_tbl(date STRING, delay INT, distance INT, origin STRING, destination STRING) 
USING csv OPTIONS (PATH '/databricks-datasets/learning-spark-v2/flights/departuredelays.csv')""")

In [None]:
#To create an unmanaged table from a data source such as CSV file:
spark.sql("""CREATE TABLE us_delay_flights_tbl(date STRING, delay INT, distance INT, origin STRING, destination STRING) 
USING csv OPTIONS (PATH '/databricks-datasets/learning-spark-v2/flights/departuredelays.csv')""")

#And within the DataFrame API
(flights_df
 .write
 .option("path", "/tmp/data/us_flights_delay")
 .saveAsTable("us_delay_flights_tbl"))