In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("SparkSQLExampleApp")
    .getOrCreate()
)

In [9]:
# Path of the CSV file that is loaded into the DataFrame.
csv_file = "data/departuredelays.csv"

# Column names and types for the CSV, written as a DDL-formatted string.
file_schema = (
    "date STRING, delay INT, distance INT,"
    "origin STRING, destination STRING"
)

In [10]:
# The schema is supplied explicitly through `file_schema`, so Spark does not
# have to scan the CSV file to infer column types. An `inferSchema` option
# has no effect when a schema is given, so it is not set here.
# `header=true` tells Spark that the first line of the file is a header row
# rather than data.

df = (
    spark.read.format("csv")
    .schema(file_schema)
    .option("header", "true")
    .load(csv_file)
)

# Register the DataFrame as a temporary view so it can be queried by name
# through spark.sql().
df.createOrReplaceTempView("us_delay_flights_tbl")

In [None]:
# A schema can be written as a DDL-formatted string; column names may be
# wrapped in backticks, as in the example below. The triple-quoted text is
# only a bare string literal illustrating the syntax; it is not assigned to
# any variable.

# In Python
'''
schema = "`date` STRING, `delay` INT, `distance` INT,
`origin` STRING, `destination` STRING"

'''

In [None]:
# Print the schema of `df` to check the column names and types.
df.printSchema()

In [11]:
# Select distance, origin and destination for flights whose distance exceeds
# 1000, sorted by distance in descending order, and display the first 10 rows.
spark.sql('''
SELECT distance, origin, destination FROM us_delay_flights_tbl
WHERE distance > 1000 ORDER BY distance DESC
''').show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [12]:
# Flights from SFO to ORD with a delay greater than 120, largest delay first;
# display the first 10 rows. The `date` column is shown in its raw string form.
spark.sql('''

SELECT date, delay, origin, destination FROM us_delay_flights_tbl
WHERE delay > 120 AND origin = 'SFO' and destination = 'ORD'
ORDER BY delay DESC
''').show(10)

+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|02190925| 1638|   SFO|        ORD|
|01031755|  396|   SFO|        ORD|
|01022330|  326|   SFO|        ORD|
|01051205|  320|   SFO|        ORD|
|01190925|  297|   SFO|        ORD|
|02171115|  296|   SFO|        ORD|
|01071040|  279|   SFO|        ORD|
|01051550|  274|   SFO|        ORD|
|03120730|  266|   SFO|        ORD|
|01261104|  258|   SFO|        ORD|
+--------+-----+------+-----------+
only showing top 10 rows



In [13]:
# UDF that converts the compact date string into a more readable form.
def to_date_format_udf(d_str):
    """Reformat a compact date string such as "02190925" as "02/19  09:25".

    The result is built positionally: the first two characters, a "/",
    the next two characters, two spaces, the next two characters, a ":",
    and everything from the seventh character onward.

    Args:
        d_str: The compact date string, or None.

    Returns:
        The reformatted string, or None when ``d_str`` is None.
    """
    # Spark passes None to a Python UDF for NULL column values; return None
    # (NULL) instead of raising a TypeError and failing the whole job.
    if d_str is None:
        return None
    return d_str[0:2] + "/" + d_str[2:4] + "  " + d_str[4:6] + ":" + d_str[6:]

In [14]:
# Call the formatter directly in plain Python on a sample value to check its output.
to_date_format_udf("02190925")

'02/19  09:25'

In [15]:
# Register the Python function with Spark under the SQL name
# "to_date_format_udf", declaring a string return type, so it can be called
# from SQL text and from selectExpr().
spark.udf.register("to_date_format_udf", to_date_format_udf,StringType())

<function __main__.to_date_format_udf(d_str)>

In [16]:
# Apply the registered UDF to the `date` column and display the first 10
# formatted values without truncating them.
# NOTE(review): the alias is spelled "data_format"; possibly "date_format"
# was intended — confirm before renaming, since it changes the output header.
(
    df
    .selectExpr("to_date_format_udf(date) as data_format")
    .show(10, truncate=False)
)

+------------+
|data_format |
+------------+
|01/01  12:45|
|01/02  06:00|
|01/02  12:45|
|01/02  06:05|
|01/03  12:45|
|01/03  06:05|
|01/04  12:43|
|01/04  06:05|
|01/05  12:45|
|01/05  06:05|
+------------+
only showing top 10 rows



In [17]:
# Show every column of the view together with the formatted date (`date_fm`).
# NOTE(review): `*` already includes `date`, so selecting `date` again makes
# the column appear twice in the result (see the two `date` headers in the
# output) — consider dropping the explicit `date`.
spark.sql('''
SELECT *, date, to_date_format_udf(date) as date_fm FROM
us_delay_flights_tbl
''').show(10)

+--------+-----+--------+------+-----------+--------+------------+
|    date|delay|distance|origin|destination|    date|     date_fm|
+--------+-----+--------+------+-----------+--------+------------+
|01011245|    6|     602|   ABE|        ATL|01011245|01/01  12:45|
|01020600|   -8|     369|   ABE|        DTW|01020600|01/02  06:00|
|01021245|   -2|     602|   ABE|        ATL|01021245|01/02  12:45|
|01020605|   -4|     602|   ABE|        ATL|01020605|01/02  06:05|
|01031245|   -4|     602|   ABE|        ATL|01031245|01/03  12:45|
|01030605|    0|     602|   ABE|        ATL|01030605|01/03  06:05|
|01041243|   10|     602|   ABE|        ATL|01041243|01/04  12:43|
|01040605|   28|     602|   ABE|        ATL|01040605|01/04  06:05|
|01051245|   88|     602|   ABE|        ATL|01051245|01/05  12:45|
|01050605|    9|     602|   ABE|        ATL|01050605|01/05  06:05|
+--------+-----+--------+------+-----------+--------+------------+
only showing top 10 rows



In [20]:
# Flights from SFO to ORD with a delay greater than 120, largest delay first,
# with the date passed through the registered UDF; display the first 10 rows.
# NOTE(review): the error recorded under this cell mentions a 'DATEPART'
# function that does not appear in this query, so that output appears to
# come from an earlier version of the cell — re-run the cell to refresh it.
spark.sql('''

SELECT to_date_format_udf(date), delay, origin, destination FROM us_delay_flights_tbl
WHERE delay > 120 AND origin = 'SFO' and destination = 'ORD'
ORDER BY delay DESC

''').show(10)

AnalysisException: Undefined function: 'DATEPART'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 3 pos 7