In [None]:
from pyspark.sql import SparkSession

# start the session
spark = SparkSession.builder.appName("JoinTypes").getOrCreate()

### Load dataframes

In [2]:
# git clone https://github.com/databricks/LearningSparkV2.git
tripdelaysFilePath = "databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
airportsnaFilePath = "databricks-datasets/learning-spark-v2/flights/airport-codes-na.txt"

In [7]:
airportsna = (spark.read
                    .format("csv")
                    .options(header="true", inferSchema="true", sep="\t")
                    .load(airportsnaFilePath))
airportsna.createOrReplaceTempView("airports_na")
spark.sql('Select count(*) as total_rows from airports_na').show()

+----------+
|total_rows|
+----------+
|       526|
+----------+



In [12]:
airportsna.show(5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



In [8]:
departureDelays = (spark.read
                        .format("csv")
                        .options(header="true")
                        .load(tripdelaysFilePath))
departureDelays.createOrReplaceTempView("departureDelays")
spark.sql('Select count(*) as total_rows from departureDelays').show()

+----------+
|total_rows|
+----------+
|   1391578|
+----------+



In [11]:
departureDelays.show(5)

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
+--------+-----+--------+------+-----------+
only showing top 5 rows



## JOIN types

### Inner join

In [31]:
from pyspark.sql.functions import sum, col

# get only five flights with the longest delays
airportsna.filter(airportsna.State == 'NY').select('IATA') \
    .join(departureDelays.filter(departureDelays.delay > 0).select('date', col('delay').cast('int'), 'origin', 'destination'),
          airportsna.IATA == departureDelays.destination,
          'inner').sort('delay', ascending=False).show(5)

+----+--------+-----+------+-----------+
|IATA|    date|delay|origin|destination|
+----+--------+-----+------+-----------+
| JFK|01300915| 1500|   EGE|        JFK|
| JFK|01031442| 1167|   SJU|        JFK|
| LGA|01011700| 1017|   MSP|        LGA|
| JFK|01291718|  978|   SJU|        JFK|
| JFK|02121625|  932|   HNL|        JFK|
+----+--------+-----+------+-----------+
only showing top 5 rows



In [18]:
airportsna.filter(airportsna.State == 'NY') \
    .join(departureDelays,
          airportsna.IATA == departureDelays.destination,
          'inner').select(sum('delay')).show(5)

+----------+
|sum(delay)|
+----------+
|  970213.0|
+----------+



### Full/Outer Join

In [62]:
airportsna.filter(airportsna.State == 'GA').select('IATA') \
    .join(departureDelays.select('date', col('delay').cast('int'), 'origin', 'destination') \
                         .filter((departureDelays.destination.isin(['ATL', 'JFK'])) & (departureDelays.delay <= 0)).sort('date').limit(5),
          airportsna.IATA == departureDelays.destination,
          'full').sort('delay', ascending=False).show()

+----+--------+-----+------+-----------+
|IATA|    date|delay|origin|destination|
+----+--------+-----+------+-----------+
|null|01010206|   -4|   SJU|        JFK|
| ATL|01010040|   -6|   SLC|        ATL|
| ATL|01010030|   -8|   LAS|        ATL|
|null|01010500|   -8|   SJU|        JFK|
| ATL|01010059|   -9|   DEN|        ATL|
| ABY|    null| null|  null|       null|
| AGS|    null| null|  null|       null|
| AHN|    null| null|  null|       null|
| BQK|    null| null|  null|       null|
| CSG|    null| null|  null|       null|
| MCN|    null| null|  null|       null|
| SAV|    null| null|  null|       null|
| VLD|    null| null|  null|       null|
+----+--------+-----+------+-----------+

