In [1]:
import pyspark
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import functions as func
from pyspark.sql.types import StringType,FloatType
import nltk
from pyspark.sql import SparkSession


In [2]:
# Start (or reuse) a local Spark session named FlightAnalysis, then read the CSV data
# found at the FlightData path with a header row and inferred column types.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('FlightAnalysis')
    .getOrCreate()
)
combinedFlightsDF = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("FlightData")
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/22 19:57:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

In [3]:
# Remove the columns this analysis treats as unnecessary.
drop_cols = (
    "FlightDate", "Origin", "Dest", "Diverted",
    "DepDelayMinutes", "ArrDelayMinutes",
    "IATA_Code_Operating_Airline", "DestAirportSeqID", "DestCityMarketID",
    "DOT_ID_Operating_Airline", "DOT_ID_Marketing_Airline",
    "IATA_Code_Marketing_Airline", "Operated_or_Branded_Code_Share_Partners",
    "CRSDepTime", "DepDelay", "AirTime", "CRSElapsedTime", "Distance",
    "Marketing_Airline_Network", "Flight_Number_Marketing_Airline",
    "Operating_Airline", "Tail_Number", "Flight_Number_Operating_Airline",
    "OriginAirportID", "OriginAirportSeqID", "OriginCityMarketID",
    "OriginState", "OriginStateFips", "OriginStateName", "OriginWac",
    "DestAirportID", "DestState", "DestStateFips", "DestStateName", "DestWac",
    "DepartureDelayGroups", "DepTimeBlk",
    "TaxiOut", "WheelsOff", "WheelsOn", "TaxiIn",
    "CRSArrTime", "ArrDelay", "ArrivalDelayGroups", "ArrTimeBlk",
    "DistanceGroup", "DivAirportLandings",
)
combinedFlightsDF = combinedFlightsDF.drop(*drop_cols)

In [4]:
# Preview the first five rows of the trimmed flight table.
combinedFlightsDF.show(n=5)

+---------+---------+-------+-------+-----------------+----+-------+-----+----------+---------+---------------+------------+--------+--------+
|  Airline|Cancelled|DepTime|ArrTime|ActualElapsedTime|Year|Quarter|Month|DayofMonth|DayOfWeek| OriginCityName|DestCityName|DepDel15|ArrDel15|
+---------+---------+-------+-------+-----------------+----+-------+-----+----------+---------+---------------+------------+--------+--------+
|Envoy Air|    false| 1209.0| 1350.0|            101.0|2019|      2|    4|         1|        1|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air|    false| 1200.0| 1348.0|            108.0|2019|      2|    4|         2|        2|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air|    false| 1203.0| 1342.0|             99.0|2019|      2|    4|         3|        3|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air|    false| 1435.0| 1621.0|            106.0|2019|      2|    4|         4|        4|Little Rock, AR| Chicago, IL|     1.0|     1.0|

In [5]:
# Search-bar view of the flight table: everything except Cancelled, Quarter and DayOfWeek.
searchBarDropCols = (
    "Cancelled",
    "Quarter",
    "DayOfWeek",
)
searchBarDF = combinedFlightsDF.drop(*searchBarDropCols)

In [6]:
# Preview the first five rows of the search-bar table.
searchBarDF.show(n=5)

+---------+-------+-------+-----------------+----+-----+----------+---------------+------------+--------+--------+
|  Airline|DepTime|ArrTime|ActualElapsedTime|Year|Month|DayofMonth| OriginCityName|DestCityName|DepDel15|ArrDel15|
+---------+-------+-------+-----------------+----+-----+----------+---------------+------------+--------+--------+
|Envoy Air| 1209.0| 1350.0|            101.0|2019|    4|         1|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air| 1200.0| 1348.0|            108.0|2019|    4|         2|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air| 1203.0| 1342.0|             99.0|2019|    4|         3|Little Rock, AR| Chicago, IL|     0.0|     0.0|
|Envoy Air| 1435.0| 1621.0|            106.0|2019|    4|         4|Little Rock, AR| Chicago, IL|     1.0|     1.0|
|Envoy Air| 1216.0| 1410.0|            114.0|2019|    4|         5|Little Rock, AR| Chicago, IL|     0.0|     0.0|
+---------+-------+-------+-----------------+----+-----+----------+-------------

In [None]:
# Creates the search-bar CSV export (DONT RUN unless the export has to be regenerated).
# Writes searchBarDF, the frame prepared for the search bar, as headered CSV files
# under combinedFlightsSearchBar.
# No save mode is set, so Spark's default applies and the write raises if that path already exists.
searchBarDF.write.option("header", True).csv("combinedFlightsSearchBar")

In [82]:
# Optional check (later cells do not use it): number of flights per year.
yearCount = (
    combinedFlightsDF
    .groupBy("Year")
    .count()
)
yearCount.show()



+----+-------+
|Year|  count|
+----+-------+
|2019|8091684|
|2018|5689521|
|2022|4078327|
|2020|5022397|
|2021|6311871|
+----+-------+



                                                                                

In [7]:
# Build the frame used to analyze delayed flights by airline, origin city, and month/year
# by removing the destination, day-of-month and clock-time columns.
# "DayofMonth" is spelled exactly as the column appears in the data, so the drop does not
# depend on Spark resolving column names case-insensitively.
drop_cols = ("DestCityName", "DayofMonth", "DepTime", "ArrTime", "ActualElapsedTime")
delayDF = combinedFlightsDF.drop(*drop_cols)
delayDF.show(5)

+---------+---------+----+-------+-----+---------+---------------+--------+--------+
|  Airline|Cancelled|Year|Quarter|Month|DayOfWeek| OriginCityName|DepDel15|ArrDel15|
+---------+---------+----+-------+-----+---------+---------------+--------+--------+
|Envoy Air|    false|2019|      2|    4|        1|Little Rock, AR|     0.0|     0.0|
|Envoy Air|    false|2019|      2|    4|        2|Little Rock, AR|     0.0|     0.0|
|Envoy Air|    false|2019|      2|    4|        3|Little Rock, AR|     0.0|     0.0|
|Envoy Air|    false|2019|      2|    4|        4|Little Rock, AR|     1.0|     1.0|
|Envoy Air|    false|2019|      2|    4|        5|Little Rock, AR|     0.0|     0.0|
+---------+---------+----+-------+-----+---------+---------------+--------+--------+
only showing top 5 rows



In [None]:
# Display the column names currently present in delayDF.
delayDF.columns

In [8]:
# Overall delay flag: 1 when the arrival or the departure 15-minute indicator is
# greater than 0 (the two conditions are combined with the | operator), otherwise 0.
delayDF = delayDF.withColumn(
    "ifDelayed",
    func.when(
        (func.col("ArrDel15") > 0) | (func.col("DepDel15") > 0),
        1,
    ).otherwise(0),
)
delayDF.show(n=10)

+---------+---------+----+-------+-----+---------+---------------+--------+--------+---------+
|  Airline|Cancelled|Year|Quarter|Month|DayOfWeek| OriginCityName|DepDel15|ArrDel15|ifDelayed|
+---------+---------+----+-------+-----+---------+---------------+--------+--------+---------+
|Envoy Air|    false|2019|      2|    4|        1|Little Rock, AR|     0.0|     0.0|        0|
|Envoy Air|    false|2019|      2|    4|        2|Little Rock, AR|     0.0|     0.0|        0|
|Envoy Air|    false|2019|      2|    4|        3|Little Rock, AR|     0.0|     0.0|        0|
|Envoy Air|    false|2019|      2|    4|        4|Little Rock, AR|     1.0|     1.0|        1|
|Envoy Air|    false|2019|      2|    4|        5|Little Rock, AR|     0.0|     0.0|        0|
|Envoy Air|    false|2019|      2|    4|        6|Little Rock, AR|     1.0|     1.0|        1|
|Envoy Air|    false|2019|      2|    4|        7|Little Rock, AR|     0.0|     0.0|        0|
|Envoy Air|    false|2019|      2|    4|        1|

In [39]:
# Integer cancellation flag: 0 when Cancelled equals 'false', 1 for every other value.
delayDF = delayDF.withColumn(
    'ifCancelled',
    func.when(func.col('Cancelled') == 'false', 0).otherwise(1),
)
delayDF.show(n=10)

+---------+---------+----+-------+-----+---------+---------------+--------+--------+---------+-------------+-----------+
|  Airline|Cancelled|Year|Quarter|Month|DayOfWeek| OriginCityName|DepDel15|ArrDel15|ifDelayed|cancelledBool|ifCancelled|
+---------+---------+----+-------+-----+---------+---------------+--------+--------+---------+-------------+-----------+
|Envoy Air|    false|2019|      2|    4|        1|Little Rock, AR|     0.0|     0.0|        0|            0|          0|
|Envoy Air|    false|2019|      2|    4|        2|Little Rock, AR|     0.0|     0.0|        0|            0|          0|
|Envoy Air|    false|2019|      2|    4|        3|Little Rock, AR|     0.0|     0.0|        0|            0|          0|
|Envoy Air|    false|2019|      2|    4|        4|Little Rock, AR|     1.0|     1.0|        1|            0|          0|
|Envoy Air|    false|2019|      2|    4|        5|Little Rock, AR|     0.0|     0.0|        0|            0|          0|
|Envoy Air|    false|2019|      

In [9]:
# Number of delayed flights per airline.
# The aggregate is named directly via alias, so no follow-up rename is required.
numDelayedByAirline = delayDF.groupBy("Airline").agg(
    func.sum("ifDelayed").alias("sumDelayedByAirline")
)
numDelayedByAirline.show(30,truncate=False)

# Optional sanity check: the per-airline sums should add up to the overall delayed count.
# numDelayedByAirline.agg(func.sum("sumDelayedByAirline")).show()



+-----------------------------------------+-------------------+
|Airline                                  |sumDelayedByAirline|
+-----------------------------------------+-------------------+
|Endeavor Air Inc.                        |147359             |
|United Air Lines Inc.                    |510700             |
|Compass Airlines                         |31705              |
|Comair Inc.                              |188171             |
|Southwest Airlines Co.                   |1269581            |
|ExpressJet Airlines Inc.                 |79802              |
|JetBlue Airways                          |325465             |
|Empire Airlines Inc.                     |3665               |
|Envoy Air                                |205502             |
|Capital Cargo International              |68581              |
|Hawaiian Airlines Inc.                   |39344              |
|Mesa Airlines Inc.                       |155645             |
|American Airlines Inc.                 

                                                                                

In [10]:
# Total number of flights per airline (the author's note puts this at 28 airlines).
airlineRank = (
    combinedFlightsDF
    .groupBy("Airline")
    .count()
    .withColumnRenamed("count", "totalFlightsByAirline")
)

In [None]:
# Print the per-airline flight totals.
airlineRank.show()

In [11]:
#1 percentage of delayed flights for each airline
# Only the key and the delayed sum are taken from the left side, so re-running this cell
# rebuilds the result instead of joining in a second copy of totalFlightsByAirline
# (which would make the column reference ambiguous).
numDelayedByAirline = (
    numDelayedByAirline
    .select("Airline", "sumDelayedByAirline")
    .join(airlineRank, "Airline")
    .withColumn(
        "delayedRatio",
        (func.col("sumDelayedByAirline") / func.col("totalFlightsByAirline")) * 100,
    )
)

In [12]:
#1 show the first ten airlines with their delayed-flight percentage
numDelayedByAirline.show(n=10)



+--------------------+-------------------+---------------------+------------------+
|             Airline|sumDelayedByAirline|totalFlightsByAirline|      delayedRatio|
+--------------------+-------------------+---------------------+------------------+
|   Endeavor Air Inc.|             147359|               998233|14.761984426481591|
|United Air Lines ...|             510700|              2354538|21.690030061099037|
|    Compass Airlines|              31705|               154985| 20.45681840178082|
|         Comair Inc.|             188171|               957220|19.658072334468564|
|Southwest Airline...|            1269581|              5474339|23.191493986762605|
|ExpressJet Airlin...|              79802|               353669|22.564035864042367|
|     JetBlue Airways|             325465|              1106079|29.425113396059416|
|Empire Airlines Inc.|               3665|                23122|15.850704956318657|
|           Envoy Air|             205502|              1072778|19.156060247

                                                                                

In [13]:
# Total number of flights leaving each origin city.
cityRank = (
    combinedFlightsDF
    .groupBy("OriginCityName")
    .count()
    .withColumnRenamed("count", "totalFlightsByCity")
)

In [14]:
# Number of delayed flights per origin city.
# The aggregate is named directly via alias, so no follow-up rename is required.
numDelayedByCity = delayDF.groupBy("OriginCityName").agg(
    func.sum("ifDelayed").alias("sumDelayedByCity")
)
numDelayedByCity.show(30,truncate=False)



+------------------------+----------------+
|OriginCityName          |sumDelayedByCity|
+------------------------+----------------+
|Gainesville, FL         |2847            |
|Richmond, VA            |19749           |
|Ontario, CA             |15887           |
|Pago Pago, TT           |113             |
|Tucson, AZ              |11284           |
|Myrtle Beach, SC        |10761           |
|Medford, OR             |5562            |
|Palm Springs, CA        |9311            |
|Durango, CO             |3502            |
|Corpus Christi, TX      |3734            |
|Mobile, AL              |4294            |
|Dubuque, IA             |442             |
|Pensacola, FL           |8954            |
|Huntsville, AL          |6572            |
|Fort Myers, FL          |31883           |
|Columbus, GA            |894             |
|Springfield, IL         |1419            |
|San Juan, PR            |27812           |
|Montrose/Delta, CO      |2692            |
|Lihue, HI               |6384  

                                                                                

In [15]:
#2 percentage of delayed flights for each origin city
# Only the key and the delayed sum are taken from the left side, so re-running this cell
# rebuilds the result instead of joining in a second copy of totalFlightsByCity
# (which would make the column reference ambiguous).
numDelayedByCity = (
    numDelayedByCity
    .select("OriginCityName", "sumDelayedByCity")
    .join(cityRank, "OriginCityName")
    .withColumn(
        "delayedRatio",
        (func.col("sumDelayedByCity") / func.col("totalFlightsByCity")) * 100,
    )
)

In [16]:
#2 show the first five origin cities with their delayed-flight percentage
numDelayedByCity.show(n=5)

                                                                                

+---------------+----------------+------------------+------------------+
| OriginCityName|sumDelayedByCity|totalFlightsByCity|      delayedRatio|
+---------------+----------------+------------------+------------------+
|Gainesville, FL|            2847|             16524| 17.22948438634713|
|   Richmond, VA|           19749|             96878|20.385433225293667|
|    Ontario, CA|           15887|             91776|17.310625871687588|
|  Pago Pago, TT|             113|               299| 37.79264214046823|
|     Tucson, AZ|           11284|             70842|15.928404054092205|
+---------------+----------------+------------------+------------------+
only showing top 5 rows



In [17]:
#4 delayed vs. not-delayed flight counts (ifDelayed: 1 - Delayed, 0 - Not delayed)
delayedFlights = delayDF.groupBy(func.col("ifDelayed")).count()
delayedFlights.show()



+---------+--------+
|ifDelayed|   count|
+---------+--------+
|        1| 6026216|
|        0|23167584|
+---------+--------+



                                                                                

In [18]:
#5 number of flights for each value of the Cancelled column
cancelledFlights = combinedFlightsDF.groupBy(func.col("Cancelled")).count()
cancelledFlights.show()



+---------+--------+
|Cancelled|   count|
+---------+--------+
|     true|  777267|
|    false|28416533|
+---------+--------+



                                                                                

In [19]:
# Total number of flights for each day of the week.
dayOfWeekRank = (
    combinedFlightsDF
    .groupBy("DayOfWeek")
    .count()
    .withColumnRenamed("count", "totalFlightsOnDay")
)

In [20]:
# Print the flight totals per day of the week.
dayOfWeekRank.show()



+---------+-----------------+
|DayOfWeek|totalFlightsOnDay|
+---------+-----------------+
|        1|          4356653|
|        6|          3722792|
|        3|          4124241|
|        5|          4353469|
|        4|          4332719|
|        7|          4253916|
|        2|          4050010|
+---------+-----------------+



                                                                                

In [45]:
# Number of delayed flights for each day of the week.
# The aggregate is named directly via alias, so no follow-up rename is required.
delayedFlightByDay = delayDF.groupBy("DayOfWeek").agg(
    func.sum("ifDelayed").alias("sumDelayedByWeekDay")
)
delayedFlightByDay.show()



+---------+-------------------+
|DayOfWeek|sumDelayedByWeekDay|
+---------+-------------------+
|        1|             917639|
|        6|             726669|
|        3|             787556|
|        5|             978903|
|        4|             948971|
|        7|             901113|
|        2|             765365|
+---------+-------------------+



                                                                                

In [44]:
# group cancelled flights by day of the week
cancelledFlightByDay = delayDF.groupBy("DayOfWeek").agg({"ifCancelled" : "sum"})
cancelledFlightByDay = cancelledFlightByDay.withColumnRenamed("sum(ifCancelled)", "sumCancelledByWeekDay") #TODO NO RENAMING
cancelledFlightByDay.show()



+---------+---------------------+
|DayOfWeek|sumCancelledByWeekDay|
+---------+---------------------+
|        1|               119691|
|        6|                94151|
|        3|               109080|
|        5|               113789|
|        4|               123836|
|        7|               115703|
|        2|               101017|
+---------+---------------------+



                                                                                

In [47]:
# Combine the per-weekday totals, delayed counts and cancelled counts into one table,
# matching rows on DayOfWeek.
analysisByWeekDay = (
    dayOfWeekRank
    .join(delayedFlightByDay, on=["DayOfWeek"])
    .join(cancelledFlightByDay, on=["DayOfWeek"])
)
analysisByWeekDay.show(n=5)

                                                                                

+---------+-----------------+-------------------+---------------------+
|DayOfWeek|totalFlightsOnDay|sumDelayedByWeekDay|sumCancelledByWeekDay|
+---------+-----------------+-------------------+---------------------+
|        1|          4356653|             917639|               119691|
|        6|          3722792|             726669|                94151|
|        3|          4124241|             787556|               109080|
|        5|          4353469|             978903|               113789|
|        4|          4332719|             948971|               123836|
+---------+-----------------+-------------------+---------------------+
only showing top 5 rows



In [56]:
# Percentage of each weekday's flights that were cancelled.
analysisByWeekDay = analysisByWeekDay.withColumn(
    "CancelledRatio",
    (func.col("sumCancelledByWeekDay") / func.col("totalFlightsOnDay")) * 100,
)
analysisByWeekDay.show(n=5)

ERROR:root:KeyboardInterrupt while sending command.>                (0 + 0) / 1]
Traceback (most recent call last):
  File "/Users/jeanatijerina/opt/anaconda3/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/jeanatijerina/opt/anaconda3/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/jeanatijerina/opt/anaconda3/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [51]:
# Percentage of each weekday's flights that were delayed.
analysisByWeekDay = analysisByWeekDay.withColumn(
    "DelayedRatio",
    (func.col("sumDelayedByWeekDay") / func.col("totalFlightsOnDay")) * 100,
)

In [55]:
# Percentage of each weekday's flights that were either delayed or cancelled
# (delayed count plus cancelled count, divided by the weekday total).
analysisByWeekDay = analysisByWeekDay.withColumn(
    "Delayed+CancelledRatio",
    (
        (func.col("sumDelayedByWeekDay") + func.col("sumCancelledByWeekDay"))
        / func.col("totalFlightsOnDay")
    ) * 100,
)

In [53]:
#6 Days of the week analysis: print the combined per-weekday table
analysisByWeekDay.show()

22/11/22 23:30:01 ERROR Executor: Exception in task 2.0 in stage 98.0 (TID 2374)
java.io.FileNotFoundException: 
File file:/Users/jeanatijerina/Desktop/cs179g/cs179g_Spark/FlightData/Combined_Flights_2019.csv does not exist

It is possible the underlying files have been updated. You can explicitly invalidate
the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by
recreating the Dataset/DataFrame involved.
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:661)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:212)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:270)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apa

Py4JJavaError: An error occurred while calling o261.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 98.0 failed 1 times, most recent failure: Lost task 3.0 in stage 98.0 (TID 2375) (ucr-secure-21-10-13-134-56.wnet.ucr.edu executor driver): java.io.FileNotFoundException: 
File file:/Users/jeanatijerina/Desktop/cs179g/cs179g_Spark/FlightData/Combined_Flights_2019.csv does not exist

It is possible the underlying files have been updated. You can explicitly invalidate
the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by
recreating the Dataset/DataFrame involved.
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:661)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:212)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:270)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.hashAgg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.io.FileNotFoundException: 
File file:/Users/jeanatijerina/Desktop/cs179g/cs179g_Spark/FlightData/Combined_Flights_2019.csv does not exist

It is possible the underlying files have been updated. You can explicitly invalidate
the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by
recreating the Dataset/DataFrame involved.
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:661)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:212)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:270)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.hashAgg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
