In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

In [6]:
from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Read CSV") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

# Read CSV into DataFrame
# prices = spark.read.csv("itineraries.csv", header=True, inferSchema=True)

24/05/09 19:07:53 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
delays_all = spark.read.csv("delays/raw/*.csv", header=True, inferSchema=True)



In [None]:
airline_all = spark.read.csv("delays/delays_again/*.csv", header=True, inferSchema=True)

In [None]:
dictionary = airline_all.rdd.map(lambda row: (row.Operating_Airline, row.Airline)).collectAsMap()

In [None]:
dictionary

In [None]:
delays_all.columns

In [None]:
delays_all.select("Operating_Airline ").count()

In [None]:
from pyspark.sql.functions import avg

cancelled_percentage_df = delays_all.groupBy('Operating_Airline ').agg(
    (avg('Cancelled') * 100).alias('Cancelled_Percentage')
)

# Show the result
cancelled_percentage_df.limit(100).toPandas().head(100)

In [None]:
# Collect the DataFrame as a list of rows
cancelled_percentage_list = cancelled_percentage_df.collect()

# Convert the list of rows to a dictionary with the airline as the key
cancelled_percentage_dict = {row['Operating_Airline ']: row['Cancelled_Percentage'] for row in cancelled_percentage_list}

# Print or use the dictionary as needed
print(cancelled_percentage_dict)


In [None]:
from pyspark.sql.functions import avg

diverted_percentage_df = delays_all.groupBy('Operating_Airline ').agg(
    (avg('Diverted') * 100).alias('Diverted_Percentage')
)

# Show the result
diverted_percentage_df.limit(100).toPandas().head(100)

In [None]:
# Collect the DataFrame as a list of rows
diverted_percentage_list = diverted_percentage_df.collect()

# Convert the list of rows to a dictionary with the airline as the key
diverted_percentage_dict = {row['Operating_Airline ']: row['Diverted_Percentage'] for row in diverted_percentage_list}

# Print or use the dictionary as needed
print(diverted_percentage_dict)


In [None]:
from pyspark.sql.functions import col
delays_all.select(col('Operating_Airline '), col('Cancelled')).limit(10).toPandas().head(10)

In [None]:
# |                    Delta| -> Delta Air Lines Inc.
# |             Boutique Air| -> Compass Airlines
# |             Key Lime Air| -> Peninsula Airways Inc
# |     Sun Country Airlines| -> SkyWest Airlines Inc
# |         Contour Airlines| -> Mesa Airlines Inc
# |                   United| -> United Air Lines Inc.
# |                 Cape Air|
# |        Frontier Airlines| -> Frontier Airlines Inc. 
# |        American Airlines| -> American Airlines Inc.
# |          JetBlue Airways|
# |          Alaska Airlines| -> Alaska Airlines Inc.
# |          Spirit Airlines| -> Spirit Airlines Inc.
# |     Southern Airways ...| -> Southwest Airlines Co
# |        Hawaiian Airlines| -> Hawaiian Airlines Inc.

In [None]:
print(len(set(list(dictionary.values()))))

In [None]:
from pyspark.sql.functions import weekofyear, to_date

delays_all_week = delays_all.withColumn("FlightWeek", weekofyear("FlightDate"))
delays_all_week.limit(100).toPandas().head(100)

In [None]:
delays_all_week = delays_all_week.filter(delays_all_week.Cancelled != 1)

In [None]:
delays_all_week = delays_all_week.withColumnRenamed("Operating_Airline ", "Operating_Airline")

In [None]:
cols = [
    'Origin',
    "Operating_Airline",
    'OriginCityName',
    'Dest',
    'DestCityName',
    'CRSDepTime',
    'DepDelay',
    'DepDelayMinutes',
    'TaxiOut',
    'TaxiIn',
    'CRSArrTime',
    'ArrDelay',
    'Distance',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
    'DivArrDelay',
    'FlightWeek',
    "Year"
]

delays_preprocessed = delays_all_week[cols]



In [None]:
delays_preprocessed.limit(100).toPandas().head(100)

In [None]:
delays_preprocessed = delays_preprocessed.fillna(0, subset=["CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay"])

In [None]:
delays_preprocessed.limit(100).toPandas().head(100)

In [None]:
from pyspark.sql.functions import col, when

# Import val from col2 to col1 where col1's val is null and col2 is NOT NULL
delays_preprocessed = delays_preprocessed.withColumn("ArrDelay", when(col("ArrDelay").isNull() & col("DivArrDelay").isNotNull(), col("DivArrDelay")).otherwise(col("ArrDelay")))
# Import val from col1 to col2 where col2's val is null and col1 is NOT NULL
delays_preprocessed = delays_preprocessed.withColumn("DivArrDelay", when(col("DivArrDelay").isNull() & col("ArrDelay").isNotNull(), col("ArrDelay")).otherwise(col("DivArrDelay")))

In [26]:
delays_preprocessed.limit(100).toPandas().head(100)

Unnamed: 0,Origin,Operating_Airline,OriginCityName,Dest,DestCityName,CRSDepTime,DepDelay,DepDelayMinutes,TaxiOut,TaxiIn,...,ArrDelay,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,DivArrDelay,FlightWeek,Year
0,ABY,9E,"Albany, GA",ATL,"Atlanta, GA",1202,-5.0,0.0,14.0,7.0,...,-8.0,145.0,0.0,0.0,0.0,0.0,0.0,-8.0,4,2018
1,ABY,9E,"Albany, GA",ATL,"Atlanta, GA",1202,-5.0,0.0,13.0,12.0,...,-6.0,145.0,0.0,0.0,0.0,0.0,0.0,-6.0,4,2018
2,ABY,9E,"Albany, GA",ATL,"Atlanta, GA",1202,-9.0,0.0,18.0,11.0,...,-2.0,145.0,0.0,0.0,0.0,0.0,0.0,-2.0,4,2018
3,ABY,9E,"Albany, GA",ATL,"Atlanta, GA",1202,-12.0,0.0,17.0,11.0,...,-11.0,145.0,0.0,0.0,0.0,0.0,0.0,-11.0,4,2018
4,ABY,9E,"Albany, GA",ATL,"Atlanta, GA",1400,-5.0,0.0,17.0,11.0,...,-1.0,145.0,0.0,0.0,0.0,0.0,0.0,-1.0,4,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,LGA,9E,"New York, NY",SYR,"Syracuse, NY",1810,-12.0,0.0,24.0,6.0,...,-25.0,198.0,0.0,0.0,0.0,0.0,0.0,-25.0,3,2018
96,LGA,9E,"New York, NY",SYR,"Syracuse, NY",1810,-12.0,0.0,24.0,5.0,...,-26.0,198.0,0.0,0.0,0.0,0.0,0.0,-26.0,4,2018
97,ATL,9E,"Atlanta, GA",EWN,"New Bern/Morehead/Beaufort, NC",1225,-3.0,0.0,17.0,3.0,...,-4.0,433.0,0.0,0.0,0.0,0.0,0.0,-4.0,1,2018
98,ATL,9E,"Atlanta, GA",EWN,"New Bern/Morehead/Beaufort, NC",1225,6.0,6.0,26.0,6.0,...,15.0,433.0,0.0,0.0,9.0,0.0,6.0,15.0,1,2018


In [27]:
delays_preprocessed.filter(delays_preprocessed['ArrDelay'].isNull()).count()

                                                                                

9465

In [28]:
from pyspark.sql.functions import mean

mean_value = delays_preprocessed.agg(mean('ArrDelay')).collect()[0][0]
delays_preprocessed = delays_preprocessed.fillna(mean_value, subset=['ArrDelay'])

                                                                                

In [29]:
delays_preprocessed.filter(delays_preprocessed['ArrDelay'].isNull()).count()

0

In [30]:
delays_preprocessed = delays_preprocessed.drop("DivArrDelay")

In [31]:
delays_preprocessed.columns

['Origin',
 'Operating_Airline',
 'OriginCityName',
 'Dest',
 'DestCityName',
 'CRSDepTime',
 'DepDelay',
 'DepDelayMinutes',
 'TaxiOut',
 'TaxiIn',
 'CRSArrTime',
 'ArrDelay',
 'Distance',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay',
 'FlightWeek',
 'Year']

In [32]:
delays_preprocessed.printSchema()

root
 |-- Origin: string (nullable = true)
 |-- Operating_Airline: string (nullable = true)
 |-- OriginCityName: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- DestCityName: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- DepDelayMinutes: double (nullable = true)
 |-- TaxiOut: double (nullable = true)
 |-- TaxiIn: double (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- ArrDelay: double (nullable = false)
 |-- Distance: double (nullable = true)
 |-- CarrierDelay: double (nullable = false)
 |-- WeatherDelay: double (nullable = false)
 |-- NASDelay: double (nullable = false)
 |-- SecurityDelay: double (nullable = false)
 |-- LateAircraftDelay: double (nullable = false)
 |-- FlightWeek: integer (nullable = true)
 |-- Year: integer (nullable = true)



In [33]:
from pyspark.sql.functions import when

delays_preprocessed = delays_preprocessed.withColumn(
    "depDaypart",
    when((delays_preprocessed["CRSDepTime"] >= 600) & (delays_preprocessed["CRSDepTime"] < 1200), "morning")
    .when((delays_preprocessed["CRSDepTime"] >= 1200) & (delays_preprocessed["CRSDepTime"] < 1600), "afternoon")
    .when((delays_preprocessed["CRSDepTime"] >= 1600) & (delays_preprocessed["CRSDepTime"] < 1900), "evening")
    .otherwise("night")
)

delays_preprocessed = delays_preprocessed.withColumn(
    "arrDaypart",
    when((delays_preprocessed["CRSArrTime"] >= 600) & (delays_preprocessed["CRSArrTime"] < 1200), "morning")
    .when((delays_preprocessed["CRSArrTime"] >= 1200) & (delays_preprocessed["CRSArrTime"] < 1600), "afternoon")
    .when((delays_preprocessed["CRSArrTime"] >= 1600) & (delays_preprocessed["CRSArrTime"] < 1900), "evening")
    .otherwise("night")
)


In [34]:
delays_preprocessed.select(col("CRSDepTime"), col("depDaypart"), col("CRSArrTime"), col("arrDaypart")).limit(20).toPandas().head(20)

Unnamed: 0,CRSDepTime,depDaypart,CRSArrTime,arrDaypart
0,1202,afternoon,1304,afternoon
1,1202,afternoon,1304,afternoon
2,1202,afternoon,1304,afternoon
3,1202,afternoon,1304,afternoon
4,1400,afternoon,1500,afternoon
5,1202,afternoon,1304,afternoon
6,1202,afternoon,1304,afternoon
7,1202,afternoon,1304,afternoon
8,1202,afternoon,1304,afternoon
9,1037,morning,1137,morning


In [35]:
delays_preprocessed = delays_preprocessed.drop("CRSDepTime").drop("CRSArrTime")

In [36]:
delays_preprocessed.columns

['Origin',
 'Operating_Airline',
 'OriginCityName',
 'Dest',
 'DestCityName',
 'DepDelay',
 'DepDelayMinutes',
 'TaxiOut',
 'TaxiIn',
 'ArrDelay',
 'Distance',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay',
 'FlightWeek',
 'Year',
 'depDaypart',
 'arrDaypart']

In [63]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline


indexers = [StringIndexer(inputCol=column, outputCol=column+"_index", handleInvalid="keep") for column in ["Origin", "OriginCityName", "Dest", "DestCityName", "FlightWeek", "arrDaypart", "depDaypart"]]

# Perform OneHotEncoding on the indexed columns
encoders = [OneHotEncoder(inputCol=column+"_index", outputCol=column+"_onehot") for column in ["Origin", "OriginCityName", "Dest", "DestCityName", "FlightWeek", "arrDaypart", "depDaypart"]]

# Create a pipeline to execute the indexers and encoders sequentially
pipeline = Pipeline(stages=indexers + encoders)

# Fit the pipeline to the data and transform the DataFrame
model = pipeline.fit(delays_preprocessed)
encoded_df = model.transform(delays_preprocessed)

# Show the encoded DataFrame
encoded_df.show()


                                                                                

+------+--------------+----+------------+--------+---------------+-------+------+-----------------+--------+------------+------------+--------+-------------+-----------------+----------+----+----------+----------+------------+--------------------+----------+------------------+----------------+----------------+----------------+-----------------+---------------------+-----------------+-------------------+-----------------+-----------------+-----------------+
|Origin|OriginCityName|Dest|DestCityName|DepDelay|DepDelayMinutes|TaxiOut|TaxiIn|         ArrDelay|Distance|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|FlightWeek|Year|depDaypart|arrDaypart|Origin_index|OriginCityName_index|Dest_index|DestCityName_index|FlightWeek_index|arrDaypart_index|depDaypart_index|    Origin_onehot|OriginCityName_onehot|      Dest_onehot|DestCityName_onehot|FlightWeek_onehot|arrDaypart_onehot|depDaypart_onehot|
+------+--------------+----+------------+--------+---------------+-------+----

In [64]:
encoded_df.limit(10).toPandas().head(20)

Unnamed: 0,Origin,OriginCityName,Dest,DestCityName,DepDelay,DepDelayMinutes,TaxiOut,TaxiIn,ArrDelay,Distance,...,FlightWeek_index,arrDaypart_index,depDaypart_index,Origin_onehot,OriginCityName_onehot,Dest_onehot,DestCityName_onehot,FlightWeek_onehot,arrDaypart_onehot,depDaypart_onehot
0,ABY,"Albany, GA",ATL,"Atlanta, GA",-5.0,0.0,14.0,7.0,-8.0,145.0,...,8.0,2.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
1,ABY,"Albany, GA",ATL,"Atlanta, GA",-5.0,0.0,13.0,12.0,-6.0,145.0,...,8.0,2.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
2,ABY,"Albany, GA",ATL,"Atlanta, GA",-9.0,0.0,18.0,11.0,-2.0,145.0,...,8.0,2.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
3,ABY,"Albany, GA",ATL,"Atlanta, GA",-12.0,0.0,17.0,11.0,-11.0,145.0,...,8.0,2.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
4,ABY,"Albany, GA",ATL,"Atlanta, GA",-5.0,0.0,17.0,11.0,-1.0,145.0,...,8.0,2.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
5,ABY,"Albany, GA",ATL,"Atlanta, GA",,,34.0,13.0,22.0,145.0,...,8.0,2.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
6,ABY,"Albany, GA",ATL,"Atlanta, GA",2.0,2.0,15.0,10.0,-1.0,145.0,...,20.0,2.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
7,ABY,"Albany, GA",ATL,"Atlanta, GA",-9.0,0.0,7.0,11.0,-9.0,145.0,...,20.0,2.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
8,ABY,"Albany, GA",ATL,"Atlanta, GA",-9.0,0.0,26.0,8.0,4.131181,145.0,...,20.0,2.0,1.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 0.0)"
9,ATL,"Atlanta, GA",ABY,"Albany, GA",24.0,24.0,23.0,3.0,22.0,145.0,...,7.0,1.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)"


In [65]:
encoded_df.columns

['Origin',
 'OriginCityName',
 'Dest',
 'DestCityName',
 'DepDelay',
 'DepDelayMinutes',
 'TaxiOut',
 'TaxiIn',
 'ArrDelay',
 'Distance',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay',
 'FlightWeek',
 'Year',
 'depDaypart',
 'arrDaypart',
 'Origin_index',
 'OriginCityName_index',
 'Dest_index',
 'DestCityName_index',
 'FlightWeek_index',
 'arrDaypart_index',
 'depDaypart_index',
 'Origin_onehot',
 'OriginCityName_onehot',
 'Dest_onehot',
 'DestCityName_onehot',
 'FlightWeek_onehot',
 'arrDaypart_onehot',
 'depDaypart_onehot']

In [66]:
req_cols = [
     'Year',
     'Origin_onehot',
     'OriginCityName_onehot',
     'Dest_onehot',
     'DestCityName_onehot',
     'FlightWeek_onehot',
     'arrDaypart_onehot',
     'depDaypart_onehot',
     'DepDelay',
     'DepDelayMinutes',
     'TaxiOut',
     'TaxiIn',
     'Distance',
     'CarrierDelay',
     'WeatherDelay',
     'NASDelay',
     'SecurityDelay',
     'LateAircraftDelay',
     'ArrDelay',
]

useable_df = encoded_df[req_cols]


In [67]:
useable_df.columns[:-1]

['Year',
 'Origin_onehot',
 'OriginCityName_onehot',
 'Dest_onehot',
 'DestCityName_onehot',
 'FlightWeek_onehot',
 'arrDaypart_onehot',
 'depDaypart_onehot',
 'DepDelay',
 'DepDelayMinutes',
 'TaxiOut',
 'TaxiIn',
 'Distance',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay']

In [37]:
output_path = "/Users/shreyasmac/Documents/Notes/Big Data/Final Project/delays_preprocessed_updated"
# Save the DataFrame to a CSV file
delays_preprocessed.write.csv(output_path, header=True, mode="overwrite")

                                                                                

In [139]:
# REMOVE AFTER THIS

In [114]:
train_data_df = useable_df.filter(col("Year") != 2022)
test_data_df = useable_df.filter(col("Year") == 2022)


In [116]:
# train_data_df.show()

In [136]:
pwd

'/Users/shreyasmac/Documents/Notes/Big Data/Final Project'

                                                                                

In [117]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


# Combine features into a single vector column
trainAssembler = VectorAssembler(inputCols=train_data_df.columns[:-1],
                            outputCol="features")
trainOutput = trainAssembler.transform(train_data_df)


testAssembler = VectorAssembler(inputCols=test_data_df.columns[:-1],
                            outputCol="features")
testOutput = testAssembler.transform(test_data_df)

In [130]:
train_abcd = trainOutput[["features"]]
test_abcd = testOutput[["features"]]

In [132]:
train_abcd.limit(2).show()

+--------------------+
|            features|
+--------------------+
|(1610,[0,273,655,...|
|(1610,[0,273,655,...|
+--------------------+



In [102]:
train_data = trainOutput
test_data = testOutput

print("Train data: ", train_data.count())
print("Test data: ", test_data.count())


                                                                                

Train data:  24461389




Test data:  3955126


                                                                                

In [103]:
train_data.select(col("Year")).distinct().show()



+----+
|Year|
+----+
|2018|
|2019|
|2020|
|2021|
+----+



                                                                                

In [104]:
test_data.select(col("Year")).distinct().show()



+----+
|Year|
+----+
|2022|
+----+



                                                                                

In [112]:
trainOutput.show()

24/04/27 19:27:48 ERROR Executor: Exception in task 0.0 in stage 159.0 (TID 4549)
org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda/0x000000980226e288`: (struct<Year_double_VectorAssembler_d8f1f30441b5:double,Origin_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,OriginCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,Dest_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DestCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,FlightWeek_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,arrDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,depDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DepDelay:double,DepDelayMinutes:double,TaxiOut:double,TaxiIn:double,Distance:double,CarrierDelay:double,Weath

Py4JJavaError: An error occurred while calling o2621.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 159.0 failed 1 times, most recent failure: Lost task 0.0 in stage 159.0 (TID 4549) (10-18-165-231.dynapool.wireless.nyu.edu executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda/0x000000980226e288`: (struct<Year_double_VectorAssembler_d8f1f30441b5:double,Origin_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,OriginCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,Dest_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DestCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,FlightWeek_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,arrDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,depDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DepDelay:double,DepDelayMinutes:double,TaxiOut:double,TaxiIn:double,Distance:double,CarrierDelay:double,WeatherDelay:double,NASDelay:double,SecurityDelay:double,LateAircraftDelay:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 20 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3314)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4322)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4320)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4320)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3314)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3537)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at jdk.internal.reflect.GeneratedMethodAccessor138.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda/0x000000980226e288`: (struct<Year_double_VectorAssembler_d8f1f30441b5:double,Origin_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,OriginCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,Dest_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DestCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,FlightWeek_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,arrDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,depDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DepDelay:double,DepDelayMinutes:double,TaxiOut:double,TaxiIn:double,Distance:double,CarrierDelay:double,WeatherDelay:double,NASDelay:double,SecurityDelay:double,LateAircraftDelay:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 20 more


In [105]:
# Create a RandomForestRegressor
rf = RandomForestRegressor(featuresCol="features", labelCol="ArrDelay", numTrees=10)

# Train the model
model = rf.fit(train_data)

# Make predictions
predictions = model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="ArrDelay", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)



24/04/27 19:26:11 ERROR Executor: Exception in task 0.0 in stage 155.0 (TID 4537)
org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda/0x000000980226e288`: (struct<Year_double_VectorAssembler_9660347001c1:double,Origin_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,OriginCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,Dest_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DestCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,FlightWeek_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,arrDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,depDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DepDelay:double,DepDelayMinutes:double,TaxiOut:double,TaxiIn:double,Distance:double,CarrierDelay:double,Weath

Py4JJavaError: An error occurred while calling o2506.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 155.0 failed 1 times, most recent failure: Lost task 0.0 in stage 155.0 (TID 4537) (10-18-165-231.dynapool.wireless.nyu.edu executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda/0x000000980226e288`: (struct<Year_double_VectorAssembler_9660347001c1:double,Origin_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,OriginCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,Dest_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DestCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,FlightWeek_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,arrDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,depDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DepDelay:double,DepDelayMinutes:double,TaxiOut:double,TaxiIn:double,Distance:double,CarrierDelay:double,WeatherDelay:double,NASDelay:double,SecurityDelay:double,LateAircraftDelay:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$aggregate$2(RDD.scala:1226)
	at org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2492)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 28 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2493)
	at org.apache.spark.rdd.RDD.$anonfun$aggregate$1(RDD.scala:1228)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.aggregate(RDD.scala:1221)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:125)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:274)
	at org.apache.spark.ml.regression.RandomForestRegressor.$anonfun$train$1(RandomForestRegressor.scala:158)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:136)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:45)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:78)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda/0x000000980226e288`: (struct<Year_double_VectorAssembler_9660347001c1:double,Origin_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,OriginCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,Dest_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DestCityName_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,FlightWeek_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,arrDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,depDaypart_onehot:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,DepDelay:double,DepDelayMinutes:double,TaxiOut:double,TaxiIn:double,Distance:double,CarrierDelay:double,WeatherDelay:double,NASDelay:double,SecurityDelay:double,LateAircraftDelay:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$aggregate$2(RDD.scala:1226)
	at org.apache.spark.SparkContext.$anonfun$runJob$6(SparkContext.scala:2492)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 28 more
