In [73]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("FlightDataProcessing")
    .config("spark.sql.catalogImplementation", "hive")
    .enableHiveSupport()
    .getOrCreate()
)

# Read the 2018 flights Parquet file into a DataFrame.
df = spark.read.parquet("Combined_Flights_2018.parquet")

# Inspect the column types, then preview the first five rows.
df.printSchema()
df.show(5)


root
 |-- FlightDate: timestamp (nullable = true)
 |-- Airline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Cancelled: boolean (nullable = true)
 |-- Diverted: boolean (nullable = true)
 |-- CRSDepTime: long (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- DepDelayMinutes: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ArrTime: double (nullable = true)
 |-- ArrDelayMinutes: double (nullable = true)
 |-- AirTime: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Year: long (nullable = true)
 |-- Quarter: long (nullable = true)
 |-- Month: long (nullable = true)
 |-- DayofMonth: long (nullable = true)
 |-- DayOfWeek: long (nullable = true)
 |-- Marketing_Airline_Network: string (nullable = true)
 |-- Operated_or_Branded_Code_Share_Partners: string (nullable = true)
 |-- DOT_ID_Mar

In [81]:
# Keep the DataFrame's column names (a plain Python list of strings) for later reference.
columns = df.columns

In [82]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

In [83]:
from pyspark.sql import SparkSession

# Obtain the Spark session; getOrCreate() hands back an already-running
# session when there is one instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("FlightDelayETL")
    .getOrCreate()
)

# Read the dataset (expected to be present at this relative path already).
df = spark.read.parquet("Combined_Flights_2018.parquet")

# Preview five rows to get a feel for the layout.
df.show(5)


+-------------------+-----------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+-----------------+
|         FlightDate|          Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|D

In [12]:
from pyspark.sql.functions import col, to_timestamp, when

# Clean and normalise the flight data.
#
# Cancelled / Diverted are recoded to 1/0 through an explicit boolean cast.
# The printed schema shows both as boolean columns, so comparing them with the
# string "Yes" only matched by way of Spark's implicit type coercion; the cast
# states the intent directly and also accepts string flags such as "true".
# Anything that is null (or casts to null) falls through to otherwise() -> 0.

df_clean = (
    df
    .withColumn("FlightDate", to_timestamp(col("FlightDate")))
    .withColumn("Cancelled", when(col("Cancelled").cast("boolean"), 1).otherwise(0))
    .withColumn("Diverted", when(col("Diverted").cast("boolean"), 1).otherwise(0))
    # A missing delay value is replaced by 0.
    .withColumn("DepDelay", when(col("DepDelay").isNull(), 0).otherwise(col("DepDelay")))
    .withColumn("ArrDelay", when(col("ArrDelay").isNull(), 0).otherwise(col("ArrDelay")))
    # Pin the numeric column types explicitly.
    .withColumn("Distance", col("Distance").cast("double"))
    .withColumn("Year", col("Year").cast("long"))
    .withColumn("Month", col("Month").cast("long"))
    .withColumn("DayofMonth", col("DayofMonth").cast("long"))
    .withColumn("DayOfWeek", col("DayOfWeek").cast("long"))
    .withColumn("DepTime", col("DepTime").cast("double"))
    .withColumn("ArrTime", col("ArrTime").cast("double"))
    .withColumn("ArrDelayMinutes", col("ArrDelayMinutes").cast("double"))
    # Rows lacking a date, origin or destination are discarded.
    .dropna(subset=["FlightDate", "Origin", "Dest"])
)

# Show the transformed data (first few rows)
df_clean.show(5)


+-------------------+-----------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+-----------------+
|         FlightDate|          Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|D

In [14]:
# Persist the cleaned DataFrame to HDFS as Parquet, replacing any previous output.
output_path = "hdfs:///user/talentum/cleaned_flight_data.parquet"
df_clean.write.mode("overwrite").parquet(output_path)

# Read the saved files back and preview them as a sanity check.
df_loaded = spark.read.parquet(output_path)
df_loaded.show(5)


+-------------------+-----------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+-----------------+
|         FlightDate|          Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|D

In [15]:
# Keep only the flights that were not cancelled (Cancelled == 0).
df_no_cancellation = df_clean.filter(col("Cancelled") == 0)

# Average arrival delay per airline, computed on the non-cancelled flights.
# The aggregated DataFrame is kept in `avg_delay_per_airline` so it can be
# reused; .show() returns None, so it is called as a separate statement.
avg_delay_per_airline = df_no_cancellation.groupBy("Airline").avg("ArrDelayMinutes")
avg_delay_per_airline.show()


+--------------------+--------------------+
|             Airline|avg(ArrDelayMinutes)|
+--------------------+--------------------+
|GoJet Airlines, L...|  16.987283181446898|
|   Endeavor Air Inc.|  14.456738946486722|
|       Allegiant Air|   17.54758803260778|
|SkyWest Airlines ...|  15.656154350049608|
|      Virgin America|  11.950970675683553|
|         Horizon Air|    8.27978126451982|
|United Air Lines ...|  14.477765219583194|
|Air Wisconsin Air...|  17.881933762360696|
|Trans States Airl...|  22.335766423357665|
|    Compass Airlines|  14.836996418979409|
|Peninsula Airways...|  15.569902048085485|
|         Comair Inc.|  12.789145931049102|
|Frontier Airlines...|  22.355794184968047|
|            Cape Air|   5.390332326283988|
|Southwest Airline...|  11.119075494383111|
|ExpressJet Airlin...|   17.85813960668087|
|     JetBlue Airways|  19.820095513806514|
|Commutair Aka Cha...|   29.28407567733855|
|Empire Airlines Inc.|  14.458483321334294|
|           Envoy Air|  12.53122

In [115]:
from pyspark.sql import SparkSession

# HDFS location of the raw 2018 flights CSV input.
csv_path = "hdfs:///user/talentum/flight_2018/Flights_2018_1"

# Hive-enabled Spark session used for the CSV stage of the notebook.
spark = (
    SparkSession.builder
    .appName("CSV to Hive")
    .enableHiveSupport()
    .getOrCreate()
)

In [116]:
# Columns removed right after loading the CSV.
# The Div1..Div5 entries share the same eight suffixes, so they are generated
# instead of being spelled out; the resulting order matches a hand-written list.
cols_to_drop = (
    ["DivReachedDest", "DivActualElapsedTime", "DivArrDelay", "DivDistance"]
    + [
        f"Div{leg}{suffix}"
        for leg in range(1, 6)
        for suffix in (
            "Airport", "AirportID", "AirportSeqID", "WheelsOn",
            "TotalGTime", "LongestGTime", "WheelsOff", "TailNum",
        )
    ]
    + [
        "Duplicate",
        "Originally_Scheduled_Code_Share_Airline",
        "DOT_ID_Originally_Scheduled_Code_Share_Airline",
        "IATA_Code_Originally_Scheduled_Code_Share_Airline",
        "Flight_Num_Originally_Scheduled_Code_Share_Airline",
        "_c119",
        "Tail_Number",
        "CancellationCode",
    ]
)
print(len(cols_to_drop))



52


In [117]:
# Load the CSV using its first line as the header (no schema inference is
# requested), then discard the columns listed in cols_to_drop.
df = spark.read.csv(csv_path, header=True).drop(*cols_to_drop)
# Number of columns that remain after the drop.
len(df.columns)

68

In [118]:
# Number of columns currently in df.
len(df.columns)

68

In [119]:
# Print each column's name, data type and nullability for the CSV-loaded frame.
df.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- FlightDate: string (nullable = true)
 |-- Marketing_Airline_Network: string (nullable = true)
 |-- Operated_or_Branded_Code_Share_Partners: string (nullable = true)
 |-- DOT_ID_Marketing_Airline: string (nullable = true)
 |-- IATA_Code_Marketing_Airline: string (nullable = true)
 |-- Flight_Number_Marketing_Airline: string (nullable = true)
 |-- Operating_Airline : string (nullable = true)
 |-- DOT_ID_Operating_Airline: string (nullable = true)
 |-- IATA_Code_Operating_Airline: string (nullable = true)
 |-- Flight_Number_Operating_Airline: string (nullable = true)
 |-- OriginAirportID: string (nullable = true)
 |-- OriginAirportSeqID: string (nullable = true)
 |-- OriginCityMarketID: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- OriginCityName: string (nullable =

In [120]:
from pyspark.sql import functions as F

# Count nulls for every column in a single aggregation (one Spark job rather
# than one per column).
# The loop variable is named `c`, not `col`, so it does not overwrite
# pyspark.sql.functions.col for the cells that call col(...) later.
null_count_row = df.select(
    [F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in df.columns]
).first()

# The aggregated row has one value per column, in df.columns order.
for c, null_count in zip(df.columns, null_count_row):
    print(f"Column: {c}, Null count: {null_count}")


Column: Year, Null count: 0
Column: Quarter, Null count: 0
Column: Month, Null count: 0
Column: DayofMonth, Null count: 0
Column: DayOfWeek, Null count: 0
Column: FlightDate, Null count: 0
Column: Marketing_Airline_Network, Null count: 0
Column: Operated_or_Branded_Code_Share_Partners, Null count: 0
Column: DOT_ID_Marketing_Airline, Null count: 0
Column: IATA_Code_Marketing_Airline, Null count: 0
Column: Flight_Number_Marketing_Airline, Null count: 0
Column: Operating_Airline , Null count: 0
Column: DOT_ID_Operating_Airline, Null count: 0
Column: IATA_Code_Operating_Airline, Null count: 0
Column: Flight_Number_Operating_Airline, Null count: 0
Column: OriginAirportID, Null count: 0
Column: OriginAirportSeqID, Null count: 0
Column: OriginCityMarketID, Null count: 0
Column: Origin, Null count: 0
Column: OriginCityName, Null count: 0
Column: OriginState, Null count: 0
Column: OriginStateFips, Null count: 0
Column: OriginStateName, Null count: 0
Column: OriginWac, Null count: 0
Column: Dest

In [121]:
# 1-2. Gather every imputation statistic in ONE aggregation so the data is
#      scanned once instead of six times:
#        * approximate median (50th percentile) of the four timing columns
#        * mean, rounded to 0 decimals, of TaxiOut and TaxiIn
#      NOTE: percentile_approx(x, 0.5) is the (approximate) median, not the
#      mode. The mode_* variable names are kept unchanged only so that any
#      existing reference to them keeps working.
impute_stats = df.agg(
    F.expr("percentile_approx(WheelsOff, 0.5)").alias("wheels_off"),
    F.expr("percentile_approx(WheelsOn, 0.5)").alias("wheels_on"),
    F.expr("percentile_approx(AirTime, 0.5)").alias("air_time"),
    F.expr("percentile_approx(ActualElapsedTime, 0.5)").alias("actual_elapsed_time"),
    F.round(F.avg("TaxiOut"), 0).alias("taxi_out"),
    F.round(F.avg("TaxiIn"), 0).alias("taxi_in"),
).first()

mode_wheels_off = impute_stats["wheels_off"]
mode_wheels_on = impute_stats["wheels_on"]
mode_air_time = impute_stats["air_time"]
mode_actual_elapsed_time = impute_stats["actual_elapsed_time"]
mean_taxi_out = impute_stats["taxi_out"]
mean_taxi_in = impute_stats["taxi_in"]

# 3. Fill missing values with the median / mean computed above
df_filled = df.fillna({
    "WheelsOff": mode_wheels_off,
    "WheelsOn": mode_wheels_on,
    "AirTime": mode_air_time,
    "ActualElapsedTime": mode_actual_elapsed_time,
    "TaxiOut": mean_taxi_out,
    "TaxiIn": mean_taxi_in
})

# 4. List of columns to impute with 0 (and cast them to double if needed)
columns_to_impute = ["LongestAddGTime", "TotalAddGTime", "FirstDepTime", "LateAircraftDelay",
                     "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "ArrDelay", "ArrDelayMinutes",
                     "ArrDel15", "DepDelay", "DepDelayMinutes", "DepDel15"]

# Cast columns to double before filling them with 0: fillna() with a numeric
# value only touches numeric columns, and the CSV was loaded as strings.
from pyspark.sql.functions import col

for col_name in columns_to_impute:
    df_filled = df_filled.withColumn(col_name, col(col_name).cast("double"))

# 5. Fill missing values in these columns with 0
df_filled = df_filled.fillna(0, subset=columns_to_impute)

# 6. Convert ArrivalDelayGroups and DepartureDelayGroups to double before filling with -1
df_filled = df_filled.withColumn("ArrivalDelayGroups", col("ArrivalDelayGroups").cast("double")) \
                     .withColumn("DepartureDelayGroups", col("DepartureDelayGroups").cast("double"))

# 7. Fill missing values in ArrivalDelayGroups and DepartureDelayGroups with -1
df_filled = df_filled.fillna(-1, subset=["ArrivalDelayGroups", "DepartureDelayGroups"])

In [122]:
# Pull out the cancelled flights (Cancelled == 1) from the imputed frame and preview five of them.
df_cancelled = df_filled.where(df_filled["Cancelled"] == 1)
df_cancelled.show(5)

+----+-------+-----+----------+---------+----------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+------------------+------------------------+---------------------------+-------------------------------+---------------+------------------+------------------+------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+----+--------------------+---------+-------------+--------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+--------+--------------+-----------------+-------+-------+--------+-------------+------------+------------+--------+-------------+-----------------+------------+-------------+---------------+------------------+
|Year|Quarter

In [123]:
# Restrict df itself to rows whose Cancelled value is not 1, then preview it.
# Note that this rebinds df to the filtered frame.
df = df.where(df["Cancelled"] != 1)
df.show()

+----+-------+-----+----------+---------+----------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+------------------+------------------------+---------------------------+-------------------------------+---------------+------------------+------------------+------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+----+------------+---------+-------------+-------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+--------+--------------+-----------------+-------+-------+--------+-------------+------------+------------+--------+-------------+-----------------+------------+-------------+---------------+------------------+
|Year|Quarter|Month|DayofMon

In [124]:
# Re-check nulls after imputation, using a single aggregation (one Spark job
# rather than one per column).
null_count_row = df_filled.select(
    [F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in df_filled.columns]
).first()

# The aggregated row has one value per column, in df_filled.columns order.
for c, null_count in zip(df_filled.columns, null_count_row):
    print(f"Column: {c}, Null count: {null_count}")

Column: Year, Null count: 0
Column: Quarter, Null count: 0
Column: Month, Null count: 0
Column: DayofMonth, Null count: 0
Column: DayOfWeek, Null count: 0
Column: FlightDate, Null count: 0
Column: Marketing_Airline_Network, Null count: 0
Column: Operated_or_Branded_Code_Share_Partners, Null count: 0
Column: DOT_ID_Marketing_Airline, Null count: 0
Column: IATA_Code_Marketing_Airline, Null count: 0
Column: Flight_Number_Marketing_Airline, Null count: 0
Column: Operating_Airline , Null count: 0
Column: DOT_ID_Operating_Airline, Null count: 0
Column: IATA_Code_Operating_Airline, Null count: 0
Column: Flight_Number_Operating_Airline, Null count: 0
Column: OriginAirportID, Null count: 0
Column: OriginAirportSeqID, Null count: 0
Column: OriginCityMarketID, Null count: 0
Column: Origin, Null count: 0
Column: OriginCityName, Null count: 0
Column: OriginState, Null count: 0
Column: OriginStateFips, Null count: 0
Column: OriginStateName, Null count: 0
Column: OriginWac, Null count: 0
Column: Dest

In [126]:
# Keep only the rows of the imputed frame whose 'Cancelled' value is not 1.
df_nc = df_filled.where(df_filled["Cancelled"] != 1)


In [114]:
# Preview the first 26 rows of the non-cancelled flights.
df_nc.show(26)

+----+-------+-----+----------+---------+----------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+------------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+----+------------+---------+-------------+-------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+----------------+--------+--------------+-----------------+-------+-------+--------+-------------+------------+------------+--------+-------------+-----------------+------------+-------------+---------------+------------------+

In [142]:
from pyspark.sql.functions import when
# Where DepTime / ArrTime is null, substitute the matching CRS* column's value;
# non-null values are left as they are.
for target, fallback in (("DepTime", "CRSDepTime"), ("ArrTime", "CRSArrTime")):
    df_nc = df_nc.withColumn(
        target,
        when(col(target).isNull(), col(fallback)).otherwise(col(target)),
    )


In [143]:
# Final null check on the non-cancelled frame, using a single aggregation
# (one Spark job rather than one per column).
null_count_row = df_nc.select(
    [F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in df_nc.columns]
).first()

# The aggregated row has one value per column, in df_nc.columns order.
for c, null_count in zip(df_nc.columns, null_count_row):
    print(f"Column: {c}, Null count: {null_count}")

Column: Year, Null count: 0
Column: Quarter, Null count: 0
Column: Month, Null count: 0
Column: DayofMonth, Null count: 0
Column: DayOfWeek, Null count: 0
Column: FlightDate, Null count: 0
Column: Marketing_Airline_Network, Null count: 0
Column: Operated_or_Branded_Code_Share_Partners, Null count: 0
Column: DOT_ID_Marketing_Airline, Null count: 0
Column: IATA_Code_Marketing_Airline, Null count: 0
Column: Flight_Number_Marketing_Airline, Null count: 0
Column: Operating_Airline , Null count: 0
Column: DOT_ID_Operating_Airline, Null count: 0
Column: IATA_Code_Operating_Airline, Null count: 0
Column: Flight_Number_Operating_Airline, Null count: 0
Column: OriginAirportID, Null count: 0
Column: OriginAirportSeqID, Null count: 0
Column: OriginCityMarketID, Null count: 0
Column: Origin, Null count: 0
Column: OriginCityName, Null count: 0
Column: OriginState, Null count: 0
Column: OriginStateFips, Null count: 0
Column: OriginStateName, Null count: 0
Column: OriginWac, Null count: 0
Column: Dest

In [None]:
# Destination in HDFS. Spark writes a directory of part files at this path,
# not a single .csv file.
output_path_hdfs = "hdfs:///user/talentum/flight_data.csv"

# Save the DataFrame as CSV (with a header row) in HDFS.
# mode("overwrite") makes the cell re-runnable: with the default save mode the
# write fails as soon as the path already exists. This also matches the
# overwrite behaviour of the Parquet export earlier in the notebook.
# NOTE(review): this writes df_filled, which still contains the cancelled
# flights and does not include the DepTime/ArrTime fallback applied to df_nc;
# confirm which frame is meant to be exported.
df_filled.write.mode("overwrite").option("header", "true").csv(output_path_hdfs)