In [4]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .appName("Flight Delay Analysis")
    .master("local[*]")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/05 15:07:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load every CSV in the airline_data directory; the first row of each file is
# the header and Spark is asked to infer the column types.
df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("hdfs://localhost:9000/user/ketkimaddiwar/airline_data/*.csv")
)

# Preview the first rows, then print the inferred schema.
df.show(n=5)
df.printSchema()


25/05/05 15:07:47 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----+-------+-----+----------+---------+----------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+----+------------+---------+-------------+-------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+----------------+--------+--------------+-----------------+-------+-------+--------+-------------+------------+------------+--------+-------------+-----------------+------------+-------------+---------------+------------------+--------------+--------------------+-----------+-----------+-----------+-------------+----------------+------------+--------------+----------------+----

25/05/05 15:07:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Year, Quarter, Month, DayofMonth, DayOfWeek, FlightDate, Reporting_Airline, DOT_ID_Reporting_Airline, IATA_CODE_Reporting_Airline, Tail_Number, Flight_Number_Reporting_Airline, OriginAirportID, OriginAirportSeqID, OriginCityMarketID, Origin, OriginCityName, OriginState, OriginStateFips, OriginStateName, OriginWac, DestAirportID, DestAirportSeqID, DestCityMarketID, Dest, DestCityName, DestState, DestStateFips, DestStateName, DestWac, CRSDepTime, DepTime, DepDelay, DepDelayMinutes, DepDel15, DepartureDelayGroups, DepTimeBlk, TaxiOut, WheelsOff, WheelsOn, TaxiIn, CRSArrTime, ArrTime, ArrDelay, ArrDelayMinutes, ArrDel15, ArrivalDelayGroups, ArrTimeBlk, Cancelled, CancellationCode, Diverted, CRSElapsedTime, ActualElapsedTime, AirTime, Flights, Distance, DistanceGroup, CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay, FirstDepTime, TotalAddGTime, LongestAddGTime, DivAirportLandi

In [6]:
from pyspark.sql.functions import col

# Flights whose WeatherDelay value is strictly positive.
weather_delay_df = df.where(df["WeatherDelay"] > 0)

# Size of that subset (this triggers a Spark job).
weather_delay_count = weather_delay_df.count()

print("Total number of flights delayed due to weather:", weather_delay_count)




Total number of flights delayed due to weather: 88905




In [7]:
# Columns kept for the weather-focused view of the flights table.
minimal_columns = [
    "FlightDate",
    "Origin",
    "Dest",
    "DepDelay",
    "ArrDelay",
    "WeatherDelay",
]


In [8]:
from pyspark.sql.functions import col

# Project the weather-delayed flights onto the columns named in `minimal_columns`.
df_weather_focused = weather_delay_df.select(*minimal_columns)
df_weather_focused.show(n=5)


+----------+------+----+--------+--------+------------+
|FlightDate|Origin|Dest|DepDelay|ArrDelay|WeatherDelay|
+----------+------+----+--------+--------+------------+
|2024-06-21|   FSD| MSP|    33.0|    27.0|         7.0|
|2024-06-29|   MCI| JFK|    22.0|    29.0|        14.0|
|2024-06-25|   ROC| DTW|    48.0|    38.0|         7.0|
|2024-06-20|   PWM| LGA|   146.0|   138.0|       138.0|
|2024-06-07|   DSM| MSP|    35.0|    41.0|        13.0|
+----------+------+----+--------+--------+------------+
only showing top 5 rows



In [15]:
import pandas as pd

# Read the airport reference table, discarding rows that have no IATA code.
airports_df = pd.read_csv("airports.csv").dropna(subset=["iata_code"])

# Map each IATA code to its (latitude, longitude) pair. If a code occurs more
# than once, the last occurrence in the file wins.
iata_coords = {
    code: (lat, lon)
    for code, lat, lon in zip(
        airports_df["iata_code"],
        airports_df["latitude_deg"],
        airports_df["longitude_deg"],
    )
}

# Print the first five entries as a quick sanity check.
print("Example:", dict(list(iata_coords.items())[:5]))


Example: {'UTK': (11.222219, 169.851429), 'OCA': (25.325399398804, -80.274803161621), 'CUX': (27.7211, -97.512802), 'CSE': (38.851918, -106.928341), 'CUS': (31.823898, -107.629924)}


In [16]:
from pyspark.sql.functions import col, to_date

# Parse FlightDate into a date column using the yyyy-MM-dd pattern.
df = df.withColumn("FlightDate", to_date(df["FlightDate"], "yyyy-MM-dd"))

# Rebuild the weather-delay subset on top of the converted frame.
weather_delay_df = df.where(df["WeatherDelay"] > 0)

# Distinct airport codes seen as either origin or destination of those flights,
# collected to the driver as a plain Python list.
airport_codes = [
    row[0]
    for row in (
        weather_delay_df.select("Origin")
        .union(weather_delay_df.select("Dest"))
        .distinct()
        .collect()
    )
]

print("Airports with weather delays:", airport_codes)




Airports with weather delays: ['MSY', 'GEG', 'SNA', 'BUR', 'GRB', 'IDA', 'GRR', 'PVU', 'MYR', 'GSO', 'PVD', 'OAK', 'FAR', 'MSN', 'FSM', 'MQT', 'DCA', 'RFD', 'CID', 'MLU', 'IAG', 'RDM', 'LEX', 'ORF', 'KTN', 'EVV', 'CWA', 'SAV', 'GCK', 'TRI', 'CMH', 'CAK', 'MOB', 'PNS', 'LIH', 'IAH', 'HNL', 'SHV', 'CVG', 'SJC', 'TOL', 'LGA', 'TLH', 'BUF', 'ACT', 'HPN', 'AUS', 'MLI', 'SJU', 'ATW', 'AVL', 'GJT', 'BFL', 'RNO', 'SRQ', 'EYW', 'SBN', 'RST', 'JAC', 'CHS', 'HGR', 'TUL', 'RSW', 'HRL', 'AMA', 'BOS', 'MLB', 'MAF', 'EWR', 'LAS', 'BIS', 'ITO', 'JAN', 'XNA', 'HHH', 'DLH', 'DEN', 'ALB', 'OME', 'PSP', 'SBA', 'BOI', 'IAD', 'BMI', 'SEA', 'CMI', 'VLD', 'PSM', 'MCI', 'GRK', 'CLT', 'BNA', 'CLL', 'ORH', 'PBI', 'ABQ', 'PIE', 'SDF', 'BDL', 'ITH', 'MRY', 'DAL', 'USA', 'CLE', 'PDX', 'MIA', 'BWI', 'TPA', 'ROA', 'OKC', 'SMF', 'SFB', 'ABI', 'ELM', 'PHX', 'FCA', 'PWM', 'STL', 'DFW', 'ABE', 'TXK', 'GSP', 'STX', 'LBB', 'CRP', 'EGE', 'FSD', 'SWF', 'SFO', 'MEM', 'SAF', 'ELP', 'BHM', 'ATL', 'FLL', 'FNT', 'AZA', 'RIC', 'LC

                                                                                

In [19]:
def fetch_weather_for_airport(iata_code, start_date, end_date, max_stations=5):
    """Fetch daily weather observations for the station(s) nearest an airport.

    The airport's coordinates are looked up in the module-level ``iata_coords``
    mapping. Up to ``max_stations`` nearby stations are tried, in the order
    returned by the station lookup, and the first one that yields any daily
    rows for the requested window is used. Previously only the single nearest
    station was queried, so an airport was dropped whenever that one station
    had no daily data for the period.

    Args:
        iata_code: IATA airport code used as the key into ``iata_coords``.
        start_date: Start of the period, passed through to ``Daily``.
        end_date: End of the period, passed through to ``Daily``.
        max_stations: Maximum number of nearby stations to try. Pass 1 to get
            the original nearest-station-only behaviour.

    Returns:
        A pandas DataFrame with the columns ``WeatherDate``, ``Airport``,
        ``TempAvg``, ``VisibilityKm`` and ``WindSpeedKph``. An empty DataFrame
        is returned (and a message printed) when the airport has no known
        coordinates, no nearby station, or no data at any tried station.
    """
    if iata_code not in iata_coords:
        print(f"❌ No coords for {iata_code}")
        return pd.DataFrame()

    lat, lon = iata_coords[iata_code]
    stations = Stations().nearby(lat, lon).fetch(max_stations)

    if stations.empty:
        print(f"❌ No station near {iata_code}")
        return pd.DataFrame()

    # Walk the candidate stations and stop at the first one that has data.
    daily = pd.DataFrame()
    for station_id in stations.index:
        daily = Daily(station_id, start_date, end_date).fetch()
        if not daily.empty:
            break

    if daily.empty:
        print(f"❌ No data for {iata_code}")
        return pd.DataFrame()

    daily = daily.reset_index()
    daily["Airport"] = iata_code

    # Source column -> output column. A source column that is absent is
    # created as all-None so the returned frame always has the same columns.
    # NOTE(review): the outputs captured in this notebook show VisibilityKm as
    # NULL for the rows displayed, which suggests the daily data has no "vis"
    # column — confirm, and consider another source if visibility is needed.
    optional_columns = {
        "tavg": "TempAvg",
        "wspd": "WindSpeedKph",
        "vis": "VisibilityKm",
    }
    for source_name in optional_columns:
        if source_name not in daily.columns:
            daily[source_name] = None

    daily = daily.rename(columns={"time": "WeatherDate", **optional_columns})

    return daily[["WeatherDate", "Airport", "TempAvg", "VisibilityKm", "WindSpeedKph"]]


In [20]:
# Weather window: 1 June 2024 through 30 June 2024.
start, end = datetime(2024, 6, 1), datetime(2024, 6, 30)

# One weather frame per airport; airports that return no rows are skipped.
weather_dataframes = []

for code in airport_codes:
    print(f"📡 Fetching weather for {code}...")
    wdf = fetch_weather_for_airport(code, start, end)
    if wdf.empty:
        continue
    weather_dataframes.append(wdf)


📡 Fetching weather for MSY...
📡 Fetching weather for GEG...
📡 Fetching weather for SNA...
📡 Fetching weather for BUR...
📡 Fetching weather for GRB...
📡 Fetching weather for IDA...
📡 Fetching weather for GRR...
📡 Fetching weather for PVU...
📡 Fetching weather for MYR...
📡 Fetching weather for GSO...
📡 Fetching weather for PVD...
📡 Fetching weather for OAK...
📡 Fetching weather for FAR...
📡 Fetching weather for MSN...
📡 Fetching weather for FSM...
❌ No data for FSM
📡 Fetching weather for MQT...
📡 Fetching weather for DCA...
📡 Fetching weather for RFD...
📡 Fetching weather for CID...
📡 Fetching weather for MLU...
📡 Fetching weather for IAG...
📡 Fetching weather for RDM...
📡 Fetching weather for LEX...
📡 Fetching weather for ORF...
📡 Fetching weather for KTN...
📡 Fetching weather for EVV...
📡 Fetching weather for CWA...
📡 Fetching weather for SAV...
📡 Fetching weather for GCK...
📡 Fetching weather for TRI...
📡 Fetching weather for CMH...
📡 Fetching weather for CAK...
📡 Fetching weather for



❌ No data for HNL
📡 Fetching weather for SHV...
📡 Fetching weather for CVG...
📡 Fetching weather for SJC...
📡 Fetching weather for TOL...
📡 Fetching weather for LGA...
📡 Fetching weather for TLH...
📡 Fetching weather for BUF...
📡 Fetching weather for ACT...
📡 Fetching weather for HPN...
📡 Fetching weather for AUS...
📡 Fetching weather for MLI...
📡 Fetching weather for SJU...
📡 Fetching weather for ATW...
📡 Fetching weather for AVL...
📡 Fetching weather for GJT...
📡 Fetching weather for BFL...
📡 Fetching weather for RNO...
📡 Fetching weather for SRQ...
📡 Fetching weather for EYW...
📡 Fetching weather for SBN...
📡 Fetching weather for RST...
📡 Fetching weather for JAC...
📡 Fetching weather for CHS...
📡 Fetching weather for HGR...
📡 Fetching weather for TUL...
📡 Fetching weather for RSW...
📡 Fetching weather for HRL...
📡 Fetching weather for AMA...
📡 Fetching weather for BOS...
📡 Fetching weather for MLB...
📡 Fetching weather for MAF...
📡 Fetching weather for EWR...
📡 Fetching weather for

📡 Fetching weather for CYS...
📡 Fetching weather for HLN...
📡 Fetching weather for DIK...
📡 Fetching weather for OTH...
📡 Fetching weather for GUC...
📡 Fetching weather for RDD...
📡 Fetching weather for RKS...
📡 Fetching weather for ABY...
📡 Fetching weather for WRG...
📡 Fetching weather for LBE...
📡 Fetching weather for LWS...
📡 Fetching weather for JMS...
📡 Fetching weather for CKB...
📡 Fetching weather for SHR...
📡 Fetching weather for SMX...
📡 Fetching weather for BGM...
📡 Fetching weather for PSE...
📡 Fetching weather for MEI...
📡 Fetching weather for HYS...
📡 Fetching weather for TWF...
📡 Fetching weather for PIH...
📡 Fetching weather for GST...
📡 Fetching weather for SPN...
📡 Fetching weather for EWN...
📡 Fetching weather for BIH...
📡 Fetching weather for HYA...
📡 Fetching weather for ACK...
📡 Fetching weather for EKO...
📡 Fetching weather for MCW...
📡 Fetching weather for EAR...
📡 Fetching weather for EAU...
📡 Fetching weather for JST...
📡 Fetching weather for FOD...
📡 Fetching

In [22]:
# Combine the per-airport frames into one pandas DataFrame with a fresh index.
all_weather_df = pd.concat(weather_dataframes, ignore_index=True)

# Save locally if needed
all_weather_df.to_csv("weather_data.csv", index=False)

# Convert to Spark DataFrame
import pyspark.sql.types as T

# Turn missing measurements into real None values.
# On recent pandas versions, DataFrame.where(cond, None) leaves NaN in float
# columns because a float column cannot hold None. Those NaN values would
# reach Spark as NaN rather than NULL, and NaN passes isNotNull() and compares
# greater than any number. Casting each numeric column to object first lets
# None be stored, so missing values arrive in Spark as NULL.
numeric_columns = ["TempAvg", "VisibilityKm", "WindSpeedKph"]
all_weather_df = all_weather_df.assign(
    **{
        name: all_weather_df[name]
        .astype(object)
        .where(all_weather_df[name].notna(), None)
        for name in numeric_columns
    }
)

# Define schema explicitly
weather_schema = T.StructType([
    T.StructField("WeatherDate", T.DateType(), True),
    T.StructField("Airport", T.StringType(), True),
    T.StructField("TempAvg", T.FloatType(), True),
    T.StructField("VisibilityKm", T.FloatType(), True),
    T.StructField("WindSpeedKph", T.FloatType(), True),
])

# Convert to Spark DataFrame
weather_spark_df = spark.createDataFrame(all_weather_df, schema=weather_schema)
weather_spark_df.show(5)



+-----------+-------+-------+------------+------------+
|WeatherDate|Airport|TempAvg|VisibilityKm|WindSpeedKph|
+-----------+-------+-------+------------+------------+
| 2024-06-01|    MSY|   26.8|        NULL|        11.2|
| 2024-06-02|    MSY|   25.8|        NULL|         9.7|
| 2024-06-03|    MSY|   28.1|        NULL|        12.2|
| 2024-06-04|    MSY|   28.6|        NULL|        16.2|
| 2024-06-05|    MSY|   27.1|        NULL|        13.0|
+-----------+-------+-------+------------+------------+
only showing top 5 rows



In [26]:
# Weather table relabelled for the origin side of each flight.
origin_weather = (
    weather_spark_df
    .withColumnRenamed("Airport", "Origin")
    .withColumnRenamed("TempAvg", "Origin_TempAvg")
    .withColumnRenamed("VisibilityKm", "Origin_VisibilityKm")
    .withColumnRenamed("WindSpeedKph", "Origin_WindSpeedKph")
)

# Left join keeps every weather-delayed flight, matched on flight date and
# origin airport; the duplicated join keys from the weather side are dropped.
flights_with_origin_weather = (
    weather_delay_df
    .join(
        origin_weather,
        on=[
            weather_delay_df.FlightDate == origin_weather.WeatherDate,
            weather_delay_df.Origin == origin_weather.Origin,
        ],
        how="left",
    )
    .drop(origin_weather["Origin"])
    .drop(origin_weather["WeatherDate"])
)


In [27]:
# Weather table relabelled for the destination side of each flight.
dest_weather = (
    weather_spark_df
    .withColumnRenamed("Airport", "Dest")
    .withColumnRenamed("TempAvg", "Dest_TempAvg")
    .withColumnRenamed("VisibilityKm", "Dest_VisibilityKm")
    .withColumnRenamed("WindSpeedKph", "Dest_WindSpeedKph")
)

# Left join on flight date and destination airport, then drop the duplicated
# join keys that came from the weather side.
flights_with_full_weather = (
    flights_with_origin_weather
    .join(
        dest_weather,
        on=[
            flights_with_origin_weather.FlightDate == dest_weather.WeatherDate,
            flights_with_origin_weather.Dest == dest_weather.Dest,
        ],
        how="left",
    )
    .drop(dest_weather["Dest"])
    .drop(dest_weather["WeatherDate"])
)


In [28]:
# Preview the flight fields next to the origin and destination weather columns.
(
    flights_with_full_weather
    .select([
        "FlightDate", "Origin", "Dest", "DepDelay", "ArrDelay", "WeatherDelay",
        "Origin_TempAvg", "Origin_VisibilityKm", "Origin_WindSpeedKph",
        "Dest_TempAvg", "Dest_VisibilityKm", "Dest_WindSpeedKph",
    ])
    .show(n=10)
)


+----------+------+----+--------+--------+------------+--------------+-------------------+-------------------+------------+-----------------+-----------------+
|FlightDate|Origin|Dest|DepDelay|ArrDelay|WeatherDelay|Origin_TempAvg|Origin_VisibilityKm|Origin_WindSpeedKph|Dest_TempAvg|Dest_VisibilityKm|Dest_WindSpeedKph|
+----------+------+----+--------+--------+------------+--------------+-------------------+-------------------+------------+-----------------+-----------------+
|2024-06-25|   ROC| DTW|    48.0|    38.0|         7.0|          22.3|               NULL|               17.6|        22.6|             NULL|             19.1|
|2024-06-07|   DSM| MSP|    35.0|    41.0|        13.0|          22.3|               NULL|                7.9|        19.5|             NULL|             13.3|
|2024-06-14|   ILM| LGA|   110.0|    90.0|        77.0|          26.1|               NULL|               11.9|        23.0|             NULL|              7.0|
|2024-06-29|   MCI| JFK|    22.0|    29.

In [31]:
from pyspark.sql.functions import when, col, isnan

origin_wind = col("Origin_WindSpeedKph")
origin_temp = col("Origin_TempAvg")

# A reading is usable only when it is neither NULL nor NaN. Spark orders NaN
# above every other numeric value, so a NaN wind speed would otherwise satisfy
# `> 25` and be labelled "Windy" (and a NaN temperature "Hot").
wind_known = origin_wind.isNotNull() & ~isnan(origin_wind)
temp_known = origin_temp.isNotNull() & ~isnan(origin_temp)

# Label the origin weather for each flight. Rules are evaluated in order and
# the first match wins:
#   wind speed > 25 kph       -> "Windy"
#   average temperature < 5   -> "Cold"
#   average temperature > 30  -> "Hot"
#   neither reading available -> "Unknown"
#   anything else             -> "Clear"
flights_classified = flights_with_full_weather.withColumn(
    "Origin_WeatherCondition",
    when(wind_known & (origin_wind > 25), "Windy")
    .when(temp_known & (origin_temp < 5), "Cold")
    .when(temp_known & (origin_temp > 30), "Hot")
    .when(~temp_known & ~wind_known, "Unknown")
    .otherwise("Clear"),
)


In [35]:
# Show up to 200 classified flights with the origin readings behind each label,
# without truncating cell contents.
(
    flights_classified
    .select([
        "FlightDate", "Origin", "DepDelay", "WeatherDelay",
        "Origin_TempAvg", "Origin_WindSpeedKph", "Origin_WeatherCondition",
    ])
    .show(n=200, truncate=False)
)


[Stage 69:=>(20 + 5) / 25][Stage 70:>   (0 + 3) / 8][Stage 71:>   (0 + 0) / 8][Stage 69:=>(22 + 3) / 25][Stage 70:==> (5 + 3) / 8][Stage 71:>   (0 + 2) / 8]

+----------+------+--------+------------+--------------+-------------------+-----------------------+
|FlightDate|Origin|DepDelay|WeatherDelay|Origin_TempAvg|Origin_WindSpeedKph|Origin_WeatherCondition|
+----------+------+--------+------------+--------------+-------------------+-----------------------+
|2024-06-05|XNA   |128.0   |39.0        |22.2          |14.1               |Clear                  |
|2024-06-25|ROC   |48.0    |7.0         |22.3          |17.6               |Clear                  |
|2024-06-04|TRI   |80.0    |64.0        |22.3          |4.3                |Clear                  |
|2024-06-17|CWA   |94.0    |94.0        |22.2          |11.6               |Clear                  |
|2024-06-30|SAV   |19.0    |19.0        |29.5          |10.9               |Clear                  |
|2024-06-03|DSM   |20.0    |15.0        |25.4          |16.9               |Clear                  |
|2024-06-05|FAR   |42.0    |42.0        |18.6          |27.7               |Windy          

