In [18]:
import os
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DoubleType, TimestampType
from pyspark.sql.functions import col, to_date, concat, lit
os.environ["SPARK_HOME"] = "/home/mate/.local/lib/python3.10/site-packages/pyspark/"
os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "notebook"


In [19]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local") \
    .appName("My Spark Application") \
    .getOrCreate()


In [20]:
# Example: Creating a DataFrame and showing its content
df = spark.createDataFrame([(1, 'foo'), (2, 'bar')], ["ID", "Value"])
df.show()


+---+-----+
| ID|Value|
+---+-----+
|  1|  foo|
|  2|  bar|
+---+-----+



In [21]:
from pyspark.sql.functions import col, dayofweek,to_date, month, count, avg
from pyspark.sql import Window
from pyspark.sql.functions import row_number,   sum, when


# Load the CSV file into a DataFrame
csv_file_path_flightdelay = "./full_data_flightdelay.csv"  # Replace with the path to your CSV file


df_flightdelay = spark.read.option("delimiter", ",").option("header", "true").csv(csv_file_path_flightdelay)


# Read the CSV file using the manually defined schema
csv_file_path_weather = "./airport_weather_2019.csv"  # Replace with your file path
df_weather = spark.read.option("delimiter", ",").option("header", "true").csv(csv_file_path_weather)


In [72]:
df_flightdelay.printSchema()
df_weather.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- DISTANCE_GROUP: string (nullable = true)
 |-- SEGMENT_NUMBER: string (nullable = true)
 |-- CONCURRENT_FLIGHTS: string (nullable = true)
 |-- NUMBER_OF_SEATS: string (nullable = true)
 |-- CARRIER_NAME: string (nullable = true)
 |-- AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRPORT: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRLINE: string (nullable = true)
 |-- FLT_ATTENDANTS_PER_PASS: string (nullable = true)
 |-- GROUND_SERV_PER_PASS: string (nullable = true)
 |-- PLANE_AGE: string (nullable = true)
 |-- DEPARTING_AIRPORT: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- PREVIOUS_AIRPORT: string (nullable = 

In [76]:
print("Unique airports in weather:")
df_weather.select('NAME').distinct().count()

Unique airports in weather:


106

In [78]:
print("Unique airports in delays:")
df_flightdelay.select('DEPARTING_AIRPORT').distinct().count()


Unique airports in delays:




+--------------------+
|   DEPARTING_AIRPORT|
+--------------------+
|     Eppley Airfield|
|     Kahului Airport|
|Greater Buffalo I...|
|Sacramento Intern...|
|Chicago O'Hare In...|
|   Will Rogers World|
|Raleigh-Durham In...|
|Minneapolis-St Pa...|
|Metropolitan Oakl...|
|Southwest Florida...|
|Long Beach Daughe...|
|  Birmingham Airport|
|San Antonio Inter...|
|Cincinnati/Northe...|
|           LaGuardia|
|Savannah/Hilton H...|
|     William P Hobby|
|Philadelphia Inte...|
| Miami International|
|        McGhee Tyson|
|Anchorage Interna...|
|             Keahole|
|San Diego Interna...|
|John F. Kennedy I...|
|Theodore Francis ...|
|    Standiford Field|
|Honolulu Internat...|
|Port Columbus Int...|
|Los Angeles Inter...|
|Austin - Bergstro...|
|Newark Liberty In...|
|Dallas Fort Worth...|
|Tucson International|
|El Paso Internati...|
|Puerto Rico Inter...|
|Hollywood-Burbank...|
|       Lihue Airport|
|Kansas City Inter...|
|Orlando Internati...|
|Lambert-St. Louis...|
| Logan Int

                                                                                

In [91]:
df_flightdelay.select('DEPARTING_AIRPORT').distinct().write.csv(path='./delays_departing_airports.csv', header=True)
df_weather.select('NAME').distinct().write.csv(path='./weather_airport_name.csv', header=True)

                                                                                

In [46]:
# Cast Month column to integer
# Convert the string column to integer
df_flightdelay = df_flightdelay.withColumn("MONTH_INT", col("MONTH").cast("integer"))

# Show the modified DataFrame with the new integer column
print("DataFrame after Type Conversion:")
df_flightdelay.show(n=2)

# Optionally, you might want to replace the original column with the casted one
df_flightdelay = df_flightdelay.drop("MONTH").withColumnRenamed("MONTH_INT", "MONTH")
print("DataFrame with Replaced Column:")
df_flightdelay.show(n=2)

DataFrame after Type Conversion:


24/05/01 20:54:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
24/05/01 20:55:06 WARN CSVHeaderChecker: CSV he

+---+-----------+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+------------------------+------------------------+-----------------------+--------------------+---------+--------------------+--------+---------+--------------------+---------------+-----+---------+
|_c0|DAY_OF_WEEK|DEP_DEL15|DEP_TIME_BLK|DISTANCE_GROUP|SEGMENT_NUMBER|CONCURRENT_FLIGHTS|NUMBER_OF_SEATS|        CARRIER_NAME|AIRPORT_FLIGHTS_MONTH|AIRLINE_FLIGHTS_MONTH|AIRLINE_AIRPORT_FLIGHTS_MONTH|AVG_MONTHLY_PASS_AIRPORT|AVG_MONTHLY_PASS_AIRLINE|FLT_ATTENDANTS_PER_PASS|GROUND_SERV_PER_PASS|PLANE_AGE|   DEPARTING_AIRPORT|LATITUDE|LONGITUDE|    PREVIOUS_AIRPORT|normalized_name|MONTH|MONTH_INT|
+---+-----------+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+--



+---+-----------+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+------------------------+------------------------+-----------------------+--------------------+---------+--------------------+--------+---------+--------------------+---------------+-----+
|_c0|DAY_OF_WEEK|DEP_DEL15|DEP_TIME_BLK|DISTANCE_GROUP|SEGMENT_NUMBER|CONCURRENT_FLIGHTS|NUMBER_OF_SEATS|        CARRIER_NAME|AIRPORT_FLIGHTS_MONTH|AIRLINE_FLIGHTS_MONTH|AIRLINE_AIRPORT_FLIGHTS_MONTH|AVG_MONTHLY_PASS_AIRPORT|AVG_MONTHLY_PASS_AIRLINE|FLT_ATTENDANTS_PER_PASS|GROUND_SERV_PER_PASS|PLANE_AGE|   DEPARTING_AIRPORT|LATITUDE|LONGITUDE|    PREVIOUS_AIRPORT|normalized_name|MONTH|
+---+-----------+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+----------------------

                                                                                

In [47]:
df_flightdelay.printSchema()


root
 |-- _c0: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- DISTANCE_GROUP: string (nullable = true)
 |-- SEGMENT_NUMBER: string (nullable = true)
 |-- CONCURRENT_FLIGHTS: string (nullable = true)
 |-- NUMBER_OF_SEATS: string (nullable = true)
 |-- CARRIER_NAME: string (nullable = true)
 |-- AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRPORT: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRLINE: string (nullable = true)
 |-- FLT_ATTENDANTS_PER_PASS: string (nullable = true)
 |-- GROUND_SERV_PER_PASS: string (nullable = true)
 |-- PLANE_AGE: string (nullable = true)
 |-- DEPARTING_AIRPORT: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- PREVIOUS_AIRPORT: string (nullable = 

In [48]:
from pyspark.sql.functions import coalesce

# create new column for month and day_of_week values derived from date
df_day_column = df_weather.withColumn("DATE_NEW", to_date(col("DATE"), "M/d/yyyy"))
df_day_column = df_day_column.withColumn("DATE_NEW", coalesce(df_day_column["DATE_NEW"], to_date(df_day_column["DATE"], 'yyyy-MM-dd')))
    
df_day_column = df_day_column.withColumn("DAY_OF_WEEK", dayofweek(col("DATE_NEW").alias("DAY_OF_WEEK")))
df_day_column = df_day_column.withColumn("MONTH", month(col("DATE_NEW").alias("MONTH")))
#df_day_column = df_weather.withColumn("DAY_OF_WEEK", dayofweek(col("DATE").alias("DAY_OF_WEEK"))) 




#df_day_column.show(n=2)
df_day_column.createOrReplaceTempView("table1")
df_select = spark.sql("SELECT STATION, NAME,DAY_OF_WEEK,DATE, MONTH, AWND, PRCP, SNOW, SNWD, TAVG, TMAX, TMIN, WDF2 from table1")
#df_select.show(n=5)

grouped_df = df_select.groupBy("MONTH", "NAME").agg(
    avg("AWND").alias("AWND"),
    avg("PRCP").alias("PRCP"),
    avg("SNOW").alias("SNOW"),
    avg("SNWD").alias("SNWD"),
    avg("TAVG").alias("TAVG"),
    avg("TMAX").alias("TMAX"),
    avg("TMIN").alias("TMIN"),
    avg("WDF2").alias("WDF2")
).orderBy("NAME","MONTH")


grouped_df.show(n=20)

+-----+--------------------+------------------+--------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+
|MONTH|                NAME|              AWND|                PRCP|               SNOW|               SNWD|              TAVG|              TMAX|              TMIN|              WDF2|
+-----+--------------------+------------------+--------------------+-------------------+-------------------+------------------+------------------+------------------+------------------+
|    1|ALBANY INTERNATIO...| 9.604516129032257| 0.13838709677419353| 0.6096774193548388| 2.5064516129032257|23.967741935483872| 31.93548387096774|14.709677419354838|231.29032258064515|
|    2|ALBANY INTERNATIO...| 8.803214285714287| 0.09571428571428572| 0.5035714285714284|              2.525|27.857142857142858|35.392857142857146|             19.75|233.57142857142858|
|    3|ALBANY INTERNATIO...| 9.698064516129032|0.045000000000000005|0.21290

In [49]:
from pyspark.sql.functions import lower, split

# Normalize joining columns
grouped_df = grouped_df.withColumn("normalized_name", lower(col("name")))
df_flightdelay = df_flightdelay.withColumn("normalized_name", lower(split(col("departing_airport"), " ").getItem(0)))

# Group by to investigate
grouped_df_nn = grouped_df.groupBy("normalized_name").agg(
    count('*').alias('count')
)

grouped_df_name = grouped_df.groupBy("NAME").agg(
    count('*').alias('count')
)

print(grouped_df_nn.count())
print(grouped_df_name.count())

# Only unique values
grouped_df = grouped_df.distinct()
df_flightdelay = df_flightdelay.distinct()


# Join dataframes grouped_df and df_flightdelay
grouped_df.show(n=2)
df_flightdelay.show(n=2)
joined_df = df_flightdelay.alias('f').join(
    grouped_df.alias('g'),
    (col('g.month') == col('f.month')) & (col('g.normalized_name')).contains(col('f.normalized_name')), 'inner'
)

#joined_df.show(n=2)

106
106
+-----+--------------------+-----------------+--------------------+--------------------+----+-----------------+-----------------+------------------+------------------+--------------------+
|MONTH|                NAME|             AWND|                PRCP|                SNOW|SNWD|             TAVG|             TMAX|              TMIN|              WDF2|     normalized_name|
+-----+--------------------+-----------------+--------------------+--------------------+----+-----------------+-----------------+------------------+------------------+--------------------+
|    9|COLORADO SPRINGS ...|9.975999999999999|0.010666666666666668|                 0.0| 0.0|68.53333333333333|84.03333333333333|53.333333333333336|190.33333333333334|colorado springs ...|
|   11|SPRINGFIELD WEATH...|9.499333333333334| 0.10966666666666666|0.013333333333333334| 0.0|             42.6|             55.0|31.766666666666666|             223.0|springfield weath...|
+-----+--------------------+-----------------+-

24/05/01 20:55:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
[Stage 180:>                                   

+---+-----------+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+------------------------+------------------------+-----------------------+--------------------+---------+--------------------+--------+---------+--------------------+---------------+-----+
|_c0|DAY_OF_WEEK|DEP_DEL15|DEP_TIME_BLK|DISTANCE_GROUP|SEGMENT_NUMBER|CONCURRENT_FLIGHTS|NUMBER_OF_SEATS|        CARRIER_NAME|AIRPORT_FLIGHTS_MONTH|AIRLINE_FLIGHTS_MONTH|AIRLINE_AIRPORT_FLIGHTS_MONTH|AVG_MONTHLY_PASS_AIRPORT|AVG_MONTHLY_PASS_AIRLINE|FLT_ATTENDANTS_PER_PASS|GROUND_SERV_PER_PASS|PLANE_AGE|   DEPARTING_AIRPORT|LATITUDE|LONGITUDE|    PREVIOUS_AIRPORT|normalized_name|MONTH|
+---+-----------+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+----------------------

                                                                                

In [50]:
print(joined_df.count())
print(df_flightdelay.count())

24/05/01 20:56:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
24/05/01 20:57:57 WARN CSVHeaderChecker: CSV he

12337982




6489062


                                                                                

In [51]:


fractions = {label: 0.1 for label in joined_df.select("DEP_DEL15").distinct().rdd.flatMap(lambda x: x).collect()}
sampled_df = joined_df.stat.sampleBy("DEP_DEL15", fractions, seed=1234)

# Show the sampled data distribution
sampled_df.groupBy("DEP_DEL15").count().show()

# Split the DataFrame into training (60%) and test (40%) sets
train_df, test_df = sampled_df.randomSplit([0.6, 0.4], seed=1234)

# Show the size of each set
print("Training Dataset Count: " + str(train_df.count()))
print("Testing Dataset Count: " + str(test_df.count()))


24/05/01 20:58:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
24/05/01 20:59:59 WARN CSVHeaderChecker: CSV he

+---------+-------+
|DEP_DEL15|  count|
+---------+-------+
|        0|1016318|
|        1| 216975|
+---------+-------+



24/05/01 21:02:50 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv


Training Dataset Count: 739242




Testing Dataset Count: 494051


                                                                                

In [52]:
# Numerical to nominal
# Calculate the quantile thresholds
thresholds = joined_df.approxQuantile("PRCP", [0.33, 0.67], 0.01)  # 0.01 is the relative error

# Categorize based on quantile thresholds
joined_df = joined_df.withColumn(
    "precip_category",
    when(col("PRCP") <= thresholds[0], "low")
    .when(col("PRCP") <= thresholds[1], "medium")
    .otherwise("high")
)

# Show the resulting DataFrame
joined_df.select("PRCP", "precip_category").show()

24/05/01 21:04:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
24/05/01 21:05:40 WARN CSVHeaderChecker: CSV he

+-------------------+---------------+
|               PRCP|precip_category|
+-------------------+---------------+
|0.03354838709677419|            low|
|0.03354838709677419|            low|
| 0.1129032258064516|         medium|
| 0.1129032258064516|         medium|
|0.05741935483870968|         medium|
| 0.1235483870967742|         medium|
| 0.1235483870967742|         medium|
| 0.1235483870967742|         medium|
|0.06322580645161291|         medium|
|0.06322580645161291|         medium|
|0.06451612903225806|         medium|
|0.06451612903225806|         medium|
|0.06451612903225806|         medium|
|0.06451612903225806|         medium|
|0.06451612903225806|         medium|
|0.13580645161290322|           high|
|0.13580645161290322|           high|
|0.13580645161290322|           high|
|0.13580645161290322|           high|
| 0.0932258064516129|         medium|
+-------------------+---------------+
only showing top 20 rows



                                                                                

In [53]:
joined_df.printSchema()
joined_df.show(n=2)

24/05/01 21:06:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv


root
 |-- _c0: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- DISTANCE_GROUP: string (nullable = true)
 |-- SEGMENT_NUMBER: string (nullable = true)
 |-- CONCURRENT_FLIGHTS: string (nullable = true)
 |-- NUMBER_OF_SEATS: string (nullable = true)
 |-- CARRIER_NAME: string (nullable = true)
 |-- AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRPORT: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRLINE: string (nullable = true)
 |-- FLT_ATTENDANTS_PER_PASS: string (nullable = true)
 |-- GROUND_SERV_PER_PASS: string (nullable = true)
 |-- PLANE_AGE: string (nullable = true)
 |-- DEPARTING_AIRPORT: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- PREVIOUS_AIRPORT: string (nullable = 

[Stage 250:>                                                        (0 + 1) / 1]

+---+-----------+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+------------------------+------------------------+-----------------------+--------------------+---------+--------------------+--------+---------+--------------------+---------------+-----+-----+--------------------+-----------------+-------------------+----+----+------------------+------------------+-----------------+------------------+--------------------+---------------+
|_c0|DAY_OF_WEEK|DEP_DEL15|DEP_TIME_BLK|DISTANCE_GROUP|SEGMENT_NUMBER|CONCURRENT_FLIGHTS|NUMBER_OF_SEATS|        CARRIER_NAME|AIRPORT_FLIGHTS_MONTH|AIRLINE_FLIGHTS_MONTH|AIRLINE_AIRPORT_FLIGHTS_MONTH|AVG_MONTHLY_PASS_AIRPORT|AVG_MONTHLY_PASS_AIRLINE|FLT_ATTENDANTS_PER_PASS|GROUND_SERV_PER_PASS|PLANE_AGE|   DEPARTING_AIRPORT|LATITUDE|LONGITUDE|    PREVIOUS_AIRPORT|normalized_name|MONTH|MONTH|                NAME|         

                                                                                

In [62]:
# Cut off one MONTH column
# First, see all columns including duplicates
print("Columns before operation:", joined_df.columns)

# Select columns. If 'age' appears twice, only the first occurrence is kept in the output
selected_columns = [col for col in joined_df.columns if col != 'MONTH' and col != 'normalized_name'] + ['f.MONTH'] + ['f.normalized_name']

joined_df = joined_df.select(selected_columns)

# Show the modified DataFrame
print("Columns after operation:", joined_df.columns)
joined_df.show()

Columns before operation: ['_c0', 'DAY_OF_WEEK', 'DEP_DEL15', 'DEP_TIME_BLK', 'DISTANCE_GROUP', 'SEGMENT_NUMBER', 'CONCURRENT_FLIGHTS', 'NUMBER_OF_SEATS', 'CARRIER_NAME', 'AIRPORT_FLIGHTS_MONTH', 'AIRLINE_FLIGHTS_MONTH', 'AIRLINE_AIRPORT_FLIGHTS_MONTH', 'AVG_MONTHLY_PASS_AIRPORT', 'AVG_MONTHLY_PASS_AIRLINE', 'FLT_ATTENDANTS_PER_PASS', 'GROUND_SERV_PER_PASS', 'PLANE_AGE', 'DEPARTING_AIRPORT', 'LATITUDE', 'LONGITUDE', 'PREVIOUS_AIRPORT', 'normalized_name', 'MONTH', 'MONTH', 'NAME', 'AWND', 'PRCP', 'SNOW', 'SNWD', 'TAVG', 'TMAX', 'TMIN', 'WDF2', 'normalized_name', 'precip_category']
Columns after operation: ['_c0', 'DAY_OF_WEEK', 'DEP_DEL15', 'DEP_TIME_BLK', 'DISTANCE_GROUP', 'SEGMENT_NUMBER', 'CONCURRENT_FLIGHTS', 'NUMBER_OF_SEATS', 'CARRIER_NAME', 'AIRPORT_FLIGHTS_MONTH', 'AIRLINE_FLIGHTS_MONTH', 'AIRLINE_AIRPORT_FLIGHTS_MONTH', 'AVG_MONTHLY_PASS_AIRPORT', 'AVG_MONTHLY_PASS_AIRLINE', 'FLT_ATTENDANTS_PER_PASS', 'GROUND_SERV_PER_PASS', 'PLANE_AGE', 'DEPARTING_AIRPORT', 'LATITUDE', 'LONGIT

24/05/01 21:14:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
[Stage 256:>                                   

+----+-----------+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+------------------------+------------------------+-----------------------+--------------------+---------+--------------------+--------+---------+--------------------+--------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+---------------+-----+---------------+
| _c0|DAY_OF_WEEK|DEP_DEL15|DEP_TIME_BLK|DISTANCE_GROUP|SEGMENT_NUMBER|CONCURRENT_FLIGHTS|NUMBER_OF_SEATS|        CARRIER_NAME|AIRPORT_FLIGHTS_MONTH|AIRLINE_FLIGHTS_MONTH|AIRLINE_AIRPORT_FLIGHTS_MONTH|AVG_MONTHLY_PASS_AIRPORT|AVG_MONTHLY_PASS_AIRLINE|FLT_ATTENDANTS_PER_PASS|GROUND_SERV_PER_PASS|PLANE_AGE|   DEPARTING_AIRPORT|LATITUDE|LONGITUDE|    PREVIOUS_AIRPORT|                NAME|              AWND|             

                                                                                

In [67]:
joined_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- DISTANCE_GROUP: string (nullable = true)
 |-- SEGMENT_NUMBER: string (nullable = true)
 |-- CONCURRENT_FLIGHTS: string (nullable = true)
 |-- NUMBER_OF_SEATS: string (nullable = true)
 |-- CARRIER_NAME: string (nullable = true)
 |-- AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRPORT: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRLINE: string (nullable = true)
 |-- FLT_ATTENDANTS_PER_PASS: string (nullable = true)
 |-- GROUND_SERV_PER_PASS: string (nullable = true)
 |-- PLANE_AGE: string (nullable = true)
 |-- DEPARTING_AIRPORT: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- PREVIOUS_AIRPORT: string (nullable = 

In [70]:
# Transform month from numerical to nominal
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
# Month mapping dictionary
month_dict = {
    '1': 'January', '2': 'February', '3': 'March', '4': 'April', 
    '5': 'May', '6': 'June', '7': 'July', '8': 'August', 
    '9': 'September', '10': 'October', '11': 'November', '12': 'December'
}

# Define the UDF to convert numerical months to names
def convert_month_to_name(month):
    return month_dict.get(str(month), "Unknown")
#print("DataFrame Columns:", joined_df.columns)
#joined_df.show(5)
convert_month_udf = udf(convert_month_to_name, StringType())

# Apply the UDF to create a new column with month names
df_with_months = joined_df.withColumn("MONTH_NAME", convert_month_udf(joined_df["MONTH"]))



24/05/01 21:48:51 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
[Stage 274:>                                   

+----+-----------+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+------------------------+------------------------+-----------------------+--------------------+---------+--------------------+--------+---------+--------------------+--------------------+------------------+-------------------+----+------------------+------------------+------------------+------------------+------------------+---------------+-----+---------------+----------+
| _c0|DAY_OF_WEEK|DEP_DEL15|DEP_TIME_BLK|DISTANCE_GROUP|SEGMENT_NUMBER|CONCURRENT_FLIGHTS|NUMBER_OF_SEATS|        CARRIER_NAME|AIRPORT_FLIGHTS_MONTH|AIRLINE_FLIGHTS_MONTH|AIRLINE_AIRPORT_FLIGHTS_MONTH|AVG_MONTHLY_PASS_AIRPORT|AVG_MONTHLY_PASS_AIRLINE|FLT_ATTENDANTS_PER_PASS|GROUND_SERV_PER_PASS|PLANE_AGE|   DEPARTING_AIRPORT|LATITUDE|LONGITUDE|    PREVIOUS_AIRPORT|                NAME|              AWND|               P

                                                                                

In [80]:
#df_with_months.show(n=5)
df_with_months.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- DISTANCE_GROUP: string (nullable = true)
 |-- SEGMENT_NUMBER: string (nullable = true)
 |-- CONCURRENT_FLIGHTS: string (nullable = true)
 |-- NUMBER_OF_SEATS: string (nullable = true)
 |-- CARRIER_NAME: string (nullable = true)
 |-- AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRPORT: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRLINE: string (nullable = true)
 |-- FLT_ATTENDANTS_PER_PASS: string (nullable = true)
 |-- GROUND_SERV_PER_PASS: string (nullable = true)
 |-- PLANE_AGE: string (nullable = true)
 |-- DEPARTING_AIRPORT: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- PREVIOUS_AIRPORT: string (nullable = 

In [83]:
# Transform weekday from numerical to nominal

# Weekday mapping dictionary
month_dict = {
    '1': 'Monday', '2': 'Tuesday', '3': 'Wednesday', '4': 'Thursday', 
    '5': 'Friday', '6': 'Saturday', '7': 'Sunday'}

# Define the UDF to convert numerical months to names
def convert_weekday_to_name(weekday):
    return month_dict.get(str(weekday), "Unknown")
#print("DataFrame Columns:", joined_df.columns)
#joined_df.show(5)
convert_weekday_udf = udf(convert_weekday_to_name, StringType())

# Apply the UDF to create a new column with month names
df_with_months = df_with_months.withColumn("DAY_OF_WEEK_NAME", convert_weekday_udf(joined_df["DAY_OF_WEEK"]))
#df_with_months.show(n=1)
df_with_months = df_with_months.drop("DAY_OF_WEEK").withColumnRenamed("DAY_OF_WEEK_NAME", "DAY_OF_WEEK")


24/05/01 22:43:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
[Stage 349:>                                   

+---+-----------+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+------------------------+------------------------+-----------------------+--------------------+---------+--------------------+--------+---------+-----------------+--------------------+-----------------+-------------------+----+----+------------------+------------------+-----------------+------------------+---------------+-----+---------------+----------+----------------+
|_c0|DAY_OF_WEEK|DEP_DEL15|DEP_TIME_BLK|DISTANCE_GROUP|SEGMENT_NUMBER|CONCURRENT_FLIGHTS|NUMBER_OF_SEATS|        CARRIER_NAME|AIRPORT_FLIGHTS_MONTH|AIRLINE_FLIGHTS_MONTH|AIRLINE_AIRPORT_FLIGHTS_MONTH|AVG_MONTHLY_PASS_AIRPORT|AVG_MONTHLY_PASS_AIRLINE|FLT_ATTENDANTS_PER_PASS|GROUND_SERV_PER_PASS|PLANE_AGE|   DEPARTING_AIRPORT|LATITUDE|LONGITUDE| PREVIOUS_AIRPORT|                NAME|             AWND|               PRCP|SNOW

                                                                                

In [85]:
df_with_months.printSchema()
df_with_months.select('DAY_OF_WEEK').show(n=1)

root
 |-- _c0: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- DISTANCE_GROUP: string (nullable = true)
 |-- SEGMENT_NUMBER: string (nullable = true)
 |-- CONCURRENT_FLIGHTS: string (nullable = true)
 |-- NUMBER_OF_SEATS: string (nullable = true)
 |-- CARRIER_NAME: string (nullable = true)
 |-- AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_FLIGHTS_MONTH: string (nullable = true)
 |-- AIRLINE_AIRPORT_FLIGHTS_MONTH: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRPORT: string (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRLINE: string (nullable = true)
 |-- FLT_ATTENDANTS_PER_PASS: string (nullable = true)
 |-- GROUND_SERV_PER_PASS: string (nullable = true)
 |-- PLANE_AGE: string (nullable = true)
 |-- DEPARTING_AIRPORT: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- PREVIOUS_AIRPORT: string (nullable = true)
 |-- NAME: string (nullable = true)
 

24/05/01 22:45:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
[Stage 355:>                                   

+-----------+
|DAY_OF_WEEK|
+-----------+
|     Sunday|
+-----------+
only showing top 1 row



                                                                                

In [86]:
df_with_months.select("DISTANCE_GROUP").distinct().count()

24/05/01 22:46:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
                                               

12

In [87]:
# Plane age into nominal

# Categorize based on research
df_with_months = df_with_months.withColumn(
    "PLANE_AGE_NOM",
    when(col("PLANE_AGE") <= 10, "New")
    .when(col("PLANE_AGE") <= 20, "Standard")
    .otherwise("Old")
)

# Replace PLANE_AGE column with the nominal one
df_with_months = df_with_months.drop("PLANE_AGE").withColumnRenamed("PLANE_AGE_NOM", "PLANE_AGE")
df_with_months.show(n=2)

In [89]:
df_with_months.select('PLANE_AGE', "PLANE_AGE_NOM").distinct().show(n=20)

24/05/01 22:56:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv

+---------+-------------+
|PLANE_AGE|PLANE_AGE_NOM|
+---------+-------------+
|        1|          New|
|       29|          Old|
|        6|          New|
|        3|          New|
|       24|          Old|
|       31|          Old|
|       30|          Old|
|        0|          New|
|       27|          Old|
|        4|          New|
|       13|     Standard|
|       10|          New|
|       21|          Old|
|       20|     Standard|
|       17|     Standard|
|       15|     Standard|
|        9|          New|
|       16|     Standard|
|       11|     Standard|
|       19|     Standard|
+---------+-------------+
only showing top 20 rows



                                                                                

24/05/01 23:03:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
 Schema: _c0, MONTH, DAY_OF_WEEK, DEP_DEL15, DEP_TIME_BLK, DISTANCE_GROUP, SEGMENT_NUMBER, CONCURRENT_FLIGHTS, NUMBER_OF_SEATS, CARRIER_NAME, AIRPORT_FLIGHTS_MONTH, AIRLINE_FLIGHTS_MONTH, AIRLINE_AIRPORT_FLIGHTS_MONTH, AVG_MONTHLY_PASS_AIRPORT, AVG_MONTHLY_PASS_AIRLINE, FLT_ATTENDANTS_PER_PASS, GROUND_SERV_PER_PASS, PLANE_AGE, DEPARTING_AIRPORT, LATITUDE, LONGITUDE, PREVIOUS_AIRPORT
Expected: _c0 but found: 
CSV file: file:///home/mate/repos/LuckaEvolucne/full_data_flightdelay.csv
                                               

+---+---------+------------+--------------+--------------+------------------+---------------+--------------------+---------------------+---------------------+-----------------------------+------------------------+------------------------+-----------------------+--------------------+--------------------+--------+---------+--------------------+--------------------+-----------------+-------------------+----+----+------------------+------------------+-----------------+------------------+---------------+-----+---------------+----------+-----------+---------+
|_c0|DEP_DEL15|DEP_TIME_BLK|DISTANCE_GROUP|SEGMENT_NUMBER|CONCURRENT_FLIGHTS|NUMBER_OF_SEATS|        CARRIER_NAME|AIRPORT_FLIGHTS_MONTH|AIRLINE_FLIGHTS_MONTH|AIRLINE_AIRPORT_FLIGHTS_MONTH|AVG_MONTHLY_PASS_AIRPORT|AVG_MONTHLY_PASS_AIRLINE|FLT_ATTENDANTS_PER_PASS|GROUND_SERV_PER_PASS|   DEPARTING_AIRPORT|LATITUDE|LONGITUDE|    PREVIOUS_AIRPORT|                NAME|             AWND|               PRCP|SNOW|SNWD|              TAVG|        

                                                                                