In [0]:
import os
with open('tokens_and_api.txt', 'r') as file:
    exec(file.read())

#sas_token = tokens in tokens.txt
storage_account_name = "newadbprojektkakastorage"
container_name = "data"

spark.conf.set(
    f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net", "SAS"
)
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{storage_account_name}.dfs.core.windows.net",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
)
spark.conf.set(
    f"fs.azure.sas.fixed.token.{storage_account_name}.dfs.core.windows.net",
    sas_token
)

base_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
print(f"Skonfigurowano dostęp do: {base_path}")

Skonfigurowano dostęp do: abfss://data@newadbprojektkakastorage.dfs.core.windows.net


In [0]:
from pyspark.sql.functions import col, to_timestamp
from pyspark.sql.functions import col, to_timestamp, hour, month, when, dayofweek

df_raw = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(f"{base_path}/final_results/df_weather_jfk_with_upper_outliers.csv")

display(df_raw.limit(5))

DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DEPARTURE_DELAY,DISTANCE,CANCELLED,ORIGIN_AIRPORT,WEATHER_DELAY,ScheduledTimestamp,DeparturedTimestamp,HOURLYPrecip,HOURLYVISIBILITY,HOURLYWindSpeed
1,DL,2338,-3,1069,0,JFK,,2015-01-05T16:00:00Z,2015-01-05T15:57:00Z,0.0,10.0,15.0
1,AA,67,9,1598,0,JFK,,2015-01-05T16:05:00Z,2015-01-05T16:14:00Z,0.0,10.0,15.0
1,AA,65,5,1391,0,JFK,,2015-01-05T16:15:00Z,2015-01-05T16:20:00Z,0.0,10.0,15.0
1,VX,27,2,2586,0,JFK,,2015-01-05T16:30:00Z,2015-01-05T16:32:00Z,0.0,10.0,15.0
1,AA,133,0,2475,0,JFK,,2015-01-05T16:30:00Z,2015-01-05T16:30:00Z,0.0,10.0,15.0


In [0]:
df_base = df_raw.filter((col("DEPARTURE_DELAY") >= 0) & (col("CANCELLED") == 0))

df_prepared = df_base.withColumn("IS_DELAYED", when(col("DEPARTURE_DELAY") > 15, 1).otherwise(0)) \
                     .withColumn('ScheduledTimestamp', to_timestamp('ScheduledTimestamp')) \
                     .withColumn('HOUR', hour(col('ScheduledTimestamp'))) \
                     .withColumn('MONTH', month(col('ScheduledTimestamp'))) \
                     .withColumn('DAY_OF_WEEK', dayofweek(col('ScheduledTimestamp')))

cols_to_remove = ['FLIGHT_NUMBER', 'CANCELLED', 'WEATHER_DELAY', 'DeparturedTimestamp', 'ORIGIN_AIRPORT', 'ScheduledTimestamp']
df_final = df_prepared.drop(*cols_to_remove)

feature_cols = ["DISTANCE", "HOURLYVISIBILITY", "HOURLYWindSpeed", "DAY_OF_WEEK", "HOUR", "MONTH"]

display(df_final.limit(5))

DAY_OF_WEEK,AIRLINE,DEPARTURE_DELAY,DISTANCE,HOURLYPrecip,HOURLYVISIBILITY,HOURLYWindSpeed,IS_DELAYED,HOUR,MONTH
2,AA,9,1598,0.0,10.0,15.0,0,16,1
2,AA,5,1391,0.0,10.0,15.0,0,16,1
2,VX,2,2586,0.0,10.0,15.0,0,16,1
2,AA,0,2475,0.0,10.0,15.0,0,16,1
2,B6,0,187,0.0,10.0,15.0,0,16,1


In [0]:
from pyspark.sql.functions import log1p, expm1
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

df_reg_final = df_final.filter(col("DEPARTURE_DELAY") > 0) \
                       .withColumn("LABEL_LOG", log1p(col("DEPARTURE_DELAY")))

train_reg, test_reg = df_reg_final.randomSplit([0.8, 0.2], seed=123)

indexer_reg = StringIndexer(inputCol="AIRLINE", outputCol="AIRLINE_INDEX", handleInvalid="keep")
encoder_reg = OneHotEncoder(inputCol="AIRLINE_INDEX", outputCol="AIRLINE_VEC")
assembler_reg = VectorAssembler(inputCols=feature_cols + ["AIRLINE_VEC"], outputCol="raw_features")
scaler_reg = StandardScaler(inputCol="raw_features", outputCol="features")
gbt = GBTRegressor(labelCol="LABEL_LOG", featuresCol="features", maxIter=30)

pipeline_reg = Pipeline(stages=[indexer_reg, encoder_reg, assembler_reg, scaler_reg, gbt])
model_reg = pipeline_reg.fit(train_reg) 
predictions_reg = model_reg.transform(test_reg)

predictions_reg = predictions_reg.withColumn("prediction_minutes", expm1(col("prediction")))

eval_r2 = RegressionEvaluator(labelCol="DEPARTURE_DELAY", predictionCol="prediction_minutes", metricName="r2")
eval_rmse = RegressionEvaluator(labelCol="DEPARTURE_DELAY", predictionCol="prediction_minutes", metricName="rmse")
eval_mae = RegressionEvaluator(labelCol="DEPARTURE_DELAY", predictionCol="prediction_minutes", metricName="mae")

print(f"R2: {eval_r2.evaluate(predictions_reg):.4f}")
print(f"RMSE: {eval_rmse.evaluate(predictions_reg):.2f} min")
print(f"MAE:  {eval_mae.evaluate(predictions_reg):.2f} min")

R2: -0.1064
RMSE: 4.89 min
MAE:  3.89 min


In [0]:
def print_importances(model_obj, name):
    print(f"\nRanking cech dla: {name}")
    #ostatni stage to model
    importances = model_obj.stages[-1].featureImportances
    #OneHotEncoder rozbija AIRLINE na wiele cech binarnych, dla uproszczenia w raporcie sumujemy je jako wpływ linii lotniczej.
    feat_names = feature_cols + ["AIRLINE_VEC_DUMMIES"] 
    for imp, n in sorted(zip(importances, feat_names), reverse=True):
        print(f"{n}: {imp:.4f}")

print_importances(model_reg, "REGRESJA (GBT)")


Ranking cech dla: REGRESJA (GBT)
DISTANCE: 0.2062
HOURLYWindSpeed: 0.2058
HOUR: 0.1630
MONTH: 0.1170
DAY_OF_WEEK: 0.1091
HOURLYVISIBILITY: 0.0919
AIRLINE_VEC_DUMMIES: 0.0243


In [0]:
df_reg_final_nolog = df_final.filter(col("DEPARTURE_DELAY") > 0)

train_reg, test_reg = df_reg_final_nolog.randomSplit([0.8, 0.2], seed=123)

gbt_nolog = GBTRegressor(labelCol="DEPARTURE_DELAY", featuresCol="features", maxIter=30)

pipeline_reg_nolog = Pipeline(stages=[indexer_reg, encoder_reg, assembler_reg, scaler_reg, gbt_nolog])
model_reg_nolog = pipeline_reg_nolog.fit(train_reg) 
predictions_reg_nolog = model_reg_nolog.transform(test_reg)

eval_r2 = RegressionEvaluator(labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="r2")
eval_rmse = RegressionEvaluator(labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="rmse")
eval_mae = RegressionEvaluator(labelCol="DEPARTURE_DELAY", predictionCol="prediction", metricName="mae")

print(f"R2: {eval_r2.evaluate(predictions_reg_nolog):.4f}")
print(f"RMSE: {eval_rmse.evaluate(predictions_reg_nolog):.2f} min")
print(f"MAE:  {eval_mae.evaluate(predictions_reg_nolog):.2f} min")

R2: 0.0132
RMSE: 4.62 min
MAE:  3.88 min


In [0]:
print_importances(model_reg_nolog, "REGRESJA (GBT) bez log")


Ranking cech dla: REGRESJA (GBT) bez log
HOURLYWindSpeed: 0.2068
DISTANCE: 0.1730
HOUR: 0.1652
MONTH: 0.1312
DAY_OF_WEEK: 0.1111
HOURLYVISIBILITY: 0.1021
AIRLINE_VEC_DUMMIES: 0.0142
