In [0]:
# Target ADLS Gen2 storage account and container.
storage_account_name = "newadbprojektkakastorage"
container_name = "data"

# SECURITY: the SAS token must never be stored in the notebook source. It is
# read from a Databricks secret scope instead, so it is not committed with the
# notebook and is redacted in cell output.
# NOTE(review): the scope/key names below are placeholders - create them in
# your workspace (or adjust the names) before running. The token that was
# previously hardcoded here should be treated as leaked and revoked/rotated.
SAS_SECRET_SCOPE = "adb-projekt"
SAS_SECRET_KEY = "storage-sas-token"
sas_token = dbutils.secrets.get(scope=SAS_SECRET_SCOPE, key=SAS_SECRET_KEY)

# All ABFS settings are keyed by the account's DFS endpoint host.
account_host = f"{storage_account_name}.dfs.core.windows.net"

# Authenticate against the storage account with a fixed SAS token.
spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "SAS")
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{account_host}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
spark.conf.set(f"fs.azure.sas.fixed.token.{account_host}", sas_token)

# Root URI used by the following cells to address files in the container.
base_path = f"abfss://{container_name}@{account_host}"
print(f"Skonfigurowano dostęp do: {base_path}")

Skonfigurowano dostęp do: abfss://data@newadbprojektkakastorage.dfs.core.windows.net


In [0]:
# Load the prepared JFK flights + weather dataset from the container.
# The first row is the header; column types are inferred by Spark.
df_model = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(f"{base_path}/final_results/df_weather_jfk.csv")
)

# Preview the first few rows.
display(df_model.limit(5))

DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DEPARTURE_DELAY,DISTANCE,CANCELLED,ORIGIN_AIRPORT,WEATHER_DELAY,ScheduledTimestamp,DeparturedTimestamp,HOURLYPrecip,HOURLYVISIBILITY,HOURLYWindSpeed
1,DL,2338,-3,1069,0,JFK,,2015-01-05T16:00:00Z,2015-01-05T15:57:00Z,0.0,10.0,15.0
1,AA,67,9,1598,0,JFK,,2015-01-05T16:05:00Z,2015-01-05T16:14:00Z,0.0,10.0,15.0
1,AA,65,5,1391,0,JFK,,2015-01-05T16:15:00Z,2015-01-05T16:20:00Z,0.0,10.0,15.0
1,VX,27,2,2586,0,JFK,,2015-01-05T16:30:00Z,2015-01-05T16:32:00Z,0.0,10.0,15.0
1,AA,133,0,2475,0,JFK,,2015-01-05T16:30:00Z,2015-01-05T16:30:00Z,0.0,10.0,15.0


In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Numeric inputs for the models: time/identifier columns are left out,
# flight and weather parameters are kept.
feature_cols = ["DISTANCE", "HOURLYVISIBILITY", "HOURLYWindSpeed", "DAY_OF_WEEK"]

# Encode the airline code (text) as a numeric index.
# handleInvalid="skip" drops rows whose AIRLINE value cannot be indexed.
airline_indexer = StringIndexer(
    inputCol="AIRLINE",
    outputCol="AIRLINE_INDEX",
    handleInvalid="skip",
)

# Assemble the numeric columns and the airline index into a single vector.
# handleInvalid="skip" drops rows with invalid (e.g. null) feature values.
assembler = VectorAssembler(
    inputCols=[*feature_cols, "AIRLINE_INDEX"],
    outputCol="raw_features",
    handleInvalid="skip",
)

# Scale the assembled vector; the result is the "features" column used by the models.
scaler = StandardScaler(inputCol="raw_features", outputCol="features")

In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 80/20 train/test split with a fixed seed.
train_data, test_data = df_model.randomSplit([0.8, 0.2], seed=123)

# Random forest predicting the CANCELLED flag from the scaled feature vector.
rf_class = RandomForestClassifier(
    featuresCol="features",
    labelCol="CANCELLED",
    numTrees=50,
)

# Classification pipeline: index airline -> assemble -> scale -> classify.
classification_stages = [airline_indexer, assembler, scaler, rf_class]
pipeline_class = Pipeline(stages=classification_stages)
model_cancel = pipeline_class.fit(train_data)

# Score the held-out test split.
predictions_cancel = model_cancel.transform(test_data)

# Area under the ROC curve - the closer to 1, the better.
# NOTE(review): the saved output of the last cell shows a single cancelled
# flight in the test split, predicted as not cancelled - AUC alone may be
# misleading with classes this imbalanced; consider also checking areaUnderPR.
evaluator_auc = BinaryClassificationEvaluator(
    labelCol="CANCELLED",
    metricName="areaUnderROC",
)
auc = evaluator_auc.evaluate(predictions_cancel)

print(f"Skuteczność modelu (AUC) dla odwołań: {auc}")
display(predictions_cancel.select("CANCELLED", "prediction", "probability"))

Skuteczność modelu (AUC) dla odwołań: 0.9946198830409356


CANCELLED,prediction,probability
0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.999842475044368, 1.575249556320187E-4))"
0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.999842475044368, 1.575249556320187E-4))"
0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.999842475044368, 1.575249556320187E-4))"
0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9998425696632295, 1.5743033677053945E-4))"
0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9995425958982176, 4.574041017824057E-4))"
0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9995425958982176, 4.574041017824057E-4))"
0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.999842475044368, 1.575249556320187E-4))"
0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.999842475044368, 1.575249556320187E-4))"
0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.999842475044368, 1.575249556320187E-4))"
0,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9995425958982176, 4.574041017824057E-4))"


In [0]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# The delay model uses only non-cancelled flights (CANCELLED == 0).
df_reg = df_model.where(col("CANCELLED") == 0)
train_reg, test_reg = df_reg.randomSplit([0.8, 0.2], seed=123)

# Gradient-boosted trees predicting DEPARTURE_DELAY from the scaled feature vector.
gbt_reg = GBTRegressor(
    featuresCol="features",
    labelCol="DEPARTURE_DELAY",
    maxIter=20,
)

# Regression pipeline: index airline -> assemble -> scale -> regress.
regression_stages = [airline_indexer, assembler, scaler, gbt_reg]
pipeline_reg = Pipeline(stages=regression_stages)
model_delay = pipeline_reg.fit(train_reg)

# Score the held-out test split.
predictions_delay = model_delay.transform(test_reg)

# Root-mean-square error between the actual and the predicted delay.
evaluator_rmse = RegressionEvaluator(
    metricName="rmse",
    labelCol="DEPARTURE_DELAY",
    predictionCol="prediction",
)
rmse = evaluator_rmse.evaluate(predictions_delay)

print(f"Średni błąd przewidywania (RMSE): {rmse} minut")
display(predictions_delay.select("DEPARTURE_DELAY", "prediction"))

Średni błąd przewidywania (RMSE): 5.195487208727045 minut


DEPARTURE_DELAY,prediction
-6,-2.8135583020444463
-6,-2.891414167750666
-5,-2.8135583020444463
5,-2.52226078882262
-7,-3.6593851929929273
-6,-2.864223686678583
-3,-2.8135583020444463
-6,-2.795178608323851
-6,-2.8135583020444463
-3,-2.864223686678583


In [0]:
# Show only the test rows that were actually cancelled or that the classifier
# flagged as cancelled.
display(
    predictions_cancel
    .where((col("CANCELLED") == 1) | (col("prediction") == 1))
    .select("CANCELLED", "prediction", "probability")
)

CANCELLED,prediction,probability
1,0.0,"Map(vectorType -> dense, length -> 2, values -> List(0.9979509447254309, 0.002049055274569105))"
