In [0]:
# Coordinates of the ADLS Gen2 container that the notebook reads from.
storage_account_name = "adbprojektkakastorage"
container_name = "raw"

# SECURITY: a SAS token is a credential and must not be stored in the notebook
# source. It is fetched from a Databricks secret scope at run time instead
# (Databricks also redacts secret values in cell output).
# TODO: the scope/key names below are placeholders -- point them at the secret
# that holds a currently valid SAS token for the container.
# Any token that was ever committed with this notebook must be treated as
# leaked and revoked/rotated on the storage account.
SAS_SECRET_SCOPE = "adb-projekt"
SAS_SECRET_KEY = "raw-container-sas-token"
sas_token = dbutils.secrets.get(scope=SAS_SECRET_SCOPE, key=SAS_SECRET_KEY)

# All three ABFS settings are keyed by the account's DFS endpoint.
dfs_endpoint = f"{storage_account_name}.dfs.core.windows.net"

# Authenticate against this account with a SAS token ...
spark.conf.set(f"fs.azure.account.auth.type.{dfs_endpoint}", "SAS")
# ... supplied by the fixed (static) token provider ...
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{dfs_endpoint}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
# ... whose value is the token read from the secret scope.
spark.conf.set(f"fs.azure.sas.fixed.token.{dfs_endpoint}", sas_token)

# Root URI used by every later read in the notebook.
base_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
print(f"Skonfigurowano dostęp do: {base_path}")

In [0]:
from pyspark.sql.functions import col, to_date, to_timestamp, month, dayofweek, hour, when, count, avg, round, lit, concat

def _read_csv_with_header(file_name):
    """Load ``<base_path>/<file_name>`` as CSV, using the header row and schema inference."""
    csv_reader = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
    )
    return csv_reader.load(f"{base_path}/{file_name}")


# Raw inputs: one DataFrame per source file in the container.
df_flights = _read_csv_with_header("flights.csv")
df_airlines = _read_csv_with_header("airlines.csv")
df_airports = _read_csv_with_header("airports.csv")
df_weather = _read_csv_with_header("jfk_weather_cleaned.csv")

# Quick sanity check of the flights table: row count and a five-row preview.
flights_row_count = df_flights.count()
print("Liczba wierszy w flights:", flights_row_count)
display(df_flights.limit(5))

In [0]:
# Keep only the flights whose origin airport code is "JFK".
departs_from_jfk = col("ORIGIN_AIRPORT") == "JFK"
df_jfk = df_flights.where(departs_from_jfk)

# Row count and a five-row preview of the filtered set.
jfk_row_count = df_jfk.count()
print("Liczba wierszy w df_jfk:", jfk_row_count)

display(df_jfk.limit(5))

In [0]:
# Duplicate check: compare the row count before and after dropping rows that
# are identical in every column.
rows_total = df_jfk.count()
rows_distinct = df_jfk.dropDuplicates().count()
duplicate_count = rows_total - rows_distinct
print(f"Liczba zduplikowanych wierszy: {duplicate_count}")

# Inspect the inferred schema: were dates and times read as strings or as numbers?
df_jfk.printSchema()

In [0]:
from pyspark.sql.functions import col, lpad, concat, to_timestamp, lit, when, substring

def clean_time_column(column_name):
    """Return a Column expression holding the named column as 4-character time text.

    The column is cast to string and passed through ``lpad(..., 4, '0')``.
    A result of ``'2400'`` is replaced by ``'0000'``; any other value is
    returned as padded.

    Args:
        column_name: name of the column to normalize.

    Returns:
        A Spark Column expression (not yet attached to any DataFrame).
    """
    hhmm_text = lpad(col(column_name).cast("string"), 4, '0')
    is_2400 = hhmm_text == '2400'
    return when(is_2400, '0000').otherwise(hhmm_text)

def _flight_date_time(hhmm_column_name):
    """Build a timestamp Column from YEAR/MONTH/DAY and a 4-character 'HHmm' text column."""
    hhmm = col(hhmm_column_name)
    date_time_text = concat(
        col("YEAR"), lit("-"),
        lpad(col("MONTH"), 2, '0'), lit("-"),
        lpad(col("DAY"), 2, '0'), lit(" "),
        substring(hhmm, 1, 2), lit(":"),   # hours: characters 1-2 of the cleaned column
        substring(hhmm, 3, 2), lit(":00"),  # minutes: characters 3-4 of the cleaned column
    )
    return to_timestamp(date_time_text, "yyyy-MM-dd HH:mm:ss")


# Temporary normalized 'HHmm' text columns for scheduled and actual departure.
df_jfk_cleaned = (
    df_jfk
    .withColumn("CleanScheduled", clean_time_column("SCHEDULED_DEPARTURE"))
    .withColumn("CleanDeparture", clean_time_column("DEPARTURE_TIME"))
)

# Attach both timestamps, then drop the temporary text columns again.
# NOTE(review): both timestamps reuse the row's YEAR/MONTH/DAY, and '2400' is
# mapped to '0000' on that same date, so a departure that crosses midnight would
# be placed on the scheduled calendar day -- confirm whether that matters here.
df_jfk_timestamps = (
    df_jfk_cleaned
    .withColumn("ScheduledTimestamp", _flight_date_time("CleanScheduled"))
    .withColumn("DeparturedTimestamp", _flight_date_time("CleanDeparture"))
    .drop("CleanScheduled", "CleanDeparture")
)

# Preview the raw time columns next to the timestamps derived from them.
print("Sprawdzenie konwersji czasu (bez błędu 24:00):")
preview_columns = [
    "YEAR", "MONTH", "DAY",
    "SCHEDULED_DEPARTURE", "ScheduledTimestamp",
    "DEPARTURE_TIME", "DeparturedTimestamp",
]
display(df_jfk_timestamps.select(*preview_columns).limit(5))

In [0]:
# NULL analysis.
# Check whether missing 'DEPARTURE_DELAY' values coincide with cancelled
# flights ('CANCELLED' == 1).

# Aggregates that count rows where the respective delay is NULL.
null_dep_delay = count(when(col("DEPARTURE_DELAY").isNull(), 1))
null_arr_delay = count(when(col("ARRIVAL_DELAY").isNull(), 1))

# Overall number of NULL delays.
df_jfk_total_nulls = df_jfk_timestamps.select(
    null_dep_delay.alias("Total_Null_DepDelay"),
    null_arr_delay.alias("Total_Null_ArrDelay"),
)
display(df_jfk_total_nulls)

# The same counts, plus the non-NULL counts and the group size, per CANCELLED value.
df_integrity = df_jfk_timestamps.groupBy("CANCELLED").agg(
    count("*").alias("Total"),
    count("DEPARTURE_DELAY").alias("NonNull_DepDelay"),
    null_dep_delay.alias("Null_DepDelay"),
    count("ARRIVAL_DELAY").alias("NonNull_ArrDelay"),
    null_arr_delay.alias("Null_ArrDelay"),
)
display(df_integrity)

# Delay anomalies: columns shown for every outlier row.
outlier_columns = [
    "AIRLINE", "ORIGIN_AIRPORT", "SCHEDULED_DEPARTURE", "DEPARTURE_TIME",
    "DEPARTURE_DELAY", "ScheduledTimestamp", "DeparturedTimestamp",
]
departure_delay = col("DEPARTURE_DELAY")

# Extremely low outliers (DEPARTURE_DELAY below -30).
outliers_low = df_jfk_timestamps.where(departure_delay < -30).select(*outlier_columns)

# Extremely high outliers (DEPARTURE_DELAY above 6 * 60).
outliers_high = df_jfk_timestamps.where(departure_delay > 6 * 60).select(*outlier_columns)

# Both extremes in one table.
outliers = outliers_low.union(outliers_high)
display(outliers)

display(df_jfk_timestamps)

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Dla wszystkich pustych wartości w DEPARTURE_DELAY lot został odwołany.

Do zastanowienia: dlaczego w ARRIVAL_DELAY występują wartości null? Czy chcemy rozważać tylko braki w DEPARTURE_DELAY, czyli pytanie, dlaczego dany samolot nie wyleciał?


In [0]:
# Descriptive statistics for the delay, air-time and distance columns.
summary_columns = ["DEPARTURE_DELAY", "ARRIVAL_DELAY", "AIR_TIME", "DISTANCE"]
display(df_jfk_timestamps.select(*summary_columns).summary())

# Histogram input: departure delays strictly between -20 and 200.
dep_delay = col("DEPARTURE_DELAY")
in_histogram_range = (dep_delay > -20) & (dep_delay < 200)
df_hist = df_jfk_timestamps.where(in_histogram_range).select("DEPARTURE_DELAY")
display(df_hist)

Databricks visualization. Run in Databricks to view.

In [0]:
# Flag rows whose DAY_OF_WEEK is 6 or 7 with IsWeekend = 1, all others with 0.
# NOTE(review): presumably 6/7 encode Saturday/Sunday -- confirm against the data dictionary.
falls_on_weekend = col("DAY_OF_WEEK").isin(6, 7)
df_features = df_jfk_timestamps.withColumn(
    "IsWeekend",
    when(falls_on_weekend, 1).otherwise(0),
)

# Analysis: are departures delayed more on weekends? Mean delay per IsWeekend value.
weekend_delay = df_features.groupBy("IsWeekend").agg(
    avg("DEPARTURE_DELAY").alias("Avg_Delay")
)
display(weekend_delay)

Databricks visualization. Run in Databricks to view.

In [0]:
# df_features only contains flights whose ORIGIN_AIRPORT is "JFK", so resolving
# and grouping by the *origin* airport always produces a single group and the
# "top 10 airports" ranking is degenerate. The ranking is therefore built over
# the destination airports of those flights.
# NOTE(review): assumes the flights data has a DESTINATION_AIRPORT column that
# holds the same codes as airports.IATA_CODE -- TODO confirm.
destination_names = df_airports.select(
    col("IATA_CODE").alias("DESTINATION_AIRPORT"),
    "AIRPORT",
)

# Left join keeps flights whose destination code has no match (AIRPORT is NULL for them).
df_route = (
    df_features
    .join(destination_names, "DESTINATION_AIRPORT", "left")
    .select("ORIGIN_AIRPORT", "DESTINATION_AIRPORT", "AIRPORT", "DEPARTURE_DELAY")
)

# Top 10 destination airports by mean departure delay, restricted to airports
# with more than 1000 flights so that rarely served routes do not dominate.
top_delays = (
    df_route
    .groupBy("AIRPORT")
    .agg(
        avg("DEPARTURE_DELAY").alias("Avg_Delay"),
        count("*").alias("Flight_Count"),
    )
    .filter(col("Flight_Count") > 1000)
    .orderBy(col("Avg_Delay").desc())
    .limit(10)
)

display(top_delays)

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import col, to_timestamp, concat, lit, lpad, expr, hour, date_trunc

# Text form "yyyy-MM-dd HH:mm:00" of the scheduled departure, assembled from
# YEAR/MONTH/DAY and the zero-padded SCHEDULED_DEPARTURE value.
# NOTE(review): unlike clean_time_column, this expression does not remap a
# padded value of '2400' -- confirm SCHEDULED_DEPARTURE never takes that value.
scheduled_text = concat(
    col("YEAR"), lit("-"),
    lpad(col("MONTH"), 2, '0'), lit("-"),
    lpad(col("DAY"), 2, '0'), lit(" "),
    expr("substr(lpad(cast(SCHEDULED_DEPARTURE as string), 4, '0'), 1, 2)"), lit(":"),
    expr("substr(lpad(cast(SCHEDULED_DEPARTURE as string), 4, '0'), 3, 2)"), lit(":00"),
)

df_flights_fixed = (
    df_flights
    .withColumn("ScheduledString", scheduled_text)
    .withColumn("ScheduledTimestamp", to_timestamp(col("ScheduledString"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Hour", hour(col("ScheduledTimestamp")))  # hour of day, used as a join key below
)

# Weather preparation: split each observation's DATE into its hour of day and
# a timestamp truncated to the start of the day.
df_weather_hourly = (
    df_weather
    .withColumn("WeatherDate", col("DATE"))
    .withColumn("WeatherHour", hour(col("WeatherDate")))
    .withColumn("DateOnly", to_timestamp(date_trunc("day", col("WeatherDate"))))
)

# Keep only flights departing from JFK.
print("Filtrowanie lotów z JFK...")
df_jfk = df_flights_fixed.where(col("ORIGIN_AIRPORT") == "JFK")

# Left join: a flight matches a weather row with the same calendar day and the
# same hour of day; flights without a match are kept.
print("Łączenie z pogodą...")
same_day = (
    to_timestamp(date_trunc("day", df_jfk["ScheduledTimestamp"]))
    == df_weather_hourly["DateOnly"]
)
same_hour = df_jfk["Hour"] == df_weather_hourly["WeatherHour"]
df_joined = df_jfk.join(df_weather_hourly, same_day & same_hour, "left")

print("Statystyki opóźnień w zależności od pogody (JFK):")
weather_summary_columns = ["DEPARTURE_DELAY", "HOURLYVISIBILITY", "HOURLYWindSpeed", "HOURLYPrecip"]
display(df_joined.select(*weather_summary_columns).summary())

In [0]:
from pyspark.sql.functions import round, avg, col

# Mean departure delay and flight count per rounded wind speed, using only
# rows that have a departure delay.
wind_speed_bucket = round(col("HOURLYWindSpeed")).alias("WindSpeed")
wind_rows_with_delay = df_joined.where(col("DEPARTURE_DELAY").isNotNull())

df_wind_analysis = (
    wind_rows_with_delay
    .groupBy(wind_speed_bucket)
    .agg(
        avg("DEPARTURE_DELAY").alias("AvgDelay"),
        count("*").alias("FlightCount"),
    )
    .orderBy("WindSpeed")
)

display(df_wind_analysis)

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import round, avg, col

# Mean departure delay and flight count per rounded visibility value, using
# only rows that have a departure delay.
visibility_bucket = round(col("HOURLYVISIBILITY")).alias("Visibility")
visibility_rows_with_delay = df_joined.where(col("DEPARTURE_DELAY").isNotNull())

df_vis_analysis = (
    visibility_rows_with_delay
    .groupBy(visibility_bucket)
    .agg(
        avg("DEPARTURE_DELAY").alias("AvgDelay"),
        count("*").alias("FlightCount"),
    )
    .orderBy("Visibility")
)

display(df_vis_analysis)

Databricks visualization. Run in Databricks to view.

In [0]:
# Mean departure delay per scheduled hour of day, using only rows that have a
# departure delay.
hourly_rows_with_delay = df_joined.where(col("DEPARTURE_DELAY").isNotNull())
mean_delay = avg("DEPARTURE_DELAY").alias("AvgDelay")

df_time_analysis = (
    hourly_rows_with_delay
    .groupBy("Hour")
    .agg(mean_delay)
    .orderBy("Hour")
)

display(df_time_analysis)

Databricks visualization. Run in Databricks to view.