In [0]:
# Coordinates of the ADLS Gen2 container that holds the input CSV files.
storage_account_name = "newadbprojektkakastorage"
container_name = "data"

# SECURITY: the SAS token used to be hardcoded in this cell. A token committed
# to a notebook must be treated as leaked: revoke/rotate it and store the new
# one in a Databricks secret scope.
# NOTE(review): the scope and key names below are placeholders - create them
# (or adjust the names) before running the notebook.
sas_token = dbutils.secrets.get(scope="adls", key="jfk-data-sas-token")

# All ABFS settings are keyed by the storage account's DFS endpoint.
dfs_host = f"{storage_account_name}.dfs.core.windows.net"

# Authenticate against this account with a fixed SAS token.
spark.conf.set(f"fs.azure.account.auth.type.{dfs_host}", "SAS")
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{dfs_host}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
spark.conf.set(f"fs.azure.sas.fixed.token.{dfs_host}", sas_token)

# Root URI used by the following cells to build file paths.
base_path = f"abfss://{container_name}@{dfs_host}"
print(f"Skonfigurowano dostęp do: {base_path}")

In [0]:
from pyspark.sql.functions import col, to_date, to_timestamp, month, dayofweek, hour, when, count, avg, round, lit, concat

# Flights: CSV with a header row; column types are inferred by Spark.
df_flights = spark.read.csv(
    f"{base_path}/flights.csv",
    header=True,
    inferSchema=True,
)

# Loading of the airline / airport lookup tables is currently disabled.
# df_airlines = spark.read.csv(f"{base_path}/airlines.csv", header=True, inferSchema=True)
# df_airports = spark.read.csv(f"{base_path}/airports.csv", header=True, inferSchema=True)

# Weather observations, read with the same options as the flights file.
df_weather = spark.read.csv(
    f"{base_path}/jfk_weather_cleaned.csv",
    header=True,
    inferSchema=True,
)

# Quick sanity check: total row count and a five-row preview of the flights.
print("Liczba wierszy w flights:", df_flights.count())
display(df_flights.limit(5))

In [0]:
# Columns carried forward from the raw flights table.
jfk_columns = [
    "YEAR",
    "MONTH",
    "DAY",
    "DAY_OF_WEEK",
    "AIRLINE",
    "FLIGHT_NUMBER",
    "SCHEDULED_DEPARTURE",
    "DEPARTURE_TIME",
    "DEPARTURE_DELAY",
    "DISTANCE",
    "CANCELLED",
]

# Keep only rows whose origin airport is JFK, projected onto the columns above.
df_jfk = df_flights.where(col("ORIGIN_AIRPORT") == "JFK").select(*jfk_columns)

print("Liczba wierszy w df_jfk:", df_jfk.count())
display(df_jfk.limit(5))

In [0]:
# Duplicate check: the difference between the total row count and the count
# after dropDuplicates() is the number of fully duplicated rows.
total_rows = df_jfk.count()
distinct_rows = df_jfk.dropDuplicates().count()
duplicate_count = total_rows - distinct_rows
print(f"Liczba zduplikowanych wierszy: {duplicate_count}")

# Show the inferred schema to see whether dates and times were read as
# strings or as numbers.
df_jfk.printSchema()

In [0]:
from pyspark.sql.functions import col, lpad, concat, to_timestamp, lit, when, substring

def clean_time_column(column_name):
    """Build a Column holding the value of ``column_name`` as a 4-character string.

    The source column is cast to string and left-padded with '0' to width 4.
    The value '2400' is replaced by '0000'; every other value is returned
    as padded.

    Args:
        column_name: Name of the column to normalize.

    Returns:
        A Spark Column expression with the normalized string.
    """
    hhmm = lpad(col(column_name).cast("string"), 4, '0')
    is_2400 = hhmm == '2400'
    return when(is_2400, '0000').otherwise(hhmm)

# Normalize both HHMM columns to zero-padded 4-character strings.
df_jfk_cleaned = (
    df_jfk
    .withColumn("CleanScheduled", clean_time_column("SCHEDULED_DEPARTURE"))
    .withColumn("CleanDeparture", clean_time_column("DEPARTURE_TIME"))
)

# "yyyy-MM-dd HH:mm:00" string assembled from the date parts and the
# normalized scheduled time.
scheduled_string = concat(
    col("YEAR"), lit("-"),
    lpad(col("MONTH"), 2, '0'), lit("-"),
    lpad(col("DAY"), 2, '0'), lit(" "),
    substring(col("CleanScheduled"), 1, 2), lit(":"),
    substring(col("CleanScheduled"), 3, 2), lit(":00"),
)

# The actual departure is derived as scheduled time + delay instead of pairing
# the scheduled calendar date with the actual HHMM. The previous approach put a
# departure that crossed midnight (e.g. scheduled 23:50, left 00:20) on the
# wrong day, almost 24 hours away from the real departure.
# Assumes DEPARTURE_DELAY is expressed in minutes - TODO confirm against the
# dataset description. A NULL delay yields a NULL timestamp.
df_jfk_timestamps = df_jfk_cleaned.withColumn(
    "ScheduledTimestamp",
    to_timestamp(scheduled_string, "yyyy-MM-dd HH:mm:ss"),
).withColumn(
    "DeparturedTimestamp",
    # Epoch seconds of the scheduled time plus the delay converted to seconds.
    (col("ScheduledTimestamp").cast("long") + col("DEPARTURE_DELAY") * 60).cast("timestamp"),
)

# Variant without the helper columns and the raw date/time parts.
df_jfk_timestamps_clean = df_jfk_timestamps.drop(
    "CleanScheduled", "CleanDeparture", "YEAR", "MONTH", "DAY",
    "SCHEDULED_DEPARTURE", "DEPARTURE_TIME",
)

# Side-by-side view of the raw values and the derived timestamps.
print("Sprawdzenie konwersji czasu (bez błędu 24:00):")
display(df_jfk_timestamps.select(
    "YEAR", "MONTH", "DAY",
    "SCHEDULED_DEPARTURE", "ScheduledTimestamp",
    "DEPARTURE_TIME", "DeparturedTimestamp",
))

In [0]:
from pyspark.sql.functions import col, count, when

# One aggregate per column. when() without otherwise() yields NULL for rows
# that do not match, and count() skips NULLs, so each expression counts the
# rows in which that column IS NULL.
exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_jfk_timestamps.columns
]
df_null_counts = df_jfk_timestamps.select(*exprs)

# The result is a single row holding one counter per column.
row = df_null_counts.first()

# Report only the columns that actually contain NULLs.
cols_with_nulls = [name for name in df_null_counts.columns if row[name] > 0]

if not cols_with_nulls:
    print("Brak nulli.")
else:
    display(df_null_counts.select(*cols_with_nulls))

In [0]:
# NULL analysis.
# Check whether missing DEPARTURE_DELAY values coincide with cancelled
# flights (CANCELLED == 1).

# Marker that is non-NULL only for rows where the delay is missing.
missing_delay = when(col("DEPARTURE_DELAY").isNull(), 1)

# Overall number of rows without a departure delay.
df_jfk_total_nulls = df_jfk_timestamps.agg(
    count(missing_delay).alias("Total_Null_DepDelay")
)
display(df_jfk_total_nulls)

# Same counters broken down by the CANCELLED flag.
df_integrity = (
    df_jfk_timestamps
    .groupBy("CANCELLED")
    .agg(
        count("*").alias("Total"),
        count("DEPARTURE_DELAY").alias("NonNull_DepDelay"),
        count(missing_delay).alias("Null_DepDelay"),
    )
)
display(df_integrity)


Dla wszystkich pustych wartości w DEPARTURE_DELAY lot został odwołany.

Do zastanowienia: dlaczego są jakieś nulle w arrival? Czy chcemy rozważać tylko te w departure (czyli dlaczego dany samolot nie wyleciał)?


In [0]:
# Keep only the flights that have a recorded departure delay.
has_delay = col("DEPARTURE_DELAY").isNotNull()
df_jfk_no_nulls = df_jfk_timestamps.where(has_delay)

display(df_jfk_no_nulls)
print(f"Liczba wierszy: {df_jfk_no_nulls.count()}")

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
# Inspect anomalies in the departure delays.

# Columns shown for every outlier row. ORIGIN_AIRPORT used to be listed here,
# but it is not part of df_jfk_timestamps (it is not projected when df_jfk is
# built), so selecting it raised an AnalysisException.
outlier_columns = [
    "AIRLINE",
    "SCHEDULED_DEPARTURE",
    "DEPARTURE_TIME",
    "DEPARTURE_DELAY",
    "ScheduledTimestamp",
    "DeparturedTimestamp",
]

# Extremely low outliers (delay below -30).
outliers_low = df_jfk_timestamps.filter(
    col("DEPARTURE_DELAY") < -30
).select(*outlier_columns)

# Extremely high outliers (delay above 6 * 60, i.e. 6 hours if the delay is
# expressed in minutes).
outliers_high = df_jfk_timestamps.filter(
    col("DEPARTURE_DELAY") > 6 * 60
).select(*outlier_columns)

# Both selections share the same column list, so a positional union is safe.
outliers = outliers_low.union(outliers_high)
display(outliers)

display(df_jfk_timestamps)

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
# Descriptive statistics for the delay-related columns.
# ARRIVAL_DELAY and AIR_TIME are not projected into df_jfk, so selecting them
# unconditionally raised an AnalysisException. Only the requested columns that
# are actually present in the frame are summarized.
summary_candidates = ["DEPARTURE_DELAY", "ARRIVAL_DELAY", "AIR_TIME", "DISTANCE"]
summary_columns = [name for name in summary_candidates if name in df_jfk_no_nulls.columns]
display(df_jfk_no_nulls.select(*summary_columns).summary())

In [0]:
# Compute the bounds used to trim outliers.
# NOTE(review): despite the "IQR" naming, these are the approximate 5th and
# 90th percentiles (relative error 0.01), not an interquartile range. The
# variable name and the printed label are kept for compatibility.
IQR = df_jfk_timestamps_clean.approxQuantile("DEPARTURE_DELAY", [0.05, 0.90], 0.01)
print(f"Granice IQR: {IQR[0]} - {IQR[1]}")
lower_bound, upper_bound = IQR

# Keep rows whose delay lies within the bounds (inclusive). Rows with a NULL
# delay do not satisfy the comparison and are dropped as well.
df_jfk_no_outliers = df_jfk_timestamps_clean.filter(
    col("DEPARTURE_DELAY").between(lower_bound, upper_bound)
)

print(f"Liczba wierszy przed: {df_jfk_timestamps_clean.count()}")
# Fixed: the "after" count previously re-counted the unfiltered frame, so both
# lines always printed the same number.
print(f"Liczba wierszy po: {df_jfk_no_outliers.count()}")


In [0]:
# Descriptive statistics of the departure delay after outlier removal.
delay_only = df_jfk_no_outliers.select("DEPARTURE_DELAY")
display(delay_only.summary())

In [0]:
# Flag rows whose DAY_OF_WEEK is 6 or 7.
# NOTE(review): presumably 6/7 are Saturday/Sunday - confirm the day numbering
# used by the dataset.
is_weekend = when(col("DAY_OF_WEEK").isin([6, 7]), 1).otherwise(0)
df_features = df_jfk_no_outliers.withColumn("IsWeekend", is_weekend)

# Aggregate reused by both breakdowns below.
avg_delay = avg("DEPARTURE_DELAY").alias("Avg_Delay")

# Analysis: are delays worse on weekends?
display(df_features.groupBy("IsWeekend").agg(avg_delay))

# Average delay per day of week, ordered by day number.
df_weekly_analysis = (
    df_features
    .groupBy("DAY_OF_WEEK")
    .agg(avg_delay)
    .orderBy("DAY_OF_WEEK")
)

display(df_weekly_analysis)

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import col, count, desc, when, year

print("Typy danych (Schema)")
df_weather.printSchema()

# Restrict the weather observations to those whose DATE falls in 2015.
df_weather_2015 = df_weather.where(year(col("DATE")) == 2015)

print("Próbka danych")
display(df_weather_2015)

# One NULL counter per column: when() without otherwise() is NULL for
# non-matching rows and count() skips NULLs.
exprs_null = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_weather_2015.columns
]

# Collapse the single result row into (column, null_count) pairs, keeping
# only the columns that contain at least one NULL.
row_nulls = df_weather_2015.select(*exprs_null).first().asDict()
null_data = [(name, nulls) for name, nulls in row_nulls.items() if nulls > 0]

print("Kolumny zawierające NULL-e")
if not null_data:
    print("Brak wartości NULL (uwaga: puste stringi '' nie są liczone jako NULL!)")
else:
    # Present the counters as a small DataFrame, largest count first.
    df_null_report = spark.createDataFrame(null_data, ["Kolumna", "Liczba_Nulli"])
    display(df_null_report.orderBy(desc("Liczba_Nulli")))

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import col, to_timestamp, concat, lit, lpad, expr, hour, date_trunc

# df_jfk_no_outliers is derived from df_jfk_timestamps_clean, where YEAR,
# MONTH, DAY and SCHEDULED_DEPARTURE have been dropped, so rebuilding the
# timestamp from those columns raised an AnalysisException. The frame already
# carries ScheduledTimestamp (built with the '2400' -> '0000' normalization),
# so it is reused here.
# ScheduledString is kept for schema compatibility; casting a timestamp to
# string renders it as "yyyy-MM-dd HH:mm:ss".
df_flights_fixed = (
    df_jfk_no_outliers
    .withColumn("ScheduledString", col("ScheduledTimestamp").cast("string"))
    .withColumn("Hour", hour(col("ScheduledTimestamp")))
)

# Prepare the weather data: hour of the observation and its date truncated to
# the start of the day.
df_weather_hourly = df_weather_2015.withColumn("WeatherDate", col("DATE"))\
    .withColumn("WeatherHour", hour(col("WeatherDate")))\
    .withColumn("DateOnly", to_timestamp(date_trunc("day", col("WeatherDate"))))

# Left join: every flight is kept and matched to the weather rows with the
# same calendar day and hour.
# NOTE(review): if the weather file holds more than one observation per hour,
# flights are duplicated by this join - verify.
print("Łączenie z pogodą...")
df_joined = df_flights_fixed.join(
    df_weather_hourly,
    (to_timestamp(date_trunc("day", df_flights_fixed.ScheduledTimestamp)) == df_weather_hourly.DateOnly) &
    (df_flights_fixed.Hour == df_weather_hourly.WeatherHour),
    "left"
)

print("Statystyki opóźnień w zależności od pogody (JFK):")
display(df_joined.select("DEPARTURE_DELAY", "HOURLYVISIBILITY", "HOURLYWindSpeed", "HOURLYPrecip").summary())

In [0]:
from pyspark.sql.functions import round, avg, col

# Flights with a known departure delay.
flights_with_delay = df_joined.filter(col("DEPARTURE_DELAY").isNotNull())

# Wind speed rounded to the nearest whole number, used as the grouping key.
wind_speed_bucket = round(col("HOURLYWindSpeed")).alias("WindSpeed")

# Average delay and number of flights per bucket.
wind_metrics = [
    avg("DEPARTURE_DELAY").alias("AvgDelay"),
    count("*").alias("FlightCount"),
]

df_wind_analysis = (
    flights_with_delay
    .groupBy(wind_speed_bucket)
    .agg(*wind_metrics)
    .orderBy("WindSpeed")
)

display(df_wind_analysis)

# NOTE(review): this frame is computed exactly like df_wind_analysis and is
# never displayed; the name suggests a cancellation analysis was intended.
df_wind_analysis_cancel = (
    flights_with_delay
    .groupBy(wind_speed_bucket)
    .agg(*wind_metrics)
    .orderBy("WindSpeed")
)

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import round, avg, col

# Visibility rounded to the nearest whole number, used as the grouping key.
visibility_bucket = round(col("HOURLYVISIBILITY")).alias("Visibility")

# Average delay and flight count per visibility bucket, for flights with a
# known departure delay.
df_vis_analysis = (
    df_joined
    .where(col("DEPARTURE_DELAY").isNotNull())
    .groupBy(visibility_bucket)
    .agg(
        avg("DEPARTURE_DELAY").alias("AvgDelay"),
        count("*").alias("FlightCount"),
    )
    .orderBy("Visibility")
)

display(df_vis_analysis)

Databricks visualization. Run in Databricks to view.

In [0]:
# Average departure delay per scheduled hour, for flights with a known delay.
has_departure_delay = col("DEPARTURE_DELAY").isNotNull()
df_time_analysis = (
    df_joined
    .where(has_departure_delay)
    .groupBy("Hour")
    .agg(avg("DEPARTURE_DELAY").alias("AvgDelay"))
    .orderBy("Hour")
)

display(df_time_analysis)

Databricks visualization. Run in Databricks to view.