# Czyszczenie zbiorów danych df_flights i df_weather, obsługa outlierów oraz wartości NULL

## Dostępy


In [0]:
sas_token = "sp=rcwdl&st=2026-01-02T21:18:29Z&se=2026-01-03T05:33:29Z&spr=https&sv=2024-11-04&sr=c&sig=U85UUMwZI2b3er9g1fw3DRdJu41ma0XSWv911SXkXh0%3D"
storage_account_name = "newadbprojektkakastorage"
container_name = "data"

spark.conf.set(
    f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net", "SAS"
)
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{storage_account_name}.dfs.core.windows.net",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
)
spark.conf.set(
    f"fs.azure.sas.fixed.token.{storage_account_name}.dfs.core.windows.net",
    sas_token
)

base_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
print(f"Skonfigurowano dostęp do: {base_path}")

Skonfigurowano dostęp do: abfss://data@newadbprojektkakastorage.dfs.core.windows.net


## Wczytanie zbiorów danych

In [0]:
from pyspark.sql.functions import col, to_date, to_timestamp, month, dayofweek, hour, when, count, avg, round, lit, concat

df_flights = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load(f"{base_path}/flights.csv")

df_weather = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load(f"{base_path}/jfk_weather_cleaned.csv") 

print("Liczba wierszy w flights:", df_flights.count())
display(df_flights.limit(5))

Liczba wierszy w flights: 5819079


YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
2015,1,1,4,AS,98,N407AS,ANC,SEA,5,2354,-11,21,15,205,194,169,1448,404,4,430,408,-22,0,0,,,,,,
2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,2,-8,12,14,280,279,263,2330,737,4,750,741,-9,0,0,,,,,,
2015,1,1,4,US,840,N171US,SFO,CLT,20,18,-2,16,34,286,293,266,2296,800,11,806,811,5,0,0,,,,,,
2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,15,-5,15,30,285,281,258,2342,748,8,805,756,-9,0,0,,,,,,
2015,1,1,4,AS,135,N527AS,SEA,ANC,25,24,-1,11,35,235,215,199,1448,254,5,320,259,-21,0,0,,,,,,


# Czyszczenie i inżynieria danych na zbiorze df_flights
## Filtrowanie po lotnisku "JFK"

In [0]:
df_jfk = df_flights.filter(col("ORIGIN_AIRPORT") == "JFK")

print("Liczba wierszy w df_jfk:", df_jfk.count())
display(df_jfk.limit(3))

Liczba wierszy w df_jfk: 93811


YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
2015,1,1,4,B6,2023,N324JB,JFK,SJU,535,618,43,13,631,225,201,184,1598,1035,4,1020,1039,19,0,0,,0.0,0.0,19.0,0.0,0.0
2015,1,1,4,AA,2299,N3LLAA,JFK,MIA,545,640,55,17,657,185,199,152,1089,929,30,850,959,69,0,0,,14.0,0.0,55.0,0.0,0.0
2015,1,1,4,B6,939,N794JB,JFK,BQN,545,545,0,17,602,221,202,180,1576,1002,5,1026,1007,-19,0,0,,,,,,


## Usunięcie opóźnień spowodowanych przez inne czynniki niż pogoda

In [0]:
from pyspark.sql.functions import col, lit, coalesce

# Usuwamy wiersze, gdzie inne opóźnienia są większe niż 0
df_weather_delays = df_jfk.filter(
    (coalesce(col("AIR_SYSTEM_DELAY"), lit(0)) == 0) &
    (coalesce(col("SECURITY_DELAY"), lit(0)) == 0) &
    (coalesce(col("AIRLINE_DELAY"), lit(0)) == 0) &
    (coalesce(col("LATE_AIRCRAFT_DELAY"), lit(0)) == 0)
).select("YEAR","MONTH","DAY","DAY_OF_WEEK","AIRLINE","FLIGHT_NUMBER","SCHEDULED_DEPARTURE","DEPARTURE_TIME","DEPARTURE_DELAY","DISTANCE","CANCELLED", "ORIGIN_AIRPORT","WEATHER_DELAY")

print("Liczba wierszy:", df_jfk.count())
display(df_weather_delays.limit(5))

Liczba wierszy: 93811


YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,DISTANCE,CANCELLED,ORIGIN_AIRPORT,WEATHER_DELAY
2015,1,1,4,B6,939,545,545,0,1576,0,JFK,
2015,1,1,4,B6,353,600,554,-6,1028,0,JFK,
2015,1,1,4,B6,583,600,557,-3,944,0,JFK,
2015,1,1,4,B6,525,600,554,-6,1005,0,JFK,
2015,1,1,4,DL,421,600,605,5,760,0,JFK,


## Sprawdzenie duplikatów

In [0]:
# Sprawdzenie duplikatów 
duplicate_count = df_weather_delays.count() - df_weather_delays.dropDuplicates().count()
print(f"Liczba zduplikowanych wierszy: {duplicate_count}")

Liczba zduplikowanych wierszy: 0


## Czyszczenie i zamiana na poprawny format daty i godziny

In [0]:
# Czy daty i godziny są wczytywane jako stringi lub liczby?
df_weather_delays.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- CANCELLED: integer (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- WEATHER_DELAY: integer (nullable = true)



In [0]:
from pyspark.sql.functions import col, lpad, concat, to_timestamp, lit, when, substring

def clean_time_column(column_name):
    padded = lpad(col(column_name).cast("string"), 4, '0')
    return when(padded == '2400', '0000').otherwise(padded)

df_jfk_cleaned = df_weather_delays \
    .withColumn("CleanScheduled", clean_time_column("SCHEDULED_DEPARTURE")) \
    .withColumn("CleanDeparture", clean_time_column("DEPARTURE_TIME"))

df_jfk_timestamps = df_jfk_cleaned.withColumn(
    "ScheduledTimestamp",
    to_timestamp(
        concat(
            col("YEAR"), lit("-"), 
            lpad(col("MONTH"), 2, '0'), lit("-"), 
            lpad(col("DAY"), 2, '0'), lit(" "), 
            substring(col("CleanScheduled"), 1, 2), lit(":"),
            substring(col("CleanScheduled"), 3, 2), lit(":00") 
        ),
        "yyyy-MM-dd HH:mm:ss"
    )
).withColumn(
    "DeparturedTimestamp",
    to_timestamp(
        concat(
            col("YEAR"), lit("-"), 
            lpad(col("MONTH"), 2, '0'), lit("-"), 
            lpad(col("DAY"), 2, '0'), lit(" "), 
            substring(col("CleanDeparture"), 1, 2), lit(":"), 
            substring(col("CleanDeparture"), 3, 2), lit(":00") 
        ),
        "yyyy-MM-dd HH:mm:ss"
    )
)

df_jfk_timestamps_clean = df_jfk_timestamps.drop("CleanScheduled", "CleanDeparture", "YEAR", "MONTH", "DAY", 
    "SCHEDULED_DEPARTURE", "DEPARTURE_TIME")

print("Sprawdzenie konwersji czasu (bez błędu 24:00):")
display(df_jfk_timestamps.select(
    "YEAR", "MONTH", "DAY", 
    "SCHEDULED_DEPARTURE", "ScheduledTimestamp", 
    "DEPARTURE_TIME", "DeparturedTimestamp"
).limit(3))

Sprawdzenie konwersji czasu (bez błędu 24:00):


YEAR,MONTH,DAY,SCHEDULED_DEPARTURE,ScheduledTimestamp,DEPARTURE_TIME,DeparturedTimestamp
2015,1,1,545,2015-01-01T05:45:00Z,545,2015-01-01T05:45:00Z
2015,1,1,600,2015-01-01T06:00:00Z,554,2015-01-01T05:54:00Z
2015,1,1,600,2015-01-01T06:00:00Z,557,2015-01-01T05:57:00Z


## Obsługa wartości NULL

In [0]:
from pyspark.sql.functions import col, count, when

exprs = [count(when(col(c).isNull(), c)).alias(c) for c in df_jfk_timestamps_clean.columns]
df_null_counts = df_jfk_timestamps_clean.select(*exprs)

row = df_null_counts.first()

cols_with_nulls = [c for c in df_null_counts.columns if row[c] > 0]

if cols_with_nulls:
    display(df_null_counts.select(*cols_with_nulls))
else:
    print("Brak nulli.")

DEPARTURE_DELAY,WEATHER_DELAY,DeparturedTimestamp
1878,73551,1878


In [0]:
# Analiza wartości NULL 
# Sprawdzamy, czy braki w 'DEPARTURE_DELAY' pokrywają się z odwołanymi lotami ('CANCELLED' == 1)

df_jfk_total_nulls = df_jfk_timestamps_clean.select(
    count(when(col("DEPARTURE_DELAY").isNull(), 1)).alias("Total_Null_DepDelay")
)
display(df_jfk_total_nulls)
df_integrity = df_jfk_timestamps_clean.groupBy("CANCELLED").agg(
    count("*").alias("Total"),
    count("DEPARTURE_DELAY").alias("NonNull_DepDelay"),
    count(when(col("DEPARTURE_DELAY").isNull(), 1)).alias("Null_DepDelay")
)
display(df_integrity)


Total_Null_DepDelay
1878


CANCELLED,Total,NonNull_DepDelay,Null_DepDelay
1,1922,44,1878
0,71949,71949,0


Dla wszystkich pustych wartości w DEPARTURE_DELAY lot został odwołany.


## Analiza i obsługa outlierów

In [0]:
df_jfk_no_nulls = df_jfk_timestamps_clean.filter(col("DEPARTURE_DELAY").isNotNull())
display(df_jfk_no_nulls.limit(10))
print(f"Liczba wierszy: {df_jfk_no_nulls.count()}")

DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DEPARTURE_DELAY,DISTANCE,CANCELLED,ORIGIN_AIRPORT,WEATHER_DELAY,ScheduledTimestamp,DeparturedTimestamp
4,B6,939,0,1576,0,JFK,,2015-01-01T05:45:00Z,2015-01-01T05:45:00Z
4,B6,353,-6,1028,0,JFK,,2015-01-01T06:00:00Z,2015-01-01T05:54:00Z
4,B6,583,-3,944,0,JFK,,2015-01-01T06:00:00Z,2015-01-01T05:57:00Z
4,B6,525,-6,1005,0,JFK,,2015-01-01T06:00:00Z,2015-01-01T05:54:00Z
4,DL,421,5,760,0,JFK,,2015-01-01T06:00:00Z,2015-01-01T06:05:00Z
4,UA,415,0,2586,0,JFK,,2015-01-01T06:05:00Z,2015-01-01T06:05:00Z
4,B6,601,-2,1069,0,JFK,,2015-01-01T06:05:00Z,2015-01-01T06:03:00Z
4,B6,1403,-2,1598,0,JFK,,2015-01-01T06:14:00Z,2015-01-01T06:12:00Z
4,US,433,-2,2153,0,JFK,,2015-01-01T06:30:00Z,2015-01-01T06:28:00Z
4,B6,23,-3,2475,0,JFK,,2015-01-01T06:30:00Z,2015-01-01T06:27:00Z


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Liczba wierszy: 71993


In [0]:
# Sprawdzenie anomalii w opóźnieniach
# Outliers skrajnie niskie (np. -30 i mniej)
outliers_low = df_jfk_timestamps_clean.filter(
    col("DEPARTURE_DELAY") < -30
).select(
    "AIRLINE", "ORIGIN_AIRPORT",  "DEPARTURE_DELAY", "ScheduledTimestamp","DeparturedTimestamp"
)

# Outliers skrajnie wysokie (np. > 6h)
outliers_high = df_jfk_timestamps_clean.filter(
    col("DEPARTURE_DELAY") > 6 * 60
).select(
    "AIRLINE", "ORIGIN_AIRPORT", "DEPARTURE_DELAY", "ScheduledTimestamp","DeparturedTimestamp"
)

# Połączenie
outliers = outliers_low.union(outliers_high)
display(outliers)

display(df_jfk_timestamps_clean.limit(10))

AIRLINE,ORIGIN_AIRPORT,DEPARTURE_DELAY,ScheduledTimestamp,DeparturedTimestamp
DL,JFK,616,2015-02-02T15:30:00Z,2015-02-02T01:46:00Z
B6,JFK,440,2015-02-19T16:30:00Z,2015-02-19T23:50:00Z
B6,JFK,415,2015-04-20T08:50:00Z,2015-04-20T15:45:00Z
AA,JFK,372,2015-06-16T14:55:00Z,2015-06-16T21:07:00Z


DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DEPARTURE_DELAY,DISTANCE,CANCELLED,ORIGIN_AIRPORT,WEATHER_DELAY,ScheduledTimestamp,DeparturedTimestamp
4,B6,939,0,1576,0,JFK,,2015-01-01T05:45:00Z,2015-01-01T05:45:00Z
4,B6,353,-6,1028,0,JFK,,2015-01-01T06:00:00Z,2015-01-01T05:54:00Z
4,B6,583,-3,944,0,JFK,,2015-01-01T06:00:00Z,2015-01-01T05:57:00Z
4,B6,525,-6,1005,0,JFK,,2015-01-01T06:00:00Z,2015-01-01T05:54:00Z
4,DL,421,5,760,0,JFK,,2015-01-01T06:00:00Z,2015-01-01T06:05:00Z
4,UA,415,0,2586,0,JFK,,2015-01-01T06:05:00Z,2015-01-01T06:05:00Z
4,B6,601,-2,1069,0,JFK,,2015-01-01T06:05:00Z,2015-01-01T06:03:00Z
4,B6,1403,-2,1598,0,JFK,,2015-01-01T06:14:00Z,2015-01-01T06:12:00Z
4,US,433,-2,2153,0,JFK,,2015-01-01T06:30:00Z,2015-01-01T06:28:00Z
4,B6,23,-3,2475,0,JFK,,2015-01-01T06:30:00Z,2015-01-01T06:27:00Z


In [0]:
# Statystyki opisowe dla opóźnień
display(df_jfk_timestamps_clean.select("DEPARTURE_DELAY").summary())

summary,DEPARTURE_DELAY
count,71993.0
mean,0.1804064284027614
stddev,11.83203446957632
min,-22.0
25%,-5.0
50%,-2.0
75%,1.0
max,616.0


In [0]:
# Obliczamy granice IQR
IQR = df_jfk_timestamps_clean.approxQuantile("DEPARTURE_DELAY", [0.05, 0.95], 0.01)
print(f"Granice IQR: {IQR[0]} - {IQR[1]}")
lower_bound = IQR[0]
upper_bound = IQR[1]

df_jfk_no_outliers = df_jfk_timestamps_clean.filter(
    (col("DEPARTURE_DELAY") >= lower_bound) & 
    (col("DEPARTURE_DELAY") <= upper_bound)
)

print(f"Liczba wierszy przed: {df_jfk_timestamps_clean.count()}")
print(f"Liczba wierszy po: {df_jfk_no_outliers.count()}")   


Granice IQR: -8.0 - 17.0
Liczba wierszy przed: 73871
Liczba wierszy po: 64777


In [0]:
# Statystyki opisowe dla opóźnień
display(df_jfk_no_outliers.select("DEPARTURE_DELAY").summary())

summary,DEPARTURE_DELAY
count,64777.0
mean,-1.2637201475832471
stddev,5.264604985437449
min,-8.0
25%,-5.0
50%,-3.0
75%,0.0
max,17.0


# Czyszczenie i inżynieria danych na zbiorze df_weather
## Filtrowanie po danych tylko z 2015 roku


In [0]:
from pyspark.sql.functions import col, count, desc,year 

df_weather_2015 = df_weather.filter(year(col("DATE")) == 2015)

print("Próbka danych")
display(df_weather_2015.limit(3))

Próbka danych


DATE,HOURLYVISIBILITY,HOURLYDRYBULBTEMPF,HOURLYWETBULBTEMPF,HOURLYDewPointTempF,HOURLYRelativeHumidity,HOURLYWindSpeed,HOURLYStationPressure,HOURLYSeaLevelPressure,HOURLYPrecip,HOURLYAltimeterSetting,HOURLYWindDirectionSin,HOURLYWindDirectionCos,HOURLYPressureTendencyIncr,HOURLYPressureTendencyDecr,HOURLYPressureTendencyCons
2015-01-01T00:00:00Z,10.0,30.0,24.0,11.0,45.0,15.0,30.18,30.2,0.0,30.2,-0.984808,-0.173648,0,1,0
2015-01-01T01:00:00Z,10.0,29.0,24.0,11.0,47.0,13.0,30.16,30.18,0.0,30.18,-0.984808,-0.173648,0,1,0
2015-01-01T02:00:00Z,10.0,29.0,24.0,13.0,51.0,14.0,30.14,30.17,0.0,30.16,-0.939693,-0.34202,0,1,0


## Wybór interesujących nas kolumn - wpływ prędkości wiatru, widoczności, opadów atmosferycznych

In [0]:
df_weather_impact = df_weather_2015.select("DATE","HOURLYPrecip","HOURLYVISIBILITY","HOURLYWindSpeed")

## Analiza wartości pustych i duplikatów

In [0]:

from pyspark.sql.functions import count, when

exprs_null = [count(when(col(c).isNull(), c)).alias(c) for c in df_weather_impact.columns]


row_nulls = df_weather_impact.select(*exprs_null).first().asDict()
null_data = [(k, v) for k, v in row_nulls.items() if v > 0] 

print("Kolumny zawierające NULL-e")
if null_data:
    df_null_report = spark.createDataFrame(null_data, ["Kolumna", "Liczba_Nulli"])
    display(df_null_report.orderBy(col("Liczba_Nulli").desc()))
else:
    print("Brak wartości NULL (uwaga: puste stringi '' nie są liczone jako NULL!)")


total_count = df_weather_impact.count()
distinct_count = df_weather_impact.distinct().count()
duplicate_count = total_count - distinct_count

print(f"Całkowita liczba wierszy: {total_count}")
print(f"Liczba unikalnych wierszy: {distinct_count}")
print(f"Liczba zduplikowanych wierszy: {duplicate_count}")


Kolumny zawierające NULL-e
Brak wartości NULL (uwaga: puste stringi '' nie są liczone jako NULL!)
Całkowita liczba wierszy: 8760
Liczba unikalnych wierszy: 8760
Liczba zduplikowanych wierszy: 0


# Połączenie obu zbiorów danych w jedno i zapisanie do pliku

In [0]:
from pyspark.sql.functions import col, to_timestamp, concat, lit, lpad, expr, hour, date_trunc

df_flights_fixed = df_jfk_no_outliers.withColumn("Hour", hour(col("ScheduledTimestamp"))) 

df_weather_hourly = df_weather_impact.withColumn("WeatherDate", col("DATE"))\
    .withColumn("WeatherHour", hour(col("WeatherDate")))\
    .withColumn("DateOnly", to_timestamp(date_trunc("day", col("WeatherDate"))))


print("Łączenie z pogodą...")
df_joined = df_flights_fixed.join(
    df_weather_hourly,
    (to_timestamp(date_trunc("day", df_flights_fixed.ScheduledTimestamp)) == df_weather_hourly.DateOnly) & 
    (df_flights_fixed.Hour == df_weather_hourly.WeatherHour),
    "left"
)

print("Statystyki opóźnień w zależności od pogody (JFK):")
display(df_joined.select("DEPARTURE_DELAY", "HOURLYVISIBILITY", "HOURLYWindSpeed", "HOURLYPrecip").summary())

Łączenie z pogodą...
Statystyki opóźnień w zależności od pogody (JFK):


summary,DEPARTURE_DELAY,HOURLYVISIBILITY,HOURLYWindSpeed,HOURLYPrecip
count,64777.0,64777.0,64777.0,64777.0
mean,-1.2637201475832471,9.439413526405955,11.048535745712211,0.0019658212019698
stddev,5.264604985437456,1.8933584838420932,5.711735776783478,0.0180070521260744
min,-8.0,0.0,0.0,0.0
25%,-5.0,10.0,7.0,0.0
50%,-3.0,10.0,10.0,0.0
75%,0.0,10.0,15.0,0.0
max,17.0,10.0,37.0,1.67


In [0]:
df_joined.printSchema()

root
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- CANCELLED: integer (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- WEATHER_DELAY: integer (nullable = true)
 |-- ScheduledTimestamp: timestamp (nullable = true)
 |-- DeparturedTimestamp: timestamp (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- DATE: timestamp (nullable = true)
 |-- HOURLYPrecip: double (nullable = true)
 |-- HOURLYVISIBILITY: double (nullable = true)
 |-- HOURLYWindSpeed: double (nullable = true)
 |-- WeatherDate: timestamp (nullable = true)
 |-- WeatherHour: integer (nullable = true)
 |-- DateOnly: timestamp (nullable = true)



In [0]:
display(df_joined.limit(3))

DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DEPARTURE_DELAY,DISTANCE,CANCELLED,ORIGIN_AIRPORT,WEATHER_DELAY,ScheduledTimestamp,DeparturedTimestamp,Hour,DATE,HOURLYPrecip,HOURLYVISIBILITY,HOURLYWindSpeed,WeatherDate,WeatherHour,DateOnly
4,B6,939,0,1576,0,JFK,,2015-01-01T05:45:00Z,2015-01-01T05:45:00Z,5,2015-01-01T05:00:00Z,0.0,10.0,14.0,2015-01-01T05:00:00Z,5,2015-01-01T00:00:00Z
4,B6,353,-6,1028,0,JFK,,2015-01-01T06:00:00Z,2015-01-01T05:54:00Z,6,2015-01-01T06:00:00Z,0.0,10.0,16.0,2015-01-01T06:00:00Z,6,2015-01-01T00:00:00Z
4,B6,583,-3,944,0,JFK,,2015-01-01T06:00:00Z,2015-01-01T05:57:00Z,6,2015-01-01T06:00:00Z,0.0,10.0,16.0,2015-01-01T06:00:00Z,6,2015-01-01T00:00:00Z


In [0]:
df_weather_jfk = df_joined.drop("DATE", "DateOnly","WeatherDate","WeatherHour","Hour")


In [0]:
output_path = f"{base_path}/final_results/df_weather_jfk.csv"

df_weather_jfk.coalesce(1) \
    .write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv(output_path)

print(f"Zapisano do: {output_path}")


Zapisano do: dbfs:/FileStore/tables/df_weather_jfk.csv


In [0]:
output_path = f"{base_path}/final_results/df_weather_jfk_with_upper_outliers.csv"

df_weather_jfk.coalesce(1) \
    .write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv(output_path)

print(f"Zapisano do: {output_path}")



Zapisano do: abfss://data@newadbprojektkakastorage.dfs.core.windows.net/final_results/df_weather_jfk_with_upper_outliers.csv
