In [0]:
from src.utils.locate_filepaths import storage_filepaths
from pyspark.sql.functions import avg, when, col, unix_timestamp
from pyspark.sql.window import Window
from src.etl.climate.feature_engineering import average_column_over_window, evaluate_null_columns

# Read the DMI climate observations (Delta format) from the silver layer.
paths = storage_filepaths()
silver_climate_path = f"{paths['silver']}/dmi_climate_data"
df_climate = spark.read.format("delta").load(silver_climate_path)

# Collapse the hourly observations into one row per observed_date, averaging
# every feature over all stations and all observations of that date.
# Averaging across stations is acceptable because the covered area is small
# (northern Zealand + Copenhagen).
daily_aggregates = [
    avg(source_col).alias(daily_col)
    for source_col, daily_col in (
        ('sun_last1h_glob', 'daily_avg_sun'),
        ('temp_mean_past1h', 'daily_avg_air_temp'),
        ('temp_soil_mean_past1h', 'daily_avg_soil_temp'),
        ('precip_past1h', 'daily_avg_rain'),
    )
]
df_climate = df_climate.groupBy('observed_date').agg(*daily_aggregates)

# Add precipitation features over trailing windows of 7, 14 and 30 days
# (columns 7days_avg_rain, 14days_avg_rain and 30days_avg_rain), since
# precipitation over time is believed to have a significant influence on the
# water level.
# NOTE(review): the helper and column names indicate an average rather than a
# sum over the window - confirm in average_column_over_window.
for window_days in (7, 14, 30):
    df_climate = average_column_over_window(
        df=df_climate,
        exist_col_name='daily_avg_rain',
        new_col_name=f'{window_days}days_avg_rain',
        time_days=window_days,
    )

# Replace null values with the placeholder -999 and add a "not null" indicator
# column for each feature column, so that linear regression is able to
# disregard the entries that were null (now -999).
null_placeholder = -999
df_climate = evaluate_null_columns(df_climate, null_placeholder)


# Exporting to gold layer.
#df_climate.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(f"{paths['gold']}/climate_for_ml/")
#display(df_climate)

In [0]:
# Preview the final feature DataFrame in the notebook output.
# NOTE(review): `display` is not imported in this file; presumably it is
# supplied by the notebook runtime - confirm before running as a plain script.
display(df_climate)