In [0]:
from src.utils.locate_filepaths import storage_filepaths
from pyspark.sql.functions import avg, when, col


# Read the climate observations from the silver layer (Delta format).
paths = storage_filepaths()
silver_climate_path = f"{paths['silver']}/dmi_climate_data"
df_climate = spark.read.format("delta").load(silver_climate_path)

# Collapse the observations into one row per observed_date, averaging each
# feature across all stations. Averaging across stations is acceptable because
# the covered area is not large (northern Zealand + Copenhagen).
# Maps each source column to the name of its daily-average output column.
daily_avg_aliases = {
    'sun_last1h_glob': 'daily_avg_sun',
    'temp_mean_past1h': 'daily_avg_air_temp',
    'temp_soil_mean_past1h': 'daily_avg_soil_temp',
    'precip_past1h': 'daily_avg_rain',
}
daily_avg_exprs = [
    avg(source_col).alias(daily_col)
    for source_col, daily_col in daily_avg_aliases.items()
]
df_climate = df_climate.groupBy('observed_date').agg(*daily_avg_exprs)

# Add a "<feature>_notnull" indicator (1 = value present, 0 = value missing)
# for every feature column, then replace the remaining nulls with the -999
# sentinel. The indicators must be derived BEFORE the fill, otherwise every
# value would look present. They let the downstream linear regression
# disregard the -999 placeholders.
#
# All indicators are added in a single select() rather than one withColumn()
# call per column: each withColumn() introduces an extra projection, and
# chaining them in a loop needlessly grows the query plan.
feature_cols = [c for c in df_climate.columns if c != 'observed_date']
df_climate = df_climate.select(
    "*",
    *[
        when(col(c).isNotNull(), 1).otherwise(0).alias(f"{c}_notnull")
        for c in feature_cols
    ],
)
# fillna with a numeric value only touches numeric columns; the indicator
# columns are never null, so only the feature averages are affected.
# NOTE(review): assumes observed_date is not a numeric column — confirm.
df_climate = df_climate.fillna(-999)

# Persist the result to the gold layer as a Delta table, replacing any
# existing data and allowing the stored schema to be overwritten.
gold_climate_path = f"{paths['gold']}/climate_for_ml/"
(
    df_climate.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(gold_climate_path)
)
#display(df_climate)