In [0]:
from config.paths import SECRETS_FILE
from src.utils.locate_filepaths import storage_filepaths
import json
from src.etl.climate.extract import extract_dmi_metObs
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import current_timestamp, col
from datetime import datetime, timezone


# The 'source' widget carries the name of whatever triggered this run
# (empty string when nothing was passed in).
dbutils.widgets.text("source", "")
source = dbutils.widgets.get("source")
print(f"Notebook triggered by: {source}")

# Storage locations for this project; the 'bronze' entry is the write target below.
filepaths = storage_filepaths()

# Parse the JSON secrets file; the DMI metObs API key is read from it further down.
with open(SECRETS_FILE) as secrets_file:
    secrets = json.load(secrets_file)

# Explicit schema for one DMI observation record. Every column is nullable;
# timestamps are kept as the raw strings delivered by the API.
schema = (
    StructType()
    .add('created', StringType(), True)
    .add('observed', StringType(), True)
    .add('parameterId', StringType(), True)
    .add('stationId', StringType(), True)
    .add('value', DoubleType(), True)
)

# --- DMI metObs request settings -------------------------------------------
url = "https://dmigw.govcloud.dk/v2/metObs/collections/observation/items"
parameterIds = ["sun_last1h_glob", "temp_mean_past1h", "temp_soil_mean_past1h", 'precip_past1h']
bbox = "12.3,55.6,12.7,56.0"
# NOTE(review): `period` is assigned but never passed to extract_dmi_metObs below;
# the time window is controlled solely by `today`. Confirm whether it is still needed.
period = 'latest-month'
limit = 100000
secret_key = secrets["metObs-api-key"]

# Build an open-ended interval "../<today at 00:00:00 UTC>" in ISO 8601 with a
# trailing 'Z', i.e. everything observed up to (UTC) midnight of the current day.
today = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
today = today.strftime('%Y-%m-%dT%H:%M:%SZ')
today = f'../{today}'

# --- Fetch --------------------------------------------------------------------
# Collect the 'properties' dict of every returned feature for all parameters
# first, then build a single DataFrame. This avoids chaining one union per
# parameter onto an empty seed DataFrame while producing the same rows.
climate_records = []
for parameter_id in parameterIds:
    data = extract_dmi_metObs(secret_key=secret_key, url=url, parameterId=parameter_id, bbox=bbox, datetime=today, limit=limit)
    features = data.get('features')
    if features is None:
        # Fail loudly (and before anything is written) instead of hitting an
        # opaque "'NoneType' object is not iterable" on the next line.
        raise ValueError(
            f"DMI response for parameterId '{parameter_id}' contains no 'features' entry"
        )
    climate_records.extend(item['properties'] for item in features)

df_climate_dmi = spark.createDataFrame(climate_records, schema)

# --- Write to bronze ----------------------------------------------------------
# NOTE(review): current_timestamp() is evaluated when an action runs, and the
# output is partitioned by that timestamp while being written with
# mode('overwrite'). Unless dynamic partition overwrite is configured, each run
# replaces the previous extract at this path — confirm that this is intended.
df_climate_dmi = df_climate_dmi.withColumn("ingestion_timestamp", current_timestamp())
df_climate_dmi.write.mode('overwrite').partitionBy("ingestion_timestamp").parquet(f"{filepaths['bronze']}/dmi_climate/daily_extract/")

In [0]:
# Sanity check: total number of observation rows gathered across all parameterIds.
df_climate_dmi.count()

In [0]:
# Preview the rows with the earliest 'observed' values. `col` is already imported
# in the notebook's first cell, so it is not re-imported here.
# 'observed' is a string column in the schema, so this is a lexicographic sort.
df_climate_dmi.orderBy(col("observed").asc()).show()