In [0]:
# Configure Spark to reach the `jcystorage` storage account (dfs endpoint) using
# OAuth client credentials. The client secret is read from the `ws-scope` secret
# scope rather than being written in the notebook.
service_credential = dbutils.secrets.get(scope="ws-scope", key="weather-sp-secret")

# Settings are applied in the order listed (dicts preserve insertion order).
abfs_oauth_settings = {
    "fs.azure.account.auth.type.jcystorage.dfs.core.windows.net": "OAuth",
    "fs.azure.account.oauth.provider.type.jcystorage.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id.jcystorage.dfs.core.windows.net": "a2210f7a-e661-45c1-98f4-af64e6d2df9f",
    "fs.azure.account.oauth2.client.secret.jcystorage.dfs.core.windows.net": service_credential,
    "fs.azure.account.oauth2.client.endpoint.jcystorage.dfs.core.windows.net": "https://login.microsoftonline.com/a3b377f7-39e7-4ca3-bf66-ea94bbdcdbf3/oauth2/token",
}
for conf_key, conf_value in abfs_oauth_settings.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
from datetime import datetime, timedelta
from pyspark.sql.functions import from_json, col, inline
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, DoubleType

# Bookkeeping table: records which dates each pipeline step has handled.
spark.sql("""
CREATE TABLE IF NOT EXISTS processed_dates (
    processed_date DATE, 
    process_step STRING
)
""")

landing_dir = "abfss://weather-container-landing@jcystorage.dfs.core.windows.net/data/"
staging_dir = "abfss://weather-container-staging@jcystorage.dfs.core.windows.net/data/"

# Watermark for the staging step only. The table is keyed by process_step, so an
# unfiltered MAX() would let rows written by any other step move this step's
# watermark forward and hide dates that staging has not handled yet.
last_processed = spark.sql(
    "SELECT MAX(processed_date) FROM processed_dates WHERE process_step = 'staging'"
).collect()[0][0]

if last_processed is None:
    # Nothing recorded for staging yet: fall back to a 20-day look-back window.
    last_processed = datetime.now().date() - timedelta(days=20)


landing = dbutils.fs.ls(landing_dir)

# Keep only landing entries whose name parses as YYYY-MM-DD and is newer than the
# watermark.
dates_to_process = []
for entry in landing:
    try:
        dir_date = datetime.strptime(entry.name.strip('/'), "%Y-%m-%d").date()
    except ValueError:
        # Entry name is not a date folder; ignore it on purpose.
        continue
    if dir_date > last_processed:
        dates_to_process.append(dir_date)

# Listing order is not something this code controls; process oldest first so the
# MAX()-based watermark only ever moves forward through contiguous work.
dates_to_process.sort()

# Schema used to parse the JSON-encoded "data" column: an array of structs, each
# holding a string "date" and a double "temperature_2m" (both nullable).
_reading_struct = (
    StructType()
    .add("date", StringType(), True)
    .add("temperature_2m", DoubleType(), True)
)
array_schema = ArrayType(_reading_struct)
 
# Move each pending date from landing to staging:
#   read the day's JSON -> parse and explode the "data" array -> split the
#   timestamp string into temp_date / temp_hour -> de-duplicate -> write -> record.
#
# Dates are handled oldest-first and the loop stops at the first failure. The
# watermark is MAX(processed_date), so recording a later date after an earlier one
# failed would cause the failed date to be skipped on every subsequent run.
for date in sorted(dates_to_process):
    try:
        # rstrip('/') avoids a double slash, since the base dirs already end in '/'.
        raw_weather_df = spark.read.json(f"{landing_dir.rstrip('/')}/{date}")
        weather_df_parsed = raw_weather_df.withColumn("data", from_json(col("data"), array_schema))
        weather_df_inline = weather_df_parsed.select("latitud", "longitud", inline(col("data")))

        # Column slicing in PySpark is substr(start, length) with 1-based start, not
        # Python slicing; spelled out here to make that explicit.
        # Presumably "date" looks like "YYYY-MM-DDTHH:MM[:SS]" - TODO confirm against landing data.
        weather_df_formated = (
            weather_df_inline
            .withColumn("temp_date", col("date").substr(1, 10))   # first 10 characters
            .withColumn("temp_hour", col("date").substr(12, 8))   # up to 8 characters from position 12
            .drop("date")
        )
        weather_df_ready_to_staging = weather_df_formated.dropDuplicates()

        # A failed write must not fall through to the INSERT below, otherwise the
        # date is marked as processed although nothing reached staging.
        weather_df_ready_to_staging.write.mode("overwrite").save(f"{staging_dir.rstrip('/')}/{date}")

        # Typed DATE literal so the insert does not depend on an implicit
        # string-to-date conversion. `date` is a datetime.date, rendered as YYYY-MM-DD.
        spark.sql(f"""INSERT INTO processed_dates (processed_date, process_step) VALUES (DATE'{date}', 'staging')""")

    except Exception as e:
        print(f"Error processing {date}: {e}")
        print(f"Stopping at {date}; this and any later dates will be retried on the next run.")
        break
