In [0]:
# Retrieve the task values published by the upstream "Bronze" task.
bronze_output = dbutils.jobs.taskValues.get(taskKey="Bronze", key="bronze_output")

# Unpack the values this notebook needs; a missing key falls back to "" so
# that it can be reported by name in the check below.
start_date = bronze_output.get("start_date", "")
bronze_adls = bronze_output.get("bronze_adls", "")
silver_adls = bronze_output.get("silver_adls", "")

# Fail fast on missing/empty values. These strings are spliced directly into
# the storage paths built later in this notebook, so an empty value would
# produce a malformed path (e.g. an empty silver base path turns the output
# location into the relative folder "earthquake_events_silver/").
missing_values = [
    name
    for name, value in (
        ("start_date", start_date),
        ("bronze_adls", bronze_adls),
        ("silver_adls", silver_adls),
    )
    if not value
]
if missing_values:
    raise ValueError(
        "Bronze task output is missing required value(s): "
        + ", ".join(missing_values)
    )

print(f"Start Date: {start_date}, Bronze ADLS: {bronze_adls}, Silver ADLS: {silver_adls}")


[0;36m  File [0;32m<command-4647007806381645>, line 10[0;36m[0m
[0;31m    print(f"{}")[0m
[0m               ^[0m
[0;31mSyntaxError[0m[0;31m:[0m f-string: empty expression not allowed


In [0]:
from pyspark.sql.functions import when, isnull, col
from pyspark.sql.types import TimestampType
from datetime import date, timedelta

In [0]:
# Load this run's bronze file. The reader is configured with the "multiline"
# option so that a JSON record may span several lines.
bronze_file_path = f"{bronze_adls}{start_date}_earthquake_data.json"
json_reader = spark.read.option("multiline", "true")
df = json_reader.json(bronze_file_path)

In [0]:
# Peek at the first row of the freshly loaded data as a quick sanity check.
df.head()

In [0]:
# Reshaping data: flatten each nested event record into one flat row.

# The three positional values all come from the same array field.
# NOTE(review): index 2 is exposed as 'elevation'; in GeoJSON earthquake feeds
# the third coordinate is usually a depth value - confirm the intended meaning.
coordinates = col('geometry.coordinates')
coordinate_columns = [
    coordinates.getItem(position).alias(name)
    for position, name in enumerate(('longitude', 'latitude', 'elevation'))
]

# (nested source field, output column name), listed in output order.
property_fields = (
    ('properties.title', 'title'),
    ('properties.place', 'place_description'),
    ('properties.mag', 'mag'),
    ('properties.sig', 'sig'),
    ('properties.magType', 'magType'),
    ('properties.updated', 'updated'),
    ('properties.time', 'time'),
)
property_columns = [col(source).alias(name) for source, name in property_fields]

# Output column order: id, the coordinate columns, then the property columns.
df = df.select('id', *coordinate_columns, *property_columns)



In [0]:
# Peek at the first row after flattening to confirm the new column layout.
df.head()

In [0]:
# Data validation: replace missing (null) values with 0 in the columns below.
# Note that a null 'time' therefore becomes 0 before the unix-time conversion
# performed later in this notebook.

def _zero_if_null(column_name):
    """Return an expression yielding 0 where `column_name` is null, else its value."""
    return when(isnull(col(column_name)), 0).otherwise(col(column_name))


for required_column in ('longitude', 'latitude', 'time'):
    df = df.withColumn(required_column, _zero_if_null(required_column))

In [0]:
# Converting time from unix: each value is divided by 1000 (i.e. it is treated
# as milliseconds) and the resulting seconds value is cast to a timestamp.

for timestamp_column in ('time', 'updated'):
    epoch_seconds = col(timestamp_column) / 1000
    df = df.withColumn(timestamp_column, epoch_seconds.cast(TimestampType()))

In [0]:
# Peek at the first row after the timestamp conversion to verify the result.
df.head()

In [0]:
# Output location for this notebook's result: the silver base path followed by
# a fixed folder name. No separator is inserted, so `silver_adls` is assumed to
# already end with "/" - TODO confirm against the Bronze task's output.
silver_output_path = f"{silver_adls}earthquake_events_silver/"

In [0]:
# Persist the cleaned events as Parquet, adding to whatever is already stored
# at the output location.
# NOTE(review): with "append", re-running this notebook for the same start date
# presumably writes the same rows again - confirm that consumers de-duplicate.
silver_writer = df.write.mode("append")
silver_writer.parquet(silver_output_path)

In [0]:
# Publish the silver output path as a task value under the key "silver_output"
# (presumably for later tasks in the same job to read, mirroring how this
# notebook reads "bronze_output" from the Bronze task).
dbutils.jobs.taskValues.set(key = "silver_output", value = silver_output_path)