# Transform orders data - string to JSON
1. Preprocess the JSON string to fix the data quality issues
2. Transform the JSON string to a JSON object
3. Write the transformed data to the silver schema

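## 1. Preprocess the JSON string to fix the data quality issues
The fix below wraps unquoted `order_date` values (a bare `yyyy-mm-dd` after `"order_date": `) in double quotes so the string can be parsed as JSON.

Documentation: [regexp_replace](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/functions/regexp_replace)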

In [0]:
import pyspark.sql.functions as F


# Read the bronze orders table; the JSON text to repair lives in its `value` column.
df_orders = spark.table("gizmobox.bronze.py_orders")

# The pattern matches an order_date whose yyyy-mm-dd value is NOT wrapped in
# quotes and captures the date; the replacement writes the same date back
# surrounded by double quotes. Values that are already quoted do not match the
# pattern and are left untouched.
unquoted_date_pattern = '"order_date": (\\d{4}-\\d{2}-\\d{2})'
quoted_date_replacement = '"order_date": "$1"'

fixed_value_col = F.regexp_replace(
    F.col("value"),
    unquoted_date_pattern,
    quoted_date_replacement,
).alias("fixed_value")

# Keep only the repaired JSON string; downstream cells read `fixed_value`.
df_orders_fixed = df_orders.select(fixed_value_col)

display(df_orders_fixed)

## 2. Transform JSON string to JSON object
* Function [schema_of_json](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/functions/schema_of_json)
* Function [from_json](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/functions/from_json)

In [0]:
# SQL equivalent, kept for reference:
# %sql
# select
#   schema_of_json(fixed_value)
# from tv_orders_fixed
# limit 1;

# Ask Spark to describe the structure of the repaired JSON string as a DDL
# schema; the result is copied into the from_json call further down.
# NOTE(review): the PySpark docs describe schema_of_json as taking a JSON string
# literal or a foldable column — confirm that passing a per-row column is
# supported on the target runtime.
inferred_schema_col = F.schema_of_json(F.col('fixed_value'))
schema = df_orders_fixed.select(inferred_schema_col)

# Only the first row is displayed.
display(schema.limit(1))

In [0]:
# SQL equivalent, kept for reference:
# %sql
# select
#   from_json(
#     fixed_value,
#     'STRUCT<customer_id: BIGINT, items: ARRAY<STRUCT<category: STRING, details: STRUCT<brand: STRING, color: STRING>, item_id: BIGINT, name: STRING, price: BIGINT, quantity: BIGINT>>, order_date: STRING, order_id: BIGINT, order_status: STRING, payment_method: STRING, total_amount: BIGINT, transaction_timestamp: STRING>') -- provide the schema from schema_of_json 
#     AS json_value
# from tv_orders_fixed;

# DDL schema used to parse each order document (the SQL reference above notes it
# is the schema produced by schema_of_json).
order_schema = """
STRUCT<customer_id: BIGINT, items: ARRAY<STRUCT<category: STRING, details: STRUCT<brand: STRING, color: STRING>, item_id: BIGINT, name: STRING, price: BIGINT, quantity: BIGINT>>, order_date: STRING, order_id: BIGINT, order_status: STRING, payment_method: STRING, total_amount: BIGINT, transaction_timestamp: STRING>
"""

# Parse the repaired JSON text into a single struct column named `json_value`.
json_value_col = F.from_json(F.col('fixed_value'), order_schema).alias('json_value')
df_orders_parsed = df_orders_fixed.select(json_value_col)

# Preview the first ten parsed rows.
display(df_orders_parsed.limit(10))

## 3. Write transformed data to the silver schema

In [0]:
# Persist the parsed orders to the silver table, creating it or replacing any
# existing table of the same name.
silver_orders_table = 'gizmobox.silver.py_orders_json'
silver_writer = df_orders_parsed.writeTo(silver_orders_table)
silver_writer.createOrReplace()

In [0]:
%sql
-- Read back the silver table written by the previous cell.
SELECT *
FROM gizmobox.silver.py_orders_json;