# Transform orders data - Explode arrays
1. Access elements from the JSON object
2. Deduplicate array elements
3. Explode arrays
4. Write the transformed data to Silver schema

In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Source table for this notebook; the cells below read nested fields from its
# `json_value` column.
df_orders = spark.read.table('gizmobox.silver.py_orders_json')

# Preview at most 10 rows rather than the whole table.
display(df_orders.limit(10))

## 1. Access elements from the JSON object & deduplicate item arrays

In [0]:
# Promote the order-level fields nested under `json_value` to top-level
# columns, each keeping its own field name. The `items` array goes through
# array_distinct so that repeated entries within one order's array collapse
# to a single entry.
df_orders_normalized = df_orders.select(
  *(
    F.col(f'json_value.{field}').alias(field)
    for field in (
      'order_id',
      'order_status',
      'payment_method',
      'total_amount',
      'transaction_timestamp',
      'customer_id',
    )
  ),
  F.array_distinct(F.col('json_value.items')).alias('items'),
)

display(df_orders_normalized.limit(10))


## 3. Explode arrays
The [explode](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/functions/explode) function returns one row for each element of an array.

In [0]:
# explode() emits one output row per element of `items`; the order-level
# columns are carried along unchanged on each of those rows.
# NOTE(review): explode() produces no rows for a null or empty array, so an
# order without items would drop out here - confirm that is intended.
df_orders_normalized = df_orders_normalized.select(
  [
    F.col('order_id'),
    F.col('order_status'),
    F.col('payment_method'),
    F.col('total_amount'),
    F.col('transaction_timestamp'),
    F.col('customer_id'),
    F.explode(F.col('items')).alias('item'),
  ]
)

display(df_orders_normalized.limit(10))


In [0]:
# Flatten the exploded `item` column: every nested field becomes a top-level
# column named after the field, next to the unchanged order-level columns.
df_orders_normalized = df_orders_normalized.select(
  'order_id',
  'order_status',
  'payment_method',
  'total_amount',
  'transaction_timestamp',
  'customer_id',
  # Fields directly under `item`.
  *(
    F.col(f'item.{field}').alias(field)
    for field in ('item_id', 'name', 'price', 'quantity', 'category')
  ),
  # Fields one level deeper, under `item.details`.
  *(
    F.col(f'item.details.{field}').alias(field)
    for field in ('brand', 'color')
  ),
)

display(df_orders_normalized.limit(10))

## 4. Write the transformed data to Silver schema

In [0]:
# Persist the flattened orders. createOrReplace() creates the table, or
# replaces it when it already exists.
(
  df_orders_normalized
  .writeTo('gizmobox.silver.py_orders')
  .createOrReplace()
)

In [0]:
%sql
-- Read back every column of gizmobox.silver.py_orders to check the written result.
SELECT *
FROM gizmobox.silver.py_orders;