### READ JSON DATA (BATCH)

In [0]:
_df = spark.read.format('json') \
			.option('inferschema', True) \
			.option('multiLine',True) \
			.load('/Volumes/workspace/stream/streamingvolume/jsonsource')

In [0]:
_df.display()

customer,items,metadata,order_id,payment,timestamp
"List(List(Toronto, Canada, M5H 2N2), 501, john@example.com, John Doe)","List(List(I100, 25.99, Wireless Mouse, 2), List(I101, 15.49, USB-C Adapter, 1))","List(List(campaign, back_to_school), List(channel, email))",ORD1001,"List(Credit Card, TXN7890)",2025-06-01T10:15:00Z


In [0]:
_df.printSchema()

root
 |-- customer: struct (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- postal_code: string (nullable = true)
 |    |-- customer_id: long (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- item_id: string (nullable = true)
 |    |    |-- price: double (nullable = true)
 |    |    |-- product_name: string (nullable = true)
 |    |    |-- quantity: long (nullable = true)
 |-- metadata: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- payment: struct (nullable = true)
 |    |-- method: string (nullable = true)
 |    |-- transaction_id: string (nullable = true)


In [0]:
_df.select('customer.customer_id','customer.name','customer.address.city','customer.address.country','order_id','timestamp').display()

customer_id,name,city,country,order_id,timestamp
501,John Doe,Toronto,Canada,ORD1001,2025-06-01T10:15:00Z


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.types import *
#
_df.select('items','customer.customer_id','customer.name','customer.address.city','customer.address.country','order_id','timestamp') \
	.withColumn('items',explode_outer(col('items'))) \
	.withColumn('item_id', col('items.item_id')) \
	.withColumn('item_price', col('items.price')) \
	.select('*') \
	.display()

items,customer_id,name,city,country,order_id,timestamp,item_id,item_price
"List(I100, 25.99, Wireless Mouse, 2)",501,John Doe,Toronto,Canada,ORD1001,2025-06-01T10:15:00Z,I100,25.99
"List(I101, 15.49, USB-C Adapter, 1)",501,John Doe,Toronto,Canada,ORD1001,2025-06-01T10:15:00Z,I101,15.49


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.types import *
#
_df.select('*') \
	.withColumn('metadata',explode_outer(col('metadata'))) \
	.withColumn('metadata_key', col('metadata.key')) \
	.withColumn('metadata_value', col('metadata.key')) \
	.drop(col('metadata')) \
	.select('*') \
	.select(['order_id','timestamp','metadata_key','metadata_value']) \
	.show(truncate=False)

+--------+--------------------+------------+--------------+
|order_id|timestamp           |metadata_key|metadata_value|
+--------+--------------------+------------+--------------+
|ORD1001 |2025-06-01T10:15:00Z|campaign    |campaign      |
|ORD1001 |2025-06-01T10:15:00Z|channel     |channel       |
+--------+--------------------+------------+--------------+



# Let's Read Streamimg Data 😁

In [0]:
## instead using .option('inferschema', True), we'll use:
# spark.conf.set('spark.sql.streaming.schemaInference', True)
#
## ERROR = [CONFIG_NOT_AVAILABLE] Configuration spark.sql.streaming.schemaInference is not available. SQLSTATE: 42K0I

###### for error:
<br>

```plaintext
[INFINITE_STREAMING_TRIGGER_NOT_SUPPORTED] Trigger type ProcessingTime is not supported for this cluster type.
Use a different trigger type e.g. AvailableNow, Once. SQLSTATE: 0A000
```

###### you need to use an alternative trigger type that is compatible with your cluster. The error message specifically suggests `AvailableNow` or `Once`.

- `AvailableNow`: This trigger processes all data that is available at the start of the query and then stops. If new data arrives later, it will not be processed until the next time the query is restarted with AvailableNow. This is suitable for incremental batch processing where you want to process all currently available data in a single run.

```python
.trigger(availableNow=True)
```

- `Once`: Similar to `AvailableNow`, this trigger processes a single batch of available data and then stops. It is considered deprecated in favor of `AvailableNow` for most use cases, but it can still be used if explicitly required for a single-batch execution.

```python
.trigger(once=True)
```

In [0]:


#
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.types import *
#
_jsonSchema = \
StructType([StructField('customer'
			,StructType([StructField('address'
				,StructType([StructField('city',StringType(),True)
							,StructField('country',StringType(),True)
							,StructField('postal_code',StringType(),True)]),True)
				,StructField('customer_id',LongType(),True)
				,StructField('email',StringType(),True)
				,StructField('name',StringType(),True)]),True)
			,StructField('items'
					,ArrayType(StructType(
								[	StructField('item_id',StringType(),True)
									,StructField('price',DoubleType(),True)
									,StructField('product_name',StringType(),True)
									,StructField('quantity',LongType(),True)]),True),True)
			,StructField('metadata'
					,ArrayType(StructType(
								[	StructField('key',StringType(),True)
									,StructField('value',StringType(),True)]),True),True)
			,StructField('order_id',StringType(),True)
			,StructField('payment',StructType(
										[StructField('method',StringType(),True)
										,StructField('transaction_id',StringType(),True)]),True)
			,StructField('timestamp',StringType(),True)])
#
#
_df = spark.readStream.format('json') \
			.option('multiLine',True) \
			.schema(_jsonSchema) \
			.load('/Volumes/workspace/stream/streamingvolume/jsonsource')
#
_dfToWrite = _df.select('*') \
	.withColumn('items',explode_outer(col('items'))) \
	.withColumn('item_id', col('items.item_id')) \
	.withColumn('item_price', col('items.price')) \
	.withColumn('product_name', col('items.product_name')) \
	.withColumn('quantity', col('items.quantity')) \
	.drop(col('items')) \
	.withColumn('metadata',explode_outer(col('metadata'))) \
	.withColumn('metadata_key', col('metadata.key')) \
	.withColumn('metadata_value', col('metadata.value')) \
	.drop(col('metadata')) \
	.withColumn('payment_method', col('payment.method')) \
	.withColumn('transaction_id', col('payment.transaction_id')) \
	.drop(col('payment')) \
	.select(	[
						'customer.address.city'
					,	'customer.address.country'
					,	'customer.address.postal_code'
					,	'customer.customer_id'
					,	'customer.email'
					,	'customer.name'
					,	'item_id'
					,	'item_price'
					,	'product_name'
					,	'quantity'
					,	'order_id'
					,	'payment_method'
					,	'transaction_id'
					,	'timestamp'
				])
#
_dfToWrite.writeStream.format('delta') \
			.outputMode('append') \
			.trigger(once=True) \
			.option('path','/Volumes/workspace/stream/streamingvolume/jsonsink/Data') \
			.option('checkpointLocation','/Volumes/workspace/stream/streamingvolume/jsonsink/Checkpoint') \
			.start()
#

<pyspark.sql.connect.streaming.query.StreamingQuery at 0xff8b65681510>

In [0]:
%sql
SELECT * FROM delta.`/Volumes/workspace/stream/streamingvolume/jsonsink/Data/`

city,country,postal_code,customer_id,email,name,item_id,item_price,product_name,quantity,order_id,payment_method,transaction_id,timestamp
Toronto,Canada,M5H 2N2,501,john@example.com,John Doe,I100,25.99,Wireless Mouse,2,ORD1001,Credit Card,TXN7890,2025-06-01T10:15:00Z
Toronto,Canada,M5H 2N2,501,john@example.com,John Doe,I100,25.99,Wireless Mouse,2,ORD1001,Credit Card,TXN7890,2025-06-01T10:15:00Z
Toronto,Canada,M5H 2N2,501,john@example.com,John Doe,I101,15.49,USB-C Adapter,1,ORD1001,Credit Card,TXN7890,2025-06-01T10:15:00Z
Toronto,Canada,M5H 2N2,501,john@example.com,John Doe,I101,15.49,USB-C Adapter,1,ORD1001,Credit Card,TXN7890,2025-06-01T10:15:00Z


In [0]:
#
_dfToWrite.writeStream.format('delta') \
			.outputMode('append') \
			.trigger(once=True) \
			.option('path','/Volumes/workspace/stream/streamingvolume/jsonsink/Data') \
			.option('checkpointLocation','/Volumes/workspace/stream/streamingvolume/jsonsink/Checkpoint') \
			.start()
#

<pyspark.sql.connect.streaming.query.StreamingQuery at 0xff8b855c5e50>

In [0]:
%sql
SELECT * FROM delta.`/Volumes/workspace/stream/streamingvolume/jsonsink/Data/`

city,country,postal_code,customer_id,email,name,item_id,item_price,product_name,quantity,order_id,payment_method,transaction_id,timestamp
Toronto,Canada,M5H 2N2,501,john@example.com,John Doe,I100,25.99,Wireless Mouse,2,ORD1001,Credit Card,TXN7890,2025-06-01T10:15:00Z
Toronto,Canada,M5H 2N2,501,john@example.com,John Doe,I100,25.99,Wireless Mouse,2,ORD1001,Credit Card,TXN7890,2025-06-01T10:15:00Z
Toronto,Canada,M5H 2N2,501,john@example.com,John Doe,I101,15.49,USB-C Adapter,1,ORD1001,Credit Card,TXN7890,2025-06-01T10:15:00Z
Toronto,Canada,M5H 2N2,501,john@example.com,John Doe,I101,15.49,USB-C Adapter,1,ORD1001,Credit Card,TXN7890,2025-06-01T10:15:00Z
Vancouver,Canada,V5K 0A1,502,alice@example.com,Alice Smith,I102,45.0,Bluetooth Keyboard,1,ORD1002,PayPal,TXN7891,2025-06-01T10:30:00Z
Vancouver,Canada,V5K 0A1,502,alice@example.com,Alice Smith,I102,45.0,Bluetooth Keyboard,1,ORD1002,PayPal,TXN7891,2025-06-01T10:30:00Z
