In [1]:
from delta import *
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType, LongType
from datetime import datetime
import json
from azure.storage.blob import BlobServiceClient
from notebookutils import mssparkutils
# SFTP
import paramiko
import gzip
import io
from pyspark.sql.functions import col, when, array, explode, expr
import re
from delta.tables import DeltaTable
from pyspark.sql.utils import AnalysisException

In [12]:
%run /utils/common_functions

In [None]:
def get_json_keys(schema, prefix=""):
    """Return the dotted path of every leaf field in a (possibly nested) schema.

    Struct fields, and array fields whose elements are structs, are descended
    into recursively. Every other field (including arrays of non-struct
    elements) is reported as a leaf.

    Args:
        schema: A ``StructType``, i.e. an object exposing ``fields`` whose
            items carry ``name`` and ``dataType``.
        prefix: Dotted path prepended to every returned key. Any falsy value
            (the default is the empty string) yields keys with no prefix.

    Returns:
        list[str]: One dotted path per leaf field, in schema order.
    """
    keys = []
    for field in schema.fields:
        # The path is built the same way whether the field is a leaf or a container.
        path = f"{prefix}.{field.name}" if prefix else field.name

        data_type = field.dataType
        if isinstance(data_type, ArrayType) and isinstance(data_type.elementType, StructType):
            # Array of structs: recurse into the element struct; the array
            # itself contributes no extra path segment.
            data_type = data_type.elementType

        if isinstance(data_type, StructType):
            keys.extend(get_json_keys(data_type, path))
        else:
            keys.append(path)
    return keys

In [None]:
# TODO(review): `mParticle_raw_df` is not defined in any cell visible in this
# notebook; presumably it comes from an earlier/deleted cell or from the
# %run'd utilities - confirm before relying on this cell.

# Infer the schema dynamically by letting the JSON reader scan the raw `value` strings
dynamic_schema = (
    spark.read
    .json(mParticle_raw_df.rdd.map(lambda record: record.value))
    .schema
)

# Dotted paths, rooted at "json_construct", that can be used to access
# specific fields in the parsed JSON data
mParticle_cols = get_json_keys(dynamic_schema, "json_construct")

# Convert the JSON strings to structured data, keeping only the parsed column
mParticle_df = (
    mParticle_raw_df
    .withColumn("json_construct", from_json(col("value"), dynamic_schema))
    .select("json_construct")
)

In [5]:
# Read a regular file (RatingValues is Struct) and an array file (RatingValues is Array(Struct)).
# abfss:// URIs address the ADLS Gen2 `dfs` endpoint, so both paths use it
# (consistent with every other abfss path in this notebook).
regular_file_path = "abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/date=20250307/bv_sweatybetty_incremental_standard_client_feed_20250305.xml.gz"
array_file_path = "abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/date=20250308/bv_sweatybetty_incremental_standard_client_feed_20250306.xml.gz"


def read_product_rows(path):
    """Load an XML feed as a DataFrame with one row per <Product> element.

    No schema is supplied, so the XML reader infers one from the file.

    Args:
        path: Location of the XML (optionally gzipped) feed file.

    Returns:
        DataFrame: The Product rows read from ``path``.
    """
    print(f"Reading XML from file {path}...")
    return (
        spark.read.format("xml")
        .option("rowTag", "Product")
        .load(path)
    )


# Get the Product elements from each file
df_raw_product_regular = read_product_rows(regular_file_path)
df_raw_product_array = read_product_rows(array_file_path)

# Reviews **********************************************************************
# Explode the reviews and filter to only get the Product attributes we need
#df_reviews = df_raw_product.select("_id", "_disabled", "_removed", explode(col("Reviews.Review")).alias("product_reviews"))



In [6]:
# Explode the reviews: one output row per entry of Reviews.Review, carrying the
# product-level _id / _disabled / _removed columns along with it.
# The same projection is applied to both raw frames (regular first, then array).
df_product_regular_exploded, df_product_array_exploded = (
    raw_products.select(
        "_id",
        "_disabled",
        "_removed",
        explode(col("Reviews.Review")).alias("product_reviews"),
    )
    for raw_products in (df_raw_product_regular, df_raw_product_array)
)

In [7]:
# Now we can explore normalizing the schema.
# Print the inferred schema of the exploded reviews from the regular file
# (the file in which RatingValues is expected to be a Struct).
df_product_regular_exploded.printSchema()

In [8]:
# Print the inferred schema of the exploded reviews from the array file
# (the file in which RatingValues is expected to be an Array(Struct)),
# for comparison with the regular-file schema.
df_product_array_exploded.printSchema()

In [18]:
# Two earlier attempts at wrapping a single-struct RatingValue into an array,
# kept commented out for reference (neither is executed).

# Attempt 1: DataFrame API with when/otherwise.
# df_final_regular = df_product_regular_exploded.select(
#     #col("*"),  # Keep all other columns
#     when(
#         col("product_reviews.RatingValues.RatingValue").isNotNull() & (size(col("product_reviews.RatingValues.RatingValue")) == 0),
#         array(col("product_reviews.RatingValues.RatingValue"))
#     ).otherwise(
#         col("product_reviews.RatingValues.RatingValue")
#     ).alias("rating_values")
# )

# Attempt 2: SQL CASE on typeof().
# df_final_regular = df_product_regular_exploded.selectExpr(
#     "*", 
#     """CASE 
#         WHEN typeof(product_reviews.RatingValues.RatingValue) = 'struct' 
#         THEN array(product_reviews.RatingValues.RatingValue) 
#         ELSE product_reviews.RatingValues.RatingValue 
#        END AS rating_values"""
# )

# Current approach: serialize RatingValue to a JSON string, presumably so the
# struct and array-of-struct variants end up with the same column type.
# Only the regular-file frame is processed here, and only the `rating_values`
# column is kept.
df_regular_normalized = df_product_regular_exploded.select(
    to_json(col("product_reviews.RatingValues.RatingValue")).alias("rating_values")
)

# Explore the application of schema to XML

## Create an XML document with some elements that could be an array or a struct

In [18]:
# This doc has the key characteristics we need to address:
# 1) sub-elements that could be inferred as a single-element struct or an array of structs
#    (e.g. P1001 has two <Category>/<Tag> children, P1002 has one of each)
# 2) sub-elements that have variable structure (semi-structured)
#    (e.g. <Specifications> holds different children in each Product)
xml_doc = """<Products>
    <Product id="P1001" status="active">
        <Name>Wireless Mouse</Name>
        <Price currency="USD">29.99</Price>
        <Categories>
            <Category>Electronics</Category>
            <Category>Computer Accessories</Category>
        </Categories>
        <Tags>
            <Tag>wireless</Tag>
            <Tag>mouse</Tag>
        </Tags>
        <Specifications>
            <Width>6.2</Width>
            <Height>3.4</Height>
            <Depth>1.5</Depth>
            <Weight unit="grams">85</Weight>
        </Specifications>
        <Manufacturer name="TechCo" country="USA" />
    </Product>
    <Product id="P1002" status="inactive">
        <Name>USB-C Hub</Name>
        <Price currency="USD">49.95</Price>
        <Categories>
            <Category>Computer Accessories</Category>
        </Categories>
        <Tags>
            <Tag>usb-c</Tag>
        </Tags>
        <Specifications>
            <Ports>4</Ports>
            <SupportedOS>
                <OS>Windows</OS>
                <OS>MacOS</OS>
                <OS>Linux</OS>
            </SupportedOS>
        </Specifications>
        <Manufacturer name="GizmoCorp" country="China" />
    </Product>
</Products>
"""

In [19]:
# Write the sample XML to a temp file in the lake so the Spark XML reader can
# load it from a path.
# `mssparkutils` is already imported in the notebook's first (imports) cell,
# so it is not re-imported here.
abfs_path = "abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/temp/peter_test.xml"

# Write the XML string to the ABFS path; overwrite=True makes the cell safe to re-run
mssparkutils.fs.put(abfs_path, xml_doc, overwrite=True)

In [20]:
# Explicit schema for the sample Product rows (originally generated).
# Variable structure is handled by declaring the union of all variants: every
# child seen under <Specifications> in either Product is listed, and all fields
# are nullable so a Product that lacks one simply yields null.
# Repeating children (Category, Tag, OS) are always declared as arrays.
# NOTE(review): "_VALUE" is presumably the XML reader's default name for an
# element's text when the element also has attributes - confirm.

from pyspark.sql.types import *

product_schema = (
    StructType()
    .add("@id", StringType(), True)
    .add("@status", StringType(), True)
    .add("Name", StringType(), True)
    .add(
        "Price",
        StructType()
        .add("@currency", StringType(), True)
        .add("_VALUE", StringType(), True),
        True,
    )
    .add(
        "Categories",
        StructType().add("Category", ArrayType(StringType()), True),
        True,
    )
    .add(
        "Tags",
        StructType().add("Tag", ArrayType(StringType()), True),
        True,
    )
    .add(
        "Specifications",
        StructType()
        .add("Width", StringType(), True)
        .add("Height", StringType(), True)
        .add("Depth", StringType(), True)
        .add(
            "Weight",
            StructType()
            .add("@unit", StringType(), True)
            .add("_VALUE", StringType(), True),
            True,
        )
        .add("Ports", StringType(), True)
        .add(
            "SupportedOS",
            StructType().add("OS", ArrayType(StringType()), True),
            True,
        ),
        True,
    )
    .add(
        "Manufacturer",
        StructType()
        .add("@name", StringType(), True)
        .add("@country", StringType(), True),
        True,
    )
)

In [24]:
# Load the XML document with the explicit schema instead of schema inference:
# one row per <Product> element, with attributes exposed under an "@" prefix
# to match the "@"-prefixed field names declared in product_schema.
df = (
    spark.read
    .format("xml")
    .options(rowTag="Product", attributePrefix="@")
    .schema(product_schema)
    .load(abfs_path)
)

In [25]:
# Now work with it
df.printSchema()
df.show(truncate=False) # FAIL: Input path does not exist: abfss://azwwwnonproddevadapsyn01@azwwwnonproddevadapadls.dfs.core.windows.net/tmp/tmp_wbr6opt.xml
# Why fail when we call show() rather than fail when we read it above? Bizarre.
# NOTE(review): likely explanation - Spark evaluates lazily, and because an
# explicit schema is supplied the reader has no need to open the file when the
# DataFrame is defined; the path is only touched once an action such as show()
# runs. Confirm against the XML reader in use.
# NOTE(review): the path in the error (container azwwwnonproddevadapsyn01,
# /tmp/tmp_wbr6opt.xml) is not the `abfs_path` assigned in this notebook, which
# suggests `df` was built from a different path in an earlier kernel state -
# re-run from a fresh kernel to confirm.


# Another simpler XML test - just to test the array(struct) vs struct.

In [9]:
# Reduced sample for the array-vs-single-element case only: P1001 has two
# <Category> children while P1002 has one.
# Note: this rebinds `xml_doc`, replacing the larger document defined earlier.
xml_doc = """<Products>
    <Product id="P1001" status="active">
        <Name>Wireless Mouse</Name>
        <Price currency="USD">29.99</Price>
        <Categories>
            <Category>Electronics</Category>
            <Category>Computer Accessories</Category>
        </Categories>
    </Product>
    <Product id="P1002" status="inactive">
        <Name>USB-C Hub</Name>
        <Price currency="USD">49.95</Price>
        <Categories>
            <Category>Computer Accessories</Category>
        </Categories>
    </Product>
</Products>
"""

In [10]:
# Write the reduced sample XML to its own temp file in the lake.
# `mssparkutils` is already imported in the notebook's first (imports) cell,
# so it is not re-imported here.
# Note: this rebinds `abfs_path` to the second test file.
abfs_path = "abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/temp/peter_test2.xml"

# Write the XML string to the ABFS path; overwrite=True makes the cell safe to re-run
mssparkutils.fs.put(abfs_path, xml_doc, overwrite=True)

In [11]:
# Define the schema for the reduced sample.
# Category is declared as an array of strings so that a Product with a single
# <Category> and one with several share the same column type.
# Note: this rebinds `product_schema`, replacing the larger schema defined earlier.
from pyspark.sql.types import *

product_schema = (
    StructType()
    .add("@id", StringType(), True)
    .add("@status", StringType(), True)
    .add("Name", StringType(), True)
    .add(
        "Price",
        StructType()
        .add("@currency", StringType(), True)
        .add("_VALUE", StringType(), True),
        True,
    )
    .add(
        "Categories",
        StructType().add("Category", ArrayType(StringType()), True),
        True,
    )
)

In [12]:
# Load the reduced sample with the explicit schema: one row per <Product>
# element, attributes exposed under the "@" prefix used in product_schema.
df_simple_products = (
    spark.read
    .format("xml")
    .options(rowTag="Product", attributePrefix="@")
    .schema(product_schema)
    .load(abfs_path)
)



In [13]:
# Inspect the result: the schema should echo product_schema, and show() is the
# first action on this frame, so it is where the file is actually read.
# truncate=False prints full cell contents (e.g. the whole Category array).
df_simple_products.printSchema()
df_simple_products.show(truncate=False)

# REMOVE THIS CODE FROM HERE DOWN

In [27]:
# Look at Tylers bronze AccountList
# NOTE(review): this URI contains two '@' separators
# ("bronze@azwwwnonproddevadapsyn01@azwwwnonproddevadapadls..."), whereas the
# other abfss paths in this notebook have the form <container>@<account>.dfs.core.windows.net.
# Confirm which container is intended (bronze or azwwwnonproddevadapsyn01) and
# correct the path accordingly.
abfss_path = 'abfss://bronze@azwwwnonproddevadapsyn01@azwwwnonproddevadapadls.dfs.core.windows.net/GL/AccountList'
# Load the Delta table stored at that location.
df_account_list = spark.read.format("delta").load(abfss_path)

In [30]:
from pyspark.sql.functions import col

# Regex for valid integers: the whole string must be one or more digits 0-9
valid_int = "^[0-9]+$"

# Good rows would be those where both columns match (kept for reference, not run):
# df_valid = df_account_list.filter(
#     col("CorporateSortOrder").rlike(valid_int) &
#     col("RetailSortOrder").rlike(valid_int)
# )

# Bad rows: at least one of the two columns fails the pattern.
# Expressed as NOT(both valid), which by De Morgan is the same predicate as
# (NOT corporate valid) OR (NOT retail valid).
# NOTE(review): rlike on a NULL value yields NULL rather than false, so a row
# with NULL in one column and a valid value in the other evaluates to NULL and
# is not returned here - confirm whether NULLs should count as invalid.
df_invalid = df_account_list.where(
    ~(
        col("CorporateSortOrder").rlike(valid_int)
        & col("RetailSortOrder").rlike(valid_int)
    )
)

# Show the offending values alongside the row key, untruncated
df_invalid.select("Key", "CorporateSortOrder", "RetailSortOrder").show(truncate=False)