Skip to content

Commit

Permalink
[DOP-13846] - add bypass test and note for rootTag
Browse files Browse the repository at this point in the history
  • Loading branch information
maxim-lixakov committed May 2, 2024
1 parent 89f691f commit c626067
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 4 deletions.
5 changes: 5 additions & 0 deletions onetl/file/format/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,11 @@ def parse_column(self, column: str | Column, schema: StructType) -> Column:
This method assumes that the ``spark-xml`` package is installed and properly configured within your Spark environment.
.. note::
This method does not support XML strings that are wrapped in a root tag different from the ``rowTag``. If your XML data has a root tag that encloses multiple row tags, preprocess the XML string to remove the root tag before parsing.
Parameters
----------
column : str | Column
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ def test_xml_parse_column(
file_df_dataframe,
file_df_schema,
):
from pyspark.sql.functions import col, expr
from pyspark.sql.types import StructType

from onetl.file.format import XML

spark_version = get_spark_version(spark)
Expand All @@ -188,8 +191,11 @@ def test_xml_parse_column(
xml_data = file.read()

df = spark.createDataFrame([(xml_data,)], ["xml_string"])
df.show(truncate=False)
xml = XML.parse({"rowTag": "item", "rootTag": "root"})
parsed_df = df.select(xml.parse_column("xml_string", schema=file_df_schema))
# remove the <root> tag from the XML string
df = df.withColumn("xml_string", expr("regexp_replace(xml_string, '^<root>|</root>$', '')"))

parsed_df.show(truncate=False)
xml = XML(row_tag="item")
parsed_df = df.select(xml.parse_column("xml_string", schema=file_df_schema))
assert isinstance(parsed_df.schema, StructType)
# check that at least one row has a non-null parsed value
assert parsed_df.filter(col("xml_string").isNotNull()).count() > 0

0 comments on commit c626067

Please sign in to comment.