In [0]:
import sys

# Directory added to the import search path; the later `from core.data_processor import ...`
# presumably resolves from here (verify against the repo layout).
# NOTE(review): this is a user-specific absolute workspace path; consider deriving it
# from configuration so the notebook runs for other users.
REPO_ROOT = "/Workspace/Repos/js-julian.salinas@outlook.com/aquiles-etl-pipeline"

# Guard the append so re-running this cell does not stack duplicate entries on sys.path.
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook; getOrCreate() hands back an already
# running session when one exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("DateTransformation")
    .getOrCreate()
)

In [0]:
# Load the source table into a DataFrame and render it for a quick visual check.
products_table = "workspace.default.products_from_csv"
df = spark.read.table(products_table)
df.display()

In [0]:
from pyspark.sql.functions import udf, col, initcap, when
from pyspark.sql.types import DecimalType, StringType, StructField, StructType
from core.data_processor import (
    extract_measure_and_unit,
    infer_and_transform_date,
    remove_special_characters,
    separate_camel_case,
    transform_price,
    transform_provider_name,
)


# Precision/scale of the cleaned price column. A bare DecimalType() means
# DecimalType(10, 0), which has no fractional digits and so cannot hold cents.
# NOTE(review): scale 2 assumes two-decimal prices; widen it if the data carries more.
PRICE_PRECISION = 18
PRICE_SCALE = 2

# Return type declared for the extract_measure_and_unit UDF: three nullable string fields.
measure_unit_schema = StructType([
    StructField("Measure", StringType(), True),
    StructField("UnitOfMeasure", StringType(), True),
    StructField("PackageUnits", StringType(), True)
])

# Wrap the core.data_processor helpers as Spark UDFs with explicit return types.
infer_and_transform_date_udf = udf(infer_and_transform_date, StringType())
# NOTE(review): assumes transform_price returns decimal.Decimal (or None) - TODO confirm.
transform_price_udf = udf(transform_price, DecimalType(PRICE_PRECISION, PRICE_SCALE))
remove_special_characters_udf = udf(remove_special_characters, StringType())
# Registered here but not applied in this cell.
separate_camel_case_udf = udf(separate_camel_case, StringType())
transform_provider_name_udf = udf(transform_provider_name, StringType())
# Registered here but not applied in this cell.
extract_measure_and_unit_udf = udf(extract_measure_and_unit, measure_unit_schema)

# Keep each raw value under a Raw* column next to its cleaned Clean* counterpart.
df = (
    df.withColumn("RawPrice", col("Price"))
      .withColumn("CleanPrice", transform_price_udf(col("Price")))
      .withColumn("RawLastReviewDt", col("LastReviewDt"))
      .withColumn("CleanLastReviewDt", infer_and_transform_date_udf(col("LastReviewDt")))
      .withColumn("RawDescription", col("Description"))
      .withColumn("CleanDescription", remove_special_characters_udf(col("Description")))
      # initcap is applied on top of the UDF result.
      .withColumn("CleanProviderName", initcap(transform_provider_name_udf(col("ProviderName"))))
      # True whenever the raw Price is non-null.
      # NOTE(review): a non-null raw value that fails to parse is still flagged valid;
      # confirm whether this should test CleanPrice instead.
      .withColumn("IsValidPrice", col("Price").isNotNull())
)

df.display()

In [0]:
from pyspark.sql.functions import lower

# Run the measure/unit UDF on Description, then surface each field of the resulting
# struct as its own top-level column; UnitOfMeasure is lower-cased as it is extracted.
measure_struct = extract_measure_and_unit_udf(col("Description"))

df = (
    df.withColumn("MeasureUnit", measure_struct)
      .withColumn("Measure", col("MeasureUnit.Measure"))
      .withColumn("UnitOfMeasure", lower(col("MeasureUnit.UnitOfMeasure")))
      .withColumn("PackageUnits", col("MeasureUnit.PackageUnits"))
)

df.display()