In [None]:
# Akeneo pipeline: load the daily full export from RAW and publish it to Bronze, Silver (history) and Gold
from datetime import *
from pyspark.sql.functions import *
import delta #from delta import *
import requests, json
import azure.storage.blob #from azure.storage.blob import BlobServiceClient
import notebookutils
from pyspark.sql import Window, DataFrame
from pyspark.sql.types import * 

In [None]:
%run /utils/common_functions

In [None]:
# Added 2025-09-11 KETTNECH to resolve production date issue.
# Both Parquet write-side rebase settings (date/timestamp and INT96) are set to CORRECTED.
for rebase_setting in (
    "spark.sql.parquet.datetimeRebaseModeInWrite",
    "spark.sql.parquet.int96RebaseModeInWrite",
):
    spark.conf.set(rebase_setting, "CORRECTED")

#### Get the current files from RAW container

In [None]:
# Locate the Akeneo full-export file(s) for the previous calendar day in RAW/Akeneo.
# Defines `previous_day`, `prefix` and `daily_files`, which later cells read.
#prefix = f"PIM_AkeneoExport_Barcode-UPC_2025-06-17"
previous_day = date.today() - timedelta(days=1)
prefix = f"PIM_Akeneo_Full_Export_Limited_Fields_{previous_day.strftime('%Y-%m-%d')}"

files = mssparkutils.fs.ls(f"{raw_adls_path}Akeneo/")
daily_files = [file for file in files if file.name.startswith(prefix)]

if not daily_files:
    # Nothing to process: stop the notebook in every environment. Previously an environment
    # that matched none of prod/prod_backup/dev/test fell through and failed later on the CSV read.
    if env_var in (env_dict['prod'], env_dict['prod_backup']):
        # Warn there might be an issue:
        print("No files found to move. Ending the job and sending notification.")
        # SECURITY(review): this URL embeds a SAS signature (`sig=`). It should be read from a
        # secret store instead of being committed in the notebook, and the signature rotated.
        response = requests.post(
            'https://prod-85.eastus.logic.azure.com:443/workflows/7367758ef3da4d76b4e64670220d6135/triggers/manual/paths/invoke?api-version=2016-10-01&sp=%2Ftriggers%2Fmanual%2Frun&sv=1.0&sig=3qXzAxaCqincRFQ358oeDwq_SGn5vgzaNgW26QxUDMs',
            '{"email_to": "ededl@wwgroups.net", "email_subject": "No Akeneo Data Available to Process", "email_body": "Did not find any file to process for daily Akeneo data in RAW/Akeneo.", "email_from": "AzureSynapse@wwwinc.com"}',
            headers={"Content-Type": "application/json"},
            # Without a timeout a stalled endpoint would hang the job indefinitely.
            timeout=30,
        )
        if not response.ok:
            # Surface a failed notification in the run log instead of ignoring it silently.
            print(f"Notification request failed with HTTP status {response.status_code}.")
    else:
        # For Dev and Test environments
        print("No files found in Dev/Test environment. Skipping file processing.")
    # Exit the notebook when no files are available
    notebookutils.mssparkutils.notebook.exit(0)

# Proceed if there are files
for file in daily_files:
    print(f"Processing File: {file.name}")

### Process Akeneo CSV files from RAW to BRONZE (OVERWRITE only)

In [None]:
# Read the latest RAW export(s) matching `prefix` and overwrite the Bronze delta table (full export).
# The files are ';'-separated, double-quoted, may contain multi-line values, and have a header row.
# NOTE(review): with schema inference, a purely numeric Barcode column would presumably be read
# as a number and lose leading zeros - confirm against a sample file.
akeneo_source_glob = f"{raw_adls_path}Akeneo/{prefix}*.csv"
akeneo_raw_df = (
    spark.read
    .option("header", "true")
    .option("quote", "\"")
    .option("multiLine", "true")
    .option("inferSchema", "true")
    .option("sep", ";")
    .csv(akeneo_source_glob)
    # Keep track of which file each row was read from, then drop exact duplicate rows.
    .withColumn("source_file_path", input_file_name())
    .distinct()
)

# To accommodate this logic from Akeneo: CategoryName is coming from PLM and is technically the sub-category,
#       Category_Name is generated by a rule in Akeneo and used for display purposes in the product grid
#       to know which Akeneo main category they are assigned to.
#       Category_Name is assigned based on the category code from PLM.
# CategoryName is therefore renamed to Sub_Category_Name when it is present.
if "CategoryName" in akeneo_raw_df.columns:
    akeneo_raw_df = akeneo_raw_df.withColumnRenamed("CategoryName", "Sub_Category_Name")

(
    akeneo_raw_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f'{bronze_adls_path}Akeneo')
)

### Perform Melting using Transpose, Stack and Explode functions

In [None]:
def melt_data(df, new_col_name, cols_to_transpose):
    """Unpivot a set of columns of `df` into one row per (Barcode, source column).

    Args:
        df: DataFrame containing a "Barcode" column plus every column listed in
            `cols_to_transpose`.
        new_col_name: Name of the value column in the result.
        cols_to_transpose: Names of the columns of `df` to turn into rows.

    Returns:
        DataFrame with the columns "Barcode", "key" (the source column name),
        `new_col_name` (the value of that source column) and "Region".
        "Region" is "US", "GB" or "IE" when the source column name carries the
        matching "-en_<REGION>" locale token, and "SB" otherwise.
    """
    # Build one struct per source column: (column name, column value).
    # Uses the `df` argument; the previous version ignored it and read the global akeneo_raw_df.
    stacked_df = df.select(
        "Barcode",
        array(*[
            struct(lit(c).alias("key"), df[c].alias(new_col_name))
            for c in cols_to_transpose
        ]).alias("stacked_data"),
    )

    # Explode the array so that every struct becomes its own row.
    unpivoted_df = stacked_df.select("Barcode", explode(stacked_df["stacked_data"]).alias("data"))

    # Extract key and value from the struct.
    final_df = unpivoted_df.select("Barcode", "data.key", f"data.{new_col_name}")

    # Derive the region from the locale token rather than from a bare "US"/"GB"/"IE" substring,
    # which would also match unrelated upper-case letters elsewhere in a column name.
    key_column = final_df["key"]
    final_df = final_df.withColumn(
        "Region",
        when(key_column.contains("-en_US"), "US")
        .when(key_column.contains("-en_GB"), "GB")
        .when(key_column.contains("-en_IE"), "IE")
        .otherwise("SB"),
    )
    return final_df

In [None]:
# Collapse the per-locale columns ("...-en_<REGION>...") into one column per attribute,
# producing `normalized_df` with one row per (Barcode, Region).
# Loop variables are deliberately NOT called `col`, so the `col` function that comes from
# the pyspark.sql.functions wildcard import is not overwritten by a string.

# Get the Regional Columns [US, IE, GB, SB]
regional_columns = [column_name for column_name in akeneo_raw_df.columns if "-en_" in column_name]

# Dictionary that maps the new (region-less) column name to the list of its regional columns
merge_regional_columns = {}
for regional_column in regional_columns:
    new_name = (
        regional_column
        .replace("-en_GB", "")
        .replace("-en_IE", "")
        .replace("-en_US", "")
        .replace("-SB", "")
    )
    merge_regional_columns.setdefault(new_name, []).append(regional_column)

# Every regional column is dropped from the raw dataframe once it has been melted
cols_to_drop = list(regional_columns)

# Create a dataframe from the list of barcodes
stacked_df = akeneo_raw_df.select("Barcode").distinct()
# Initialize one row per barcode for each of the regions US, IE and GB
stacked_df = stacked_df.withColumn("Region", explode(lit(['US', 'IE', 'GB'])))

for key, cols_to_transpose in merge_regional_columns.items():
    # Melt the columns into rows
    new_df = melt_data(akeneo_raw_df.select("Barcode", *cols_to_transpose), key, cols_to_transpose)
    # Left-join the non-null melted values onto the barcode/region grid
    stacked_df = stacked_df.join(
        new_df.select("Barcode", "Region", key).dropna().distinct(),
        ["Barcode", "Region"],
        "left",
    )

# Drop barcode/region rows for which none of the melted columns has a value
normalized_df = stacked_df.dropna(how="all", subset=list(merge_regional_columns.keys()))
#normalized_df.orderBy("BarCode", "Region").show(100, truncate=False)
#normalized_df.orderBy("BarCode", "Region").show(100, truncate=False)

In [None]:
# Remove the regional columns from the raw data, then attach the per-region rows:
# the inner join on Barcode keeps only barcodes present in normalized_df.
akeneo_filtered_df = akeneo_raw_df.drop(*cols_to_drop)
akeneo_processed_df = akeneo_filtered_df.join(
    normalized_df,
    on="Barcode",
    how="inner",
)

In [None]:
# Normalise types and names of the merged data, derive the online flags and compute the row hash.
# TODO: Normalized data type of all columns before storing to silver layer

# Re-import `col` explicitly: a loop variable named `col` in another cell can rebind the name
# that the wildcard import from pyspark.sql.functions provided.
from pyspark.sql.functions import col

# Convert date and timestamp columns.
# The chain is parenthesised (no backslash continuations) so steps can be commented out safely.
normalized_akeneo_df = (
    akeneo_processed_df
    .withColumn("created", col("created").cast(TimestampType()))
    # Fixed: "updated" was previously populated from the "created" column.
    .withColumn("updated", col("updated").cast(TimestampType()))
    .withColumn("Barcode", col("Barcode").cast(StringType()))
    .withColumn("Coming_Soon_Ecommerce", col("Coming_Soon_Ecommerce").cast(StringType()))
    .withColumn("John_Lewis_Exclusive", col("John_Lewis_Exclusive").cast(StringType()))
    .withColumn("Latest_Launch_Date_Ecommerce", to_date(col("Latest_Launch_Date_Ecommerce"), "MM/dd/yyyy"))
    .withColumn("Online_From_Barcode_Ecommerce", to_date(col("Online_From_Barcode_Ecommerce"), "MM/dd/yyyy"))
    .withColumn("Online_To_Barcode_Ecommerce", to_date(col("Online_To_Barcode_Ecommerce"), "MM/dd/yyyy"))
    .withColumn("Online_From_Ecommerce", to_date(col("Online_From_Ecommerce"), "MM/dd/yyyy"))
    .withColumn("Online_To_Ecommerce", to_date(col("Online_To_Ecommerce"), "MM/dd/yyyy"))
    # Casts that are not enabled yet:
    ##    .withColumn("Web_Base_Price-USD", col("Web_Base_Price-USD").cast(DecimalType()))
    ##    .withColumn("Web_Base_Price-GBP", col("Web_Base_Price-GBP").cast(DecimalType()))
    ##    .withColumn("Web_Base_Price-EUR", col("Web_Base_Price-EUR").cast(DecimalType()))
    ##    .withColumn("Web_Sale_Price-USD", col("Web_Sale_Price-USD").cast(DecimalType()))
    ##    .withColumn("Web_Sale_Price-GBP", col("Web_Sale_Price-GBP").cast(DecimalType()))
    ##    .withColumn("Web_Sale_Price-EUR", col("Web_Sale_Price-EUR").cast(DecimalType()))
    ##    .withColumn("Weight", col("Weight").cast(DoubleType()))
    ##    .withColumn("Available_Flag_Ecommerce", col("Available_Flag_Ecommerce").cast(IntegerType()))
)

# Apply PascalCase naming convention: "-" and "_" act as word separators and are removed.
# The loop variable is not called `col`, so the imported function stays usable below.
columns_list = normalized_akeneo_df.columns
akeneo_silver_naming = {
    column_name: column_name.replace("-", " ").replace("_", " ").title().replace(" ", "")
    for column_name in columns_list
}

normalized_akeneo_df = normalized_akeneo_df.withColumnsRenamed(akeneo_silver_naming).distinct()

# Calculate Derived Columns
# IS_ONLINE_IN_DATE - OnlineFlagEcommerce = 1 and the current date lies between OnlineFromEcommerce and OnlineToEcommerce
# IS_ONLINE_IN_DATE_BARCODE - OnlineFlagBarcode = 1 and the current date lies between OnlineFromBarcodeEcommerce and OnlineToBarcodeEcommerce
current_day = current_date()
normalized_akeneo_df = normalized_akeneo_df.withColumn(
    "IsOnlineInDate",
    when(
        (trim(col("OnlineFlagEcommerce")) == '1')
        & current_day.between(col("OnlineFromEcommerce"), col("OnlineToEcommerce")),
        1,
    ).otherwise(0),
)
normalized_akeneo_df = normalized_akeneo_df.withColumn(
    "IsOnlineInDateBarcode",
    when(
        (trim(col("OnlineFlagBarcode")) == '1')
        & current_day.between(col("OnlineFromBarcodeEcommerce"), col("OnlineToBarcodeEcommerce")),
        1,
    ).otherwise(0),
)

# Row hash over every column present at this point, joined with "||".
# NOTE(review): an earlier list that excluded "Created" was built and then overwritten by this
# assignment, so it never took effect and has been removed. The hash therefore still covers
# "Created" and the two date-dependent flags above - confirm whether that is intended.
hashdiff_cols = normalized_akeneo_df.columns
normalized_akeneo_df = normalized_akeneo_df.withColumn("ProductHashDiff", md5(concat_ws("||", *hashdiff_cols)))

### Append Historical data to Silver Delta

In [None]:
# Silver layer: keep a historical delta table with one snapshot per month.
# Every row is stamped with the first day of the month that `previous_day` falls in.
firstOfMon = previous_day.replace(day=1)
normalized_akeneo_df = normalized_akeneo_df.withColumn("SnapshotDate", lit(firstOfMon))

silver_history_path = f'{silver_adls_path}AkeneoHistory'

if not delta.DeltaTable.isDeltaTable(spark, silver_history_path):
    # No history table yet: create it from the current data.
    (
        normalized_akeneo_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(silver_history_path)
    )
elif previous_day == firstOfMon:
    # Table exists: append only when the processed day is the first day of its month.
    print(f"First Day of Month: {firstOfMon}")
    (
        normalized_akeneo_df.write
        .format("delta")
        .mode("append")
        .option("mergeSchema", "true")
        .save(silver_history_path)
    )

### Overwrite Gold Delta (Latest record only)

In [None]:
# Gold holds the latest export only: data and schema are replaced on every run.
(
    normalized_akeneo_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f'{gold_adls_path}Akeneo')
)

In [None]:
# End the notebook run here with exit value 0.
# NOTE(review): the cells after this point look like manual validation / one-off maintenance
# steps that are not meant to execute as part of the regular run - confirm before moving this exit.
notebookutils.mssparkutils.notebook.exit(0)

## Data Validation

In [None]:
# Print the column names and types of the final DataFrame for manual inspection.
normalized_akeneo_df.printSchema()

In [None]:
# Expose the Gold and Bronze Akeneo delta tables as temp views for ad-hoc SQL checks.
for delta_path, view_name in (
    (f'{gold_adls_path}Akeneo', "vwAkeneoGold"),
    (f'{bronze_adls_path}Akeneo', "vwAkeneoBronze"),
):
    spark.read.format("delta").load(delta_path).createOrReplaceTempView(view_name)

In [None]:
def manual_delta_column_renamed(column_mapping, delta_table_path):
    """Rename columns of a Delta table in place by rewriting the table.

    Args:
        column_mapping: Dict of {current column name: new column name}. Entries whose
            current name is not a column of the table are ignored. Entries are applied
            in dict order, so a later entry can see a name produced by an earlier one.
        delta_table_path: Path of the Delta table to read and overwrite.

    Returns:
        None. The table is overwritten (including its schema) only when at least one
        column was actually renamed; otherwise it is left untouched.
    """
    df = spark.read.format("delta").load(delta_table_path)

    # Rename the columns that exist, remembering whether anything changed.
    renamed_any = False
    for old_name, new_name in column_mapping.items():
        if old_name in df.columns:  # Check if the column exists
            df = df.withColumnRenamed(old_name, new_name)
            renamed_any = True

    if not renamed_any:
        # Nothing matched: skip the full-table rewrite, which would only create a new
        # table version holding identical data.
        return

    df.write.format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .save(delta_table_path)

In [None]:
# One-off maintenance: rename selected columns of the Bronze Akeneo table.
# Pairs are (current column name, new column name); add more pairs as needed.
column_rename_map = dict([
    ("Country_Of_Origin-en_GB-SB_Ecommerce", "Country_Of_Origin-en_GB"),
    ("Country_Of_Origin-en_IE-SB_Ecommerce", "Country_Of_Origin-en_IE"),
    ("Country_Of_Origin-en_US-SB_Ecommerce", "Country_Of_Origin-en_US"),
    ("HTS_Code", "hsCode"),
    ("Unit_of_Measurement", "unitMeasure"),
    ("Weight-unit", "weight_uom"),
])

bronze_akeneo_path = f'{bronze_adls_path}Akeneo'
manual_delta_column_renamed(column_rename_map, bronze_akeneo_path)

print(f"Columns {column_rename_map} in Delta table at '{bronze_adls_path}Akeneo' has been ranamed.")

In [None]:
# One-off maintenance: rename selected columns of the Gold Akeneo table.
# Pairs are (current column name, new column name); add more pairs as needed.
column_rename_map = dict([
    ("CountryOfOriginEcommerce", "CountryOfOrigin"),
    ("HtsCode", "Hscode"),
    ("UnitOfMeasurement", "Unitmeasure"),
    ("WeightUnit", "WeightUom"),
])

gold_akeneo_path = f'{gold_adls_path}Akeneo'
manual_delta_column_renamed(column_rename_map, gold_akeneo_path)

print(f"Columns {column_rename_map} in Delta table at '{gold_adls_path}Akeneo' has been ranamed.")


In [None]:
# Manually add new, NULL-filled string columns to the Gold Akeneo delta table.
# withColumn replaces a column that already exists, so re-running the previous version of this
# cell would have overwritten populated values with NULLs. Columns are now added only when missing.
df = spark.read.format("delta").load(f'{gold_adls_path}Akeneo')

# Compare names case-insensitively.
# NOTE(review): assumes Spark's default case-insensitive column resolution
# (spark.sql.caseSensitive not enabled) - confirm for this workspace.
existing_columns = {column_name.lower() for column_name in df.columns}

new_df = df
for new_column in ("Badgecode", "ProductMedia"):
    if new_column.lower() not in existing_columns:
        new_df = new_df.withColumn(new_column, lit(None).cast(StringType()))

if len(new_df.columns) != len(df.columns):
    # Rewrite the table only when at least one column was added.
    new_df.write.format("delta") \
        .mode("overwrite") \
        .option("mergeSchema", "true") \
        .save(f'{gold_adls_path}Akeneo')

### Convert Silver table data types

In [None]:
# One-off maintenance: rewrite the Silver history table with typed columns.
df = spark.read.format("delta").load(f'{silver_adls_path}AkeneoHistory')

# Parse the launch date from "yyyy-MM-dd" text.
df2 = df.withColumn("OriginalLaunchDateSbEcommerce", to_date("OriginalLaunchDateSbEcommerce", "yyyy-MM-dd"))

# Cast the numeric / flag columns; each cast reads the column from the loaded table.
for column_name, target_type in (
    ("Weight", "double"),
    ("OnlineFlagEcommerce", "int"),
    ("OnlineFlagBarcode", "int"),
    ("DroppedEcommerce", "int"),
    ("SearchableEcommerce", "int"),
):
    df2 = df2.withColumn(column_name, df[column_name].cast(target_type))

(
    df2.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(f'{silver_adls_path}AkeneoHistory')
)
#df2.printSchema()