# Post Processing
This notebook applies the final clean-up to the columns of the gold GA4 session tables. For example, it upper-cases `BrandCountryKey` and replaces hyphens with spaces.

In [1]:
%run /utils/common_functions

In [2]:
# Derive the storage account name from the ADLS URI: take the part after the
# first '@', then everything before its first '.'.
adls_host = raw_adls_path.split('@')[1]
account_name = adls_host.split('.')[0]

# Location of Sweaty Betty Data

In [3]:
# Container and folder that hold the Sweaty Betty GA4 sessions Delta table.
gold_container = 'gold'
gold_sb_sessions_folder = 'GA4/Sessions_SweatyBetty'
# Full path in the form abfss://<container>@<account>.dfs.core.windows.net/<folder>.
gold_delta_table_path_sb = f"abfss://{gold_container}@{account_name}.dfs.core.windows.net/{gold_sb_sessions_folder}"
# Echo the resolved path so it can be checked before anything reads or overwrites it.
print(gold_delta_table_path_sb)

# Location of Wolverine Data

In [4]:

# Container and folder that hold the Wolverine GA4 sessions summary (API) Delta table.
gold_container = 'gold'
gold_wolverine_sessions_folder_API = 'GA4/Sessions_wolverine_summary_API_Final'
# Full path in the form abfss://<container>@<account>.dfs.core.windows.net/<folder>.
gold_delta_table_path_wolverine_API = f"abfss://{gold_container}@{account_name}.dfs.core.windows.net/{gold_wolverine_sessions_folder_API}"
# Echo the resolved path so it can be checked before anything reads or overwrites it.
print(gold_delta_table_path_wolverine_API)

# Read Sweaty Betty Data

In [5]:
# Load the Sweaty Betty Delta table into a Spark DataFrame.
df_sb = spark.read.load(gold_delta_table_path_sb, format="delta")

# Render the DataFrame as a table (Synapse rich display).
display(df_sb)


In [6]:
# Print the column names and types of the Sweaty Betty DataFrame.
df_sb.printSchema()

# Read Wolverine Data

In [7]:
# Load the Wolverine Delta table into a Spark DataFrame.
df_WWW = spark.read.load(gold_delta_table_path_wolverine_API, format="delta")

# Table rendering is left disabled for this DataFrame.
# display(df_WWW)


In [8]:
# Print the column names and types of the Wolverine DataFrame.
df_WWW.printSchema()

# Removing Duplicates from Wolverine Data

In [9]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Collapse rows that are identical in every column before ranking.
www_unique = df_WWW.dropDuplicates()

# One window per (BrandCountryKey, device_type, calday), highest `sessions` first.
# NOTE(review): rows that tie on `sessions` inside a group are ordered arbitrarily
# by Spark, so which of them receives row number 1 is not deterministic. Add a
# tie-breaker column to the orderBy if that matters.
window_spec = (
    Window.partitionBy("BrandCountryKey", "device_type", "calday")
    .orderBy(F.desc("sessions"))
)

# Number the rows inside each group (1 = most sessions).
df_ranked = www_unique.withColumn("rank", F.row_number().over(window_spec))

# Keep exactly one row per group. The result is therefore already unique on the
# three key columns, so no further de-duplication is needed at this point.
# No sort is applied either: the result is not displayed here, and the later
# dropDuplicates() / Delta write would not preserve row order anyway.
www_best = df_ranked.filter(F.col("rank") == 1).drop("rank")



# Replace CHACOS with CHACO

In [10]:
from pyspark.sql import functions as F

# Rewrite keys that begin with "CHACOS" so they begin with "CHACO" instead.
# The pattern is anchored at the start of the value and is case-sensitive.
chaco_key = F.regexp_replace(F.col("BrandCountryKey"), r"^CHACOS", "CHACO")

# Rows that become identical after the rename are collapsed into one.
www_best = www_best.withColumn("BrandCountryKey", chaco_key).dropDuplicates()


# Define function to upper-case keys and remove hyphens

Transforms the `BrandCountryKey` column and saves the DataFrame as a Delta table.

Transformations:
- Upper-case all values in `BrandCountryKey`
- Replace hyphens with spaces in `BrandCountryKey`
- Apply brand-specific renames (for example `SWEATYBETTY` becomes `SWEATY BETTY`)

Parameters:
- `df`: Input Spark DataFrame
- `output_path`: Output path where the transformed DataFrame should be saved as Delta
    

In [11]:
# Earlier version of transform_and_save, which added the replacements for
# "WWW META SHOPPING" (-> "Meta US") and "WOLVERINE4WORK" (-> "Hytest-B2B2C US").
# The whole definition is wrapped in a triple-quoted string, so nothing below is
# defined or executed; the cell merely evaluates to that string. The active
# definition of transform_and_save is in the following cell.
'''
from pyspark.sql.functions import upper, regexp_replace, when, col

def transform_and_save(df, output_path):
    # Apply initial uppercase and replace hyphens with spaces
    transformed_df = df.withColumn(
        "BrandCountryKey",
        upper(regexp_replace("BrandCountryKey", "-", " "))
    )

    # Apply specific replacements
    transformed_df = transformed_df.withColumn(
        "BrandCountryKey",
        when(col("BrandCountryKey").contains("SWEATYBETTY"),
             regexp_replace(col("BrandCountryKey"), "SWEATYBETTY", "SWEATY BETTY"))
        .when(col("BrandCountryKey") == "WWW META SHOPPING", "Meta US")
        .when(col("BrandCountryKey") == "WOLVERINE4WORK", "Hytest-B2B2C US")
        .otherwise(col("BrandCountryKey"))
    )

    # Overwrite and save as Delta
    transformed_df.write.format("delta").mode("overwrite").save(output_path)

    print(f"Data successfully written to {output_path}")
'''

In [12]:
# Additional replacements: "MERRELL EMEA EMERGING" becomes "MERRELL IE", and "SAUCONY EMEA EMERGING" becomes "SAUCONY IE"

from pyspark.sql.functions import upper, regexp_replace, when, col

def transform_and_save(df, output_path):
    """Normalize ``BrandCountryKey`` and overwrite the Delta table at ``output_path``.

    The key column is rewritten in two passes:

    1. Hyphens are replaced with spaces and the value is upper-cased.
    2. Brand-specific fixes are applied, first match wins:
       - values containing ``SWEATYBETTY`` get it replaced by ``SWEATY BETTY``
       - ``WWW META SHOPPING``     -> ``Meta US``
       - ``WOLVERINE4WORK``        -> ``Hytest-B2B2C US``
       - ``MERRELL EMEA EMERGING`` -> ``MERRELL IE``
       - ``SAUCONY EMEA EMERGING`` -> ``SAUCONY IE``
       Any other value is kept as produced by pass 1.

    NOTE(review): ``Meta US`` and ``Hytest-B2B2C US`` are mixed-case / hyphenated,
    so feeding already-transformed data through this function again would turn
    them into ``META US`` and ``HYTEST B2B2C US``. Confirm the input is always
    freshly produced upstream.

    Parameters:
    - df: Spark DataFrame that has a ``BrandCountryKey`` column.
    - output_path: Location the result is written to in Delta format, using
      overwrite mode.

    Returns:
    - None. The function writes the data and prints a confirmation line.
    """
    key_name = "BrandCountryKey"

    # Pass 1: hyphens become spaces, then the whole key is upper-cased.
    normalized = df.withColumn(key_name, upper(regexp_replace(key_name, "-", " ")))

    # Pass 2: build the conditional expression, starting with the substring fix
    # and then chaining the exact-match renames in their original order.
    key = col(key_name)
    remapped = when(
        key.contains("SWEATYBETTY"),
        regexp_replace(key, "SWEATYBETTY", "SWEATY BETTY"),
    )
    exact_renames = (
        ("WWW META SHOPPING", "Meta US"),
        ("WOLVERINE4WORK", "Hytest-B2B2C US"),
        ("MERRELL EMEA EMERGING", "MERRELL IE"),
        ("SAUCONY EMEA EMERGING", "SAUCONY IE"),
    )
    for old_key, new_key in exact_renames:
        remapped = remapped.when(key == old_key, new_key)
    remapped = remapped.otherwise(key)

    result = normalized.withColumn(key_name, remapped)

    # Replace whatever is currently stored at output_path with the new data.
    result.write.format("delta").mode("overwrite").save(output_path)

    print(f"Data successfully written to {output_path}")


# Overwrite SB data

In [13]:
# Show the Sweaty Betty target path once more before it is overwritten.
print(gold_delta_table_path_sb)

In [14]:
# Normalize BrandCountryKey in the Sweaty Betty data and overwrite the Delta
# table at the same path the data was loaded from.
transform_and_save(df_sb, gold_delta_table_path_sb)

# Overwrite Wolverine Data

In [15]:
# Show the Wolverine target path once more before it is overwritten.
print(gold_delta_table_path_wolverine_API)

In [16]:
# Normalize BrandCountryKey in the de-duplicated Wolverine data and overwrite the
# Delta table at the same path the data was loaded from.
# NOTE(review): dropDuplicates() runs before transform_and_save rewrites the key,
# so rows that only become identical after that rewrite are not removed. Confirm
# whether a de-duplication after the transformation is needed.
transform_and_save(www_best.dropDuplicates(), gold_delta_table_path_wolverine_API)