In [1]:
# Read new EMWBIS (Email Me When Back In Stock) data via API, store in raw, process as Delta tables
from delta import *
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType, LongType, BinaryType
import datetime
from datetime import timedelta
import requests, json
from azure.storage.blob import BlobServiceClient
import base64
import delta

In [8]:
%run /utils/common_functions

### Get the Auth Bearer token

In [3]:
# Obtain the auth token using Client Credentials and OAuth 2
# https://developer.salesforce.com/docs/commerce/b2c-commerce/references/b2c-commerce-ocapi/oauth.html
# Making this a function since we would need to call it to get a new token if our orig token expires (3600s)

def get_access_token():
    """
    Obtain an OAuth 2 bearer token using the client-credentials grant.

    The client id and the client secret (read from Key Vault through
    ``mssparkutils``) are joined with a colon, base64-encoded and sent as an
    HTTP Basic ``Authorization`` header to the token endpoint.

    Returns:
        The ``access_token`` string taken from the endpoint's JSON response.

    Raises:
        Exception: if the secret lookup or the request fails, the endpoint
            answers with an error status, or the response has no
            ``access_token``. The underlying error is chained as the cause.
    """
    try:
        # Only one token endpoint for all environments
        token_url = 'https://account.demandware.com/dwsso/oauth2/access_token'
        # Client credentials grant: clientId acts as the username, clientSecret as the password.
        # NOTE(review): the client id is hard-coded here; consider reading it from Key Vault like the secret.
        username = '498a02fc-e957-4bfa-bbed-3c771d9a9c60'
        password = mssparkutils.credentials.getSecret(kv_name, 'emwbis-client-secret', 'ls_kv_adap')
        client_credentials = f"{username}:{password}"
        creds_base64 = base64.b64encode(client_credentials.encode("utf-8")).decode("utf-8")

        headers = {
            'Host': 'account.demandware.com',
            'Authorization': f"Basic {creds_base64}",
            'Content-Type': 'application/x-www-form-urlencoded'
        }

        token_body = {
            'grant_type': 'client_credentials'
        }

        # A timeout keeps a hung connection from stalling the notebook indefinitely.
        response = requests.post(url=token_url, headers=headers, data=token_body, timeout=60)
        # Surface 4xx/5xx answers as an HTTP error instead of a confusing KeyError below.
        response.raise_for_status()
        access_token = response.json()["access_token"]
        # Deliberately do not print any part of the token: notebook output is persisted.
        print('Acquired access token from', token_url)

        return access_token
    except Exception as e:
        print("error=",str(e))
        raise Exception("Could not obtain Bearer token.") from e

### Make the API Call

In [4]:
# Request-body template for the custom-objects search.
# "<SiteID>" and "<LastModified>" are placeholders; the API-call cell overwrites
# both "values" lists as well as "count" and "start" before every request.
body = {
    "query": {
        "bool_query": {
            "must": [
                # must[0]: restrict to a single site
                {"term_query": {"fields": ["site_id"], "operator": "is", "values": ["<SiteID>"]}},
                # must[1]: only objects modified after the given timestamp
                {"term_query": {"fields": ["last_modified"], "operator": "greater", "values": ["<LastModified>"]}},
            ]
        }
    },
    "count": 200,
    "start": 50,
    "select": "(**)",
}

In [5]:
# A helper function (from the orig source)
def format_emwbis_timestamp(dt: datetime.datetime) -> str:
    """
    Formats a datetime for use with the EMWBIS API.

    The result is ISO 8601 with millisecond precision and a trailing "Z",
    e.g. ``2025-02-12T00:00:00.000Z``.

    Args:
        dt: a datetime object. A naive value is formatted as-is (the caller is
            responsible for it being UTC). A timezone-aware value is converted
            to UTC first, so the "Z" suffix is never appended after an
            explicit offset such as "+01:00".

    Returns:
        formatted timestamp string
    """
    if dt.tzinfo is not None and dt.utcoffset() is not None:
        # Normalise to UTC and drop tzinfo so isoformat() emits no offset before the "Z".
        dt = dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return f"{dt.isoformat(timespec='milliseconds')}Z"

In [6]:
# Make API calls: page through the custom-objects search for every site and
# collect all hits into the `results` list.

# Get a bearer token
access_token = get_access_token()

headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# The API URL
if env_var == env_dict['dev']:
    api_url = 'https://development.sweatybetty.com/s/-/dw/data/v22_6/custom_objects_search/EmailMeWhenAvailable'
    # For prod testing in dev (comment out when done testing)
    # Note that one test had about 800 items total, so testing prod data in dev appears to be harmless.
    # api_url = 'https://www.sweatybetty.com/s/-/dw/data/v22_6/custom_objects_search/EmailMeWhenAvailable'
elif env_var in [env_dict['test'], env_dict['prod']]:
    api_url = 'https://www.sweatybetty.com/s/-/dw/data/v22_6/custom_objects_search/EmailMeWhenAvailable'
else:
    raise Exception('Failed to set API url.')

# Seconds to wait for the API before giving up, so a hung connection cannot stall the run.
request_timeout_seconds = 60
# Consecutive 401 answers tolerated for a single page before aborting; without a cap,
# credentials that keep being rejected would refresh the token forever.
max_token_refreshes = 2

# We will make paginated API calls (max 200 items at a time) for each siteID in our list of valid Site IDs. Valid site IDs are SB, SB-EU, SB-AU, SB-US
# This code handles pagination and the potential for an access token expiration.

# For now, the last modified timestamp will look back two days. May consider a watermark store design down the road.
# The formatted value carries a "Z" (UTC) suffix, so the look-back is computed from the UTC clock
# rather than the cluster's local time.
two_days_ago_midnight = (
    datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - datetime.timedelta(days=2)
).replace(hour=0, minute=0, second=0, microsecond=0)
# Format it as an ISO 8601 datetime string, e.g. '2025-02-12T00:00:00.000Z'
last_modifed = format_emwbis_timestamp(two_days_ago_midnight)

# We concatenate ALL sites results into a single list var, results
results = []

sites = ['SB', 'SB-EU', 'SB-AU', 'SB-US']
for site in sites:
    print(f"processing site: {site}")

    # Replace API Body Parameters
    # <SiteID> - element 0 of query.bool_query.must
    body["query"]["bool_query"]["must"][0]["term_query"]["values"] = [site]
    # <LastModified> - element 1 of query.bool_query.must (ISO 8601)
    body["query"]["bool_query"]["must"][1]["term_query"]["values"] = [last_modifed]
    # Pagination: start at 0 and advance by `count` after every full page.
    # https://developer.salesforce.com/docs/commerce/b2c-commerce/references/b2c-commerce-ocapi/pagination.html
    page_item_count = 200
    body["count"] = page_item_count
    page_start_value = 0
    body["start"] = page_start_value

    print(body)

    # Number of token refreshes spent on the page currently being requested.
    token_refreshes = 0

    # Now get chunks of 200 rows at a time.
    # Loop to get more pages until we have no more to process
    while True:
        print(f"Requesting next {page_item_count} items starting at index {page_start_value} from {api_url}")
        try:
            # The API call
            response = requests.post(
                url=api_url,
                data=json.dumps(body),
                headers=headers,
                timeout=request_timeout_seconds,
            )

            # Handle various response status_codes
            status_code = response.status_code
            print(f"response status_code: {status_code}")
            if status_code < 200:
                # 100s - request received, but no response
                print("status 100s")
                raise Exception(f"Unhandled response.status_code: {status_code}")
            elif status_code < 300:
                # 200s - request successfully processed; fall through to the page handling below.
                print("status 200s")
            elif status_code < 400:
                # 300s - resource is in a different location - redirect
                print("status 300s")
                raise Exception(f"Unhandled response.status_code: {status_code}")
            elif status_code < 500:
                # 400s - client error
                print("status 400s")
                # If it's a 401, get a new access token and request this same page again.
                if status_code == 401:
                    token_refreshes += 1
                    if token_refreshes > max_token_refreshes:
                        print(response.text)
                        raise Exception(
                            f"Still unauthorized (401) after {max_token_refreshes} token refreshes."
                        )
                    access_token = get_access_token()
                    headers = {
                        "Authorization": f"Bearer {access_token}",
                        "Content-Type": "application/json"
                    }
                    continue
                else:
                    print(response.text)
                    raise Exception(f"Unhandled client side error status code {status_code} received.")
            else:
                # 500s - server side error
                print("status 500s")
                print(response.text)
                raise Exception('500s Server Side error status code received')

            # The page was accepted, so the next page gets a fresh refresh budget.
            token_refreshes = 0

            response_data = response.json()
            response_item_count = response_data['count']
            response_item_total = response_data['total']

            print(f"response_data count: {response_data['count']}")
            print(f"response_data start: {response_data['start']}")
            print(f"response_data total: {response_data['total']}")

            # Collect each page's hits list.
            # We have PII here - customer email address (c_emailID) - Chad says to just ingest into raw and he will lock down the folder.
            response_data_hits = response_data.get("hits", [])

            # Tag every hit with the site it was requested for.
            for hit in response_data_hits:
                hit["site_id"] = site

            results += response_data_hits

            # See if we can end the processing for this site.
            # This assumes that response_item_count is the count of items in just this most recent call
            if response_item_count < page_item_count:
                # Not a full page of hits - nothing more to fetch for this site.
                break
            else:
                # Full page: move the start position and keep collecting pages, 200 at a time.
                page_start_value += page_item_count
                body["start"] = page_start_value

        except Exception as e:
            # Report the real failure and chain it, then abort the whole run.
            print("error=", str(e))
            raise Exception('Could not obtain API data.') from e

    # end while *********************************
# end for each site
print("-----------------------------------------------")
print(f"results count: {len(results)}")
print(f"SUCCESS")

### Save to raw zone in Data Lake

In [95]:
# Persist the collected API results as one JSON blob in the "raw" container.
# Blob name: EMWBIS/<yyyy>/<mm>/<dd>/<HHMMSS>.json, derived from the current clock time.
fileprefix = datetime.datetime.now().strftime("%Y/%m/%d/%H%M%S")
blob_name = f"EMWBIS/{fileprefix}.json"

# Serialise the `results` list built by the API cell for the upload.
formatted_response = json.dumps(results)

# Storage key comes from Key Vault; the clients are built container -> blob.
account_key = mssparkutils.credentials.getSecret(kv_name, 'storage-key', 'ls_kv_adap')
blob_service_client = BlobServiceClient(account_url=blob_adls_path, credential=account_key)
container_client = blob_service_client.get_container_client("raw")
blob_client = container_client.get_blob_client(blob_name)

# overwrite=True replaces a blob that already exists under the same name.
blob_client.upload_blob(formatted_response, overwrite=True)

#### **Copy Raw to Bronze**

In [9]:
# Read EMWBIS from raw container
# FULL LOAD
# emwbis_raw_df = spark.read.json(f'{raw_adls_path}EMWBIS/*/*/*/*.json')

# DAILY LOAD: only the files stored under today's yyyy/mm/dd folder.
current_date = datetime.datetime.now().strftime("%Y/%m/%d")
current_date_path = f"{raw_adls_path}EMWBIS/{current_date}/*.json"
emwbis_raw_df = spark.read.json(current_date_path)

# # # TEST DATE
# current_date = (datetime.datetime.now() - timedelta(days=5)).strftime("%Y/%m/%d")
# current_date_path = f"{raw_adls_path}EMWBIS/{current_date}/*.json"
# emwbis_raw_df = spark.read.json(current_date_path)

# take(1) can stop at the first row found; count() would scan every file just to
# learn whether there is anything to process.
if len(emwbis_raw_df.take(1)) == 0:
    print("emwbis_raw_df has no rows. Ending notebook run.")
    notebookutils.mssparkutils.notebook.exit(0)

In [10]:
from pyspark.sql.types import StructType, StructField, StringType, BinaryType, DateType, TimestampType, DecimalType, BooleanType


# Include File Path
emwbis_raw_df = emwbis_raw_df.withColumn("file_path", input_file_name()).distinct()

# Extract the date part (yyyy/mm/dd folders) from the file path
emwbis_raw_df = emwbis_raw_df.withColumn("date_part", regexp_extract("file_path", r'/(\d{4}/\d{2}/\d{2})/', 1))

# Extract the time part (HHMMSS file name) from the file path
emwbis_raw_df = emwbis_raw_df.withColumn("time_part", regexp_extract("file_path", r'/(\d{6})\.json$', 1))

# Combine the date and time parts into a single string
emwbis_raw_df = emwbis_raw_df.withColumn("datetime_string", concat_ws(" ", "date_part", "time_part"))

# Convert the combined string to a timestamp
emwbis_raw_df = emwbis_raw_df.withColumn("timestamp", to_timestamp("datetime_string", "yyyy/MM/dd HHmmss"))


# (source column, bronze column, bronze type) listed in bronze column order.
emwbis_column_map = [
    ('_resource_state', 'ResourceState', StringType()),
    ('c_emailID', 'CEmailId', StringType()),
    ('object_type', 'ObjectType', StringType()),
    ('c_productAvailabilityDate', 'CProductAvailabilityDate', DateType()),
    ('c_locale', 'CLocale', StringType()),
    ('last_modified', 'LastModified', TimestampType()),
    ('creation_date', 'CreationDate', TimestampType()),
    ('_type', 'Type', StringType()),
    ('c_productID', 'CProductId', StringType()),
    ('key_property', 'KeyProperty', StringType()),
    ('key_value_string', 'KeyValueString', StringType()),
    ('site_id', 'SiteId', StringType()),
    ('timestamp', 'IngestedAt', TimestampType()),
    ('file_path', 'IngestedFrom', StringType()),
    ('c_selectedCountry', 'CSelectedCountry', StringType()),
]

# The schema is inferred from the JSON actually read, so an attribute that no
# record carries today is simply not a column and the select below would fail.
# Add every missing source column up front: 'site_id' keeps its historical
# default of an empty string, everything else becomes NULL.
columns = emwbis_raw_df.columns
for source_name, _target_name, _data_type in emwbis_column_map:
    if source_name not in columns:
        default_value = lit('') if source_name == 'site_id' else lit(None)
        emwbis_raw_df = emwbis_raw_df.withColumn(source_name, default_value)

# Select, cast and rename to PascalCase
emwbis_df = emwbis_raw_df.select(
    *[
        col(source_name).cast(data_type).alias(target_name)
        for source_name, target_name, data_type in emwbis_column_map
    ]
)

In [11]:
# Write dataframe to bronze
bronze_emwbis_path = f'{bronze_adls_path}/EMWBIS/DQ_EMWBIS/'

if delta.DeltaTable.isDeltaTable(spark, bronze_emwbis_path):
    deltaTable = delta.DeltaTable.forPath(spark, bronze_emwbis_path)

    # High-water mark: the newest IngestedAt already stored in bronze.
    max_ingest_date = deltaTable.toDF().agg({"IngestedAt": "max"}).collect()[0][0]

    if max_ingest_date is None:
        # The table exists but holds no IngestedAt value (e.g. it is empty).
        # Comparing against NULL is never true, so the filter below would
        # silently discard every incoming row; append the whole batch instead.
        new_rows_df = emwbis_df
    else:
        # Keep only rows from files ingested after the high-water mark.
        new_rows_df = emwbis_df.filter(col("IngestedAt") > max_ingest_date)

    # Append only new rows to the Delta table
    new_rows_df.write.format("delta").mode("append").save(bronze_emwbis_path)
    print('Merged delta table.')

else:
    # First run: create the table from the full batch.
    emwbis_df.write.option("overwriteSchema", "true").mode("overwrite").format("delta").save(bronze_emwbis_path)
    print('Created delta table.')

## Bronze to Silver

In [12]:
# Load the bronze EMWBIS Delta table and expose it as a DataFrame.
bronze_emwbis_table = delta.DeltaTable.forPath(spark, f'{bronze_adls_path}/EMWBIS/DQ_EMWBIS/')
dq_emwbis = bronze_emwbis_table.toDF()

In [13]:
# Create Link
# dv_prep__email_with_customer_key not yet in place. using CEmailId for now as replacement to CustomerKey
# IS_BC_CUSTOMER_KEY, IS_OPTIMOVE_CUSTOMER_KEY not available

from pyspark.sql import Window


def _normalized_key(column_name):
    """Return the column upper-cased and trimmed (a NULL input stays NULL)."""
    return trim(upper(col(column_name)))


def _has_key_value(column_name):
    """Return a condition that is true only for a non-empty normalised value."""
    return _normalized_key(column_name) != ''


def _key_or_placeholder(column_name):
    """Return the normalised value, or '^^' when it is empty or NULL."""
    return when(_has_key_value(column_name), _normalized_key(column_name)).otherwise('^^')


def _single_key_hash(column_name):
    """Return the hash key of one business key; empty/NULL keys hash the all-zero placeholder."""
    return md5(
        when(_has_key_value(column_name), _normalized_key(column_name))
        .otherwise(lit('0000000000000000'))
    ).cast(BinaryType())


dv_prep_emwbis_lnk = (
    dq_emwbis.select(
        dq_emwbis["CEmailId"].alias("CustomerKey"),
        dq_emwbis["CProductId"].alias("ItemBarcodeKey"),
        "IngestedAt",
        "LastModified",
    )
    .withColumn("LoadDateTime", current_timestamp())
    .withColumn("RecordSource", lit('EMWBIS'))
)

# Hash CustomerKey+ItemBarcodeKey , CustomerKey, ItemBarcodeKey
# The link key hashes "CUSTOMER||BARCODE" when at least one side has a value
# (a missing side becomes '^^'); otherwise it hashes the all-zero placeholder.
# NOTE(review): md5() yields a hex string, so the BinaryType cast stores the bytes
# of that text rather than the raw digest. Left unchanged because the silver
# merge matches on these stored values.
link_hkey = md5(
    when(
        _has_key_value("CustomerKey") | _has_key_value("ItemBarcodeKey"),
        concat_ws('||', _key_or_placeholder("CustomerKey"), _key_or_placeholder("ItemBarcodeKey")),
    ).otherwise(lit('0000000000000000'))
).cast(BinaryType())

dv_stg_emwbis_lnk = (
    dv_prep_emwbis_lnk
    .withColumn("LnkEmwbisHkey", link_hkey)
    .withColumn("CustomerHkey", _single_key_hash("CustomerKey"))
    .withColumn("ItemBarcodeHkey", _single_key_hash("ItemBarcodeKey"))
)

# Derive Valid Records for Silver
# Valid Records = earliest row per LnkEmwbisHkey + CustomerHkey is not null + ItemBarcodeHkey is not null
# Ordering uses IngestedAt+LastModified instead of LoadDateTime because dv_stg_emwbis_lnk is not written as a table.
window_spec1 = Window.partitionBy("LnkEmwbisHkey").orderBy("IngestedAt", "LastModified")

dv_rdv_link_emwbis = (
    dv_stg_emwbis_lnk.select(
        "LnkEmwbisHkey",
        "CustomerHkey",
        "ItemBarcodeHkey",
        "CustomerKey",
        "ItemBarcodeKey",
        "LoadDateTime",
        "IngestedAt",
        "LastModified",
        "RecordSource",
    )
    # Keep exactly one row per link key: the first by IngestedAt, then LastModified.
    # A second row_number pass over the same partition key used to follow here; with one
    # row per key already guaranteed it could not remove anything, so it was dropped.
    .withColumn("RowNumber", row_number().over(window_spec1))
    .filter(col("RowNumber") == 1)
    .drop("RowNumber")
    # The hash expressions above always fall back to a placeholder, so NULLs are not
    # expected here; the filter is kept as a safety net.
    .filter((col("CustomerHkey").isNotNull()) & (col("ItemBarcodeHkey").isNotNull()))
)




In [16]:
# Write dataframe to silver: insert link rows whose hash key is not stored yet.
link_emwbis_path = f'{silver_adls_path}/EMWBIS/LINK_EMWBIS/'

if not DeltaTable.isDeltaTable(spark, link_emwbis_path):
    # No table yet: create it from the current batch.
    (
        dv_rdv_link_emwbis.write
        .option("overwriteSchema", "true")
        .mode("overwrite")
        .format("delta")
        .save(link_emwbis_path)
    )
    print('Created delta table.')
else:
    deltaTable = DeltaTable.forPath(spark, link_emwbis_path)

    # Insert-only merge: rows matching an existing LnkEmwbisHkey are left untouched.
    link_merge = deltaTable.alias("base").merge(
        source=dv_rdv_link_emwbis.alias("updates"),
        condition="base.LnkEmwbisHkey = updates.LnkEmwbisHkey",
    )
    link_merge.whenNotMatchedInsertAll().execute()
    print('Merged delta table.')

In [17]:
# Create SAT
# dv_prep__email_with_customer_key not yet in place. using CEmailId for now as replacement to CustomerKey
# IS_BC_CUSTOMER_KEY, IS_OPTIMOVE_CUSTOMER_KEY not available

dv_prep_emwbis_sat = (
    dq_emwbis
    .withColumn("CustomerKey", col("CEmailId"))
    .withColumn("ItemBarcodeKey", col("CProductId"))
    .withColumn("IsDeleted", lit(False))
    .withColumn("LoadDateTime", current_timestamp())
    .withColumn("EffectiveFrom", col("LastModified"))
    .withColumn("RecordSource", lit('EMWBIS'))
)


def _sat_clean(column_name):
    """Return the column upper-cased and trimmed."""
    return trim(upper(col(column_name)))


def _sat_clean_or_caret(column_name):
    """Return the cleaned value, or '^^' when it is empty or NULL."""
    return when(_sat_clean(column_name) != '', _sat_clean(column_name)).otherwise('^^')


def _sat_key_hash(column_name):
    """Return the hash of one key column, using the all-zero placeholder when it is empty or NULL."""
    return md5(
        when(_sat_clean(column_name) != '', _sat_clean(column_name))
        .otherwise(lit('0000000000000000'))
    ).cast(BinaryType())


# Columns feeding the hashdiff. The order is part of the hash and must not change.
# NOTE(review): IngestedFrom (the raw file path) is included, so the same record
# arriving in a different file gets a different hashdiff - confirm this is intended.
hashdiff_columns = [
    "ResourceState",
    "Type",
    "CEmailId",
    "CLocale",
    "CProductAvailabilityDate",
    "CProductId",
    "CSelectedCountry",
    "CreationDate",
    "CustomerKey",
    "IngestedFrom",
    "IsDeleted",
    "ItemBarcodeKey",
    "KeyProperty",
    "KeyValueString",
    "LastModified",
    "ObjectType",
    "RecordSource",
    "SiteId",
]

# Link key: hash of "CUSTOMER||BARCODE" when either side has a value, else of the placeholder.
sat_link_hkey = md5(
    when(
        (_sat_clean("CustomerKey") != '') | (_sat_clean("ItemBarcodeKey") != ''),
        concat_ws('||', _sat_clean_or_caret("CustomerKey"), _sat_clean_or_caret("ItemBarcodeKey")),
    ).otherwise(lit('0000000000000000'))
).cast(BinaryType())

# Hashdiff: md5 over all descriptive columns joined with '||' (not cast to binary).
sat_hashdiff = md5(
    concat_ws('||', *[_sat_clean_or_caret(column_name) for column_name in hashdiff_columns])
)

# Hash CustomerKey+ItemBarcodeKey , CustomerKey, ItemBarcodeKey, EmailMeWhenBackInStockHashdiff
dv_stg_emwbis_sat = (
    dv_prep_emwbis_sat
    .withColumn("LnkEmwbisHkey", sat_link_hkey)
    .withColumn("CustomerHkey", _sat_key_hash("CustomerKey"))
    .withColumn("ItemBarcodeHkey", _sat_key_hash("ItemBarcodeKey"))
    .withColumn("EmailMeWhenBackInStockHashdiff", sat_hashdiff)
)





In [24]:
# Write Valid Records to Silver
# Valid Records = rows whose link key is new to the SAT table, plus rows whose
# EmailMeWhenBackInStockHashdiff differs from the newest stored row for that key.

from pyspark.sql import Window
# Newest-first ordering per link key: LoadDateTime, then IngestedAt, then LastModified.
# IngestedAt+LastModified were added as tie-breakers because dv_stg_emwbis_sat is not written as a table.
window_spec1 = Window.partitionBy("LnkEmwbisHkey").orderBy(desc("LoadDateTime"), desc("IngestedAt"), desc("LastModified"))

# Candidate SAT rows: staged rows with a link key, without the two single-key hashes, de-duplicated.
dv_rdv_sat_emwbis = dv_stg_emwbis_sat.filter(col("LnkEmwbisHkey").isNotNull()).drop("CustomerHkey", "ItemBarcodeHkey").distinct()
# Distinct link keys of the candidates, renamed so the join column can be dropped again afterwards.
dv_rdv_sat_emwbis_LnkEmwbisHkey = dv_rdv_sat_emwbis.select(col("LnkEmwbisHkey").alias("LnkEmwbisHkey2")).distinct()

# Write dataframe to silver
if DeltaTable.isDeltaTable(spark, f'{silver_adls_path}/EMWBIS/SAT_EMWBIS'):
    deltaTable = DeltaTable.forPath(spark, f'{silver_adls_path}/EMWBIS/SAT_EMWBIS')

    deltaTable_df = deltaTable.toDF()
    # Prints the stored keys/hashdiffs to the cell output for inspection.
    deltaTable_df.select("LnkEmwbisHkey", "EmailMeWhenBackInStockHashdiff", "LoadDateTime", "IngestedAt", "LastModified").show(truncate=False)


    # For every link key present in the candidates, pick the newest row already stored
    # in the SAT table (row number 1 in the newest-first window).
    latest_records = (deltaTable_df.select("LnkEmwbisHkey", "EmailMeWhenBackInStockHashdiff", "LoadDateTime", "IngestedAt", "LastModified").distinct()
                                .join(dv_rdv_sat_emwbis_LnkEmwbisHkey, deltaTable_df.LnkEmwbisHkey == dv_rdv_sat_emwbis_LnkEmwbisHkey.LnkEmwbisHkey2, "inner")
                                .withColumn("RowNumber", row_number().over(window_spec1))
                                .filter(col("RowNumber") == 1)
                                .drop("RowNumber", "LnkEmwbisHkey2")
    )
    # Prints the newest stored row per key for inspection.
    latest_records.select("LnkEmwbisHkey", "EmailMeWhenBackInStockHashdiff", "LoadDateTime", "IngestedAt", "LastModified").show(truncate=False)

    # Left join candidates to the newest stored row of the same key and keep a candidate when
    #   - there is no stored row (right side NULL) or its hashdiff differs, AND
    #   - there is no stored row or the stored row was ingested earlier than the candidate.
    # Only the candidate's own columns are selected for the write.
    new_records = dv_rdv_sat_emwbis.join(
                                        latest_records,
                                        (latest_records.LnkEmwbisHkey == dv_rdv_sat_emwbis.LnkEmwbisHkey),
                                        "left"
                                    ).filter(
                                        ((latest_records.EmailMeWhenBackInStockHashdiff != dv_rdv_sat_emwbis.EmailMeWhenBackInStockHashdiff) |
                                        (latest_records.EmailMeWhenBackInStockHashdiff.isNull()))
                                        & ((latest_records.IngestedAt < dv_rdv_sat_emwbis.IngestedAt) |
                                        (latest_records.IngestedAt.isNull()))
                                    ).select(dv_rdv_sat_emwbis["*"])

    # Prints the rows about to be appended for inspection.
    new_records.select("LnkEmwbisHkey", "EmailMeWhenBackInStockHashdiff", "LoadDateTime", "IngestedAt", "LastModified").show(truncate=False)


    # Append the selected rows; existing SAT rows are never updated or removed here.
    new_records.write.format("delta").mode("append").save(f'{silver_adls_path}/EMWBIS/SAT_EMWBIS/')
    print('Merged delta table.')
else:
    # No SAT table yet: create it from all candidate rows.
    dv_rdv_sat_emwbis.write.option("overwriteSchema", "true").mode("overwrite").format("delta").save(f'{silver_adls_path}/EMWBIS/SAT_EMWBIS')
    print('Created delta table.')

## Silver to Gold

In [19]:
# Load the silver SAT_EMWBIS Delta table and expose it as a DataFrame.
sat_emwbis_table = DeltaTable.forPath(spark, f'{silver_adls_path}/EMWBIS/SAT_EMWBIS/')
sat_emwbis = sat_emwbis_table.toDF()

In [20]:
# Add Valid_ columns to create historical data
# Versions of one link key are ordered by IngestedAt, then LastModified.
windowSpec = Window.partitionBy("LnkEmwbisHkey").orderBy(col("IngestedAt"), col("LastModified"))

# Sentinel used as ValidTo when no later version exists.
open_ended = to_date(lit("9999-12-31"))
# IngestedAt of the next version of the same key (NULL for the last one).
next_ingested_at = lead("IngestedAt", 1).over(windowSpec)

sat_emwbis_hist = sat_emwbis.withColumn("ValidFrom", col("IngestedAt"))
sat_emwbis_hist = sat_emwbis_hist.withColumn("ValidTo", coalesce(next_ingested_at, open_ended))
sat_emwbis_hist = sat_emwbis_hist.withColumn(
    "ValidFlag",
    when(col("ValidTo") == open_ended, lit("Y")).otherwise(lit("N")),
)

# Filter ValidFlag = Y to get active data
sat_emwbis_act = sat_emwbis_hist.filter(col("ValidFlag") == "Y")

In [21]:
# Delete Insert to gold container: the fact table is fully rewritten from the active SAT rows.
# NOTE(review): "LoadDatetime" is spelled with a lower-case "t" here while the SAT cell
# creates "LoadDateTime"; kept verbatim - presumably resolved case-insensitively, confirm before changing.
gold_columns = [
    "CreationDate",
    "LastModified",
    "SiteId",
    "CLocale",
    "ItemBarcodeKey",
    "CEmailId",
    "CProductAvailabilityDate",
    "CSelectedCountry",
    "Type",
    "ObjectType",
    "KeyProperty",
    "KeyValueString",
    "ResourceState",
    "IngestedFrom",
    "IngestedAt",
    "LoadDatetime",
]

fact_emwbis = sat_emwbis_act.select(*gold_columns).distinct()

delta_table_path = f"{gold_adls_path}/EMWBIS/FACT_EMWBIS/"
(
    fact_emwbis.write
    .format("delta")
    .option("overwriteSchema", "true")
    .mode("overwrite")
    .save(delta_table_path)
)

In [22]:
# output = DeltaTable.forPath(spark, f'{gold_adls_path}/EMWBIS/FACT_EMWBIS/').toDF()
# print(output.count())
# display(output)