## Imports

In [None]:
from azure.storage.blob import BlobServiceClient
from notebookutils import mssparkutils
import urllib.parse
import re
import pandas as pd
from pyspark.sql.functions import col, to_date, coalesce, regexp_replace, date_add, current_date, last_day, expr, lit, unbase64,col, hex, split, lead, when, min as min_, max, translate, desc

from datetime import timedelta, datetime
from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable
import os

### Includes

In [None]:
%run /utils/common_functions

### Set Configuration

In [None]:
# Tenant identifier (GUID). It is not referenced elsewhere in the visible part of this notebook.
tenant_id = "deace5d6-717b-4f79-ab12-6357206c0c36"

# Derive the storage account name from the ADLS path: the text between "@" and
# ".dfs.core.windows.net". `raw_adls_path` is not defined in this notebook;
# presumably it is provided by the /utils/common_functions include - confirm.
match = re.search(r'@([^.]+)\.dfs\.core\.windows\.net', raw_adls_path)
if match is None:
    # Fail here with a clear message; otherwise the substring test below would
    # raise an opaque TypeError because storage_account would be None.
    raise ValueError(
        f"Could not derive the storage account from raw_adls_path: {raw_adls_path!r}"
    )
storage_account = match.group(1)
print(f"storage_account: {storage_account}")

# Account names containing "prodprd" are treated as production, everything else as development.
if "prodprd" in storage_account:
    environment = 'production'
else:
    environment = 'development'

print(f"environment: {environment}")

### Set Parameters

In [None]:
# First segmentation date to extract when no usable maximum date is found in the target.
start_date = '2025-06-01'
# When set to 1, extraction starts again from start_date even if the target already has data.
full_refresh_table = 0
##full_refresh_export = 1

# The source read is filtered from 40 days before start_date.
start_dt = datetime.strptime(start_date, "%Y-%m-%d")
lookback = timedelta(days=40)
filter_date = (start_dt - lookback).strftime("%Y-%m-%d")
print(f"filter_date: {filter_date}")

## Load Delta Table

### Get Source Data

In [None]:
# Locations of the weekly segmentation history (input) and the mParticle egress table (output).
source_path = f"abfss://gold@{storage_account}.dfs.core.windows.net/CustomerSegmentation/CustomerSegmentationHistory/period=weekly/"
target_path = f"abfss://gold@{storage_account}.dfs.core.windows.net/CustomerSegmentation/mparticle_egress/"

# Load the Delta data and keep only rows on or after the look-back date.
history_df = spark.read.format("delta").load(source_path)
segmentation_df = history_df.filter(col("Segmentation_date") >= filter_date)
print(f"source_path: {source_path}")

# Expose the filtered rows to Spark SQL under the name "segmentation".
segmentation_df.createOrReplaceTempView("segmentation")
##sales_df.show(2)

### Incremental Logic

In [None]:
# Look up the newest Segmentation_date already stored in the target Delta table, if it exists.
# `max` here is the pyspark.sql.functions aggregate imported at the top, not the Python builtin.
if DeltaTable.isDeltaTable(spark, target_path):
    target_df = spark.read.format("delta").load(target_path)
    max_date = target_df.agg(max("Segmentation_date")).collect()[0][0]
else:
    max_date = None

# Resume from the target's newest date unless a full refresh was requested or the
# target holds no date; in those cases fall back to start_date.
resume_from_target = full_refresh_table != 1 and max_date is not None
extract_date = max_date if resume_from_target else start_date

print(f"start_date: {start_date}")
print(f"max_date: {max_date}")
print(f"extract_date: {extract_date}")

In [None]:
# Keep only rows whose customermetricshashdiff differs from the previous row for the
# same MparticleUserKey (ordered by Segmentation_Date). A row with no previous value is
# compared against the placeholder 'x'.
# The query reads the "segmentation" temp view; no view named "source" is registered
# anywhere in this notebook, so the previous table reference could not be resolved here.
# NOTE(review): the next cell queries the "segmentation" view rather than this DataFrame,
# so this result does not appear to be consumed afterwards - confirm whether it is needed.
segmentation_df = spark.sql("""
        with cte as 
        (   select *, 
                lag(customermetricshashdiff) over(partition by MparticleUserKey order by Segmentation_Date) as Prev
            from segmentation 
        )
        select *
        from cte where customermetricshashdiff <> coalesce(Prev,'x')
""")

In [None]:
# Build the mParticle egress rows from the "segmentation" view.
#
# Inner query: for rows on/after extract_date, PREV is the previous
# customermetricshashdiff for the same (PERIOD, MparticleUserKey), ordered by date.
# Filter: keep a row when its hash differs from PREV, treating NULL vs non-NULL as a change.
# Outer query: rename the columns to the export names and tag each row with the environment.
#
# Incremental runs: extract_date is the newest Segmentation_date already in the target.
# Rows on that date are only needed as LAG context for later dates. Inside the window
# they have no earlier row, so PREV is NULL and every one of them would be re-emitted
# and appended again. They are therefore excluded from the output when resuming from
# the target. On an initial load or a full refresh the first date is kept as baseline.
is_incremental = max_date is not None and full_refresh_table != 1
already_exported_filter = (
    f"AND tmp.Segmentation_Date > date('{extract_date}')" if is_incremental else ""
)

mparticle_egress = spark.sql(f"""
with cust as 
(
SELECT *
FROM (
    SELECT
        SEGMENTATION_DATE,
        PERIOD,
        customermetricshashdiff,
        LAG(customermetricshashdiff) OVER (
            PARTITION BY PERIOD, MparticleUserKey
            ORDER BY SEGMENTATION_DATE
        ) AS PREV,
        *
    FROM segmentation    
    where Segmentation_Date >= date('{extract_date}')
) tmp
WHERE (
      (tmp.customermetricshashdiff IS NULL AND tmp.PREV IS NOT NULL)
   OR (tmp.customermetricshashdiff IS NOT NULL AND tmp.PREV IS NULL)
   OR (tmp.customermetricshashdiff != tmp.PREV)
)
{already_exported_filter}
ORDER BY SEGMENTATION_DATE DESC
)
select 
    Segmentation_date,
    MparticleUserKey as mpid,
    DerivedCustomerType as CUSTOMER_TYPE,
    lifecycle_stage as LIFECYCLE_STAGE,
    NewCustomerAcquiredFPorSale as NEW_CUSTOMER_ACQUIRED_FULL_PRICE_OR_SALE,
    CustomerLifeTimeValueGBP as CUSTOMER_LIFETIME_VALUE,
    NetLifeTimeValueGBP as NET_LIFETIME_VALUE,
    TotalOrdercount as TOTAL_ORDER_COUNT,
    AverageOrderValueGBP as AVERAGE_ORDER_VALUE,
    average_order_items as AVERAGE_ORDER_ITEMS,
    PercetageStoreOrders as PERCENTAGE_STORE_ORDERS,
    PercentageDiscountItems as PERCENTAGE_DISCOUNT_ITEMS,
    totalreturnamountGBP as TOTAL_RETURN_AMOUNT,
    FirstOrderSource as FIRST_ORDER_SOURCE,
    FirstOrderDate as FIRST_ORDER_DATE,
    FirstordervalueGBP as FIRST_ORDER_VALUE,
    FirstOrderItems as FIRST_ORDER_ITEMS,
    LastOrderSource as LAST_ORDER_SOURCE,
    LastOrderDate as LAST_ORDER_DATE,
    Lastordervalue as LAST_ORDER_VALUE,
    LastOrderItems as LAST_ORDER_ITEMS,
    FavouriteSource as FAVOURITE_SOURCE,
    FavouriteCategory as FAVOURITE_CATEGORY,
    FavouriteSubCategory as FAVOURITE_SUBCATEGORY,
    FavouriteColor as FAVOURITE_COLOUR,
    FavouriteColourType as FAVOURITE_COLOUR_TYPE,
    FavouriteHero as FAVOURITE_HERO,
    FavouriteLeggingSize as FAVOURITE_LEGGING_SIZE,
    FavouriteStore as FAVOURITE_STORE,
    CUSTOMER_FIRST_ORDER_PRICE_TYPE_V2,
    CUSTOMER_12M_ROLLING_PRICETYPE,
    '{environment}' as environment
from cust 
""")
##mparticle_egress.show(15,truncate=False)

### Write into Delta table

In [None]:
# A full refresh (full_refresh_table == 1) re-extracts everything from start_date, so
# appending would duplicate rows already in the target; replace the table contents in
# that case. Regular runs keep appending. mergeSchema lets new columns be added.
write_mode = "overwrite" if full_refresh_table == 1 else "append"
(
    mparticle_egress.write.format("delta")
    .mode(write_mode)
    .option("mergeSchema", "true")
    .partitionBy("Segmentation_date")
    .save(target_path)
)

## Exit

In [None]:
# Exit the notebook, passing "0" as the exit value.
exit_value = "0"
mssparkutils.notebook.exit(exit_value)

## Miscellaneous ad hoc code cells

In [None]:
# Ad hoc check: number of egress rows per Segmentation_date, newest date first.
mparticle_egress.createOrReplaceTempView("mparticle_egress")
rows_per_date_sql = """
        select 
            Segmentation_date,
            count(*)
        from mparticle_egress
        group by Segmentation_date
        order by Segmentation_date desc
"""
result_df = spark.sql(rows_per_date_sql)
#result_df.show(40)

In [None]:
# Ad hoc check: position of 'proddev' inside the storage account name (0 when absent).
# The query has to be an f-string; without the prefix the literal text
# '{storage_account}' is searched instead of the actual account name.
result_df = spark.sql(f"""
        select instr('{storage_account}','proddev')
""")
result_df.show(40)

In [None]:
# Ad hoc check: report whether the storage account name contains "noproddev".
substring_found = "noproddev" in storage_account
print("Substring exists" if substring_found else "Substring does not exist")

## Export Prep

In [None]:
from datetime import datetime

# Second-resolution timestamp, e.g. '20250711120802', embedded in the export file name.
export_time = datetime.now()
timestamp_str = export_time.strftime('%Y%m%d%H%M%S')

# Name of the export and its location in the "export" container of the storage account.
file_name = f"Synapse_Profile_Updates_{timestamp_str}-eventless.csv"
print(f"file_name: {file_name}")
export_path = f"abfss://export@{storage_account}.dfs.core.windows.net/mParticle/{file_name}"
print(f"export_path: {export_path}")

## Generate CSV

In [None]:
# Reduce the result to a single partition and write it as CSV with a header row.
# NOTE(review): Spark's CSV writer creates a directory at export_path containing part
# files rather than a single file with that name - confirm this is what the consumer expects.
single_partition_df = mparticle_egress.coalesce(1)
csv_writer = single_partition_df.write.option("header", "true")
csv_writer.csv(export_path)