# Customer Segmentation


## Imports

In [None]:
from azure.storage.blob import BlobServiceClient
from notebookutils import mssparkutils
import urllib.parse
import re
import pandas as pd
from pyspark.sql.functions import col, to_date, coalesce, regexp_replace, date_add, current_date, last_day, expr, lit, unbase64,col, hex, split, lead, when, min as min_, max, translate, desc

from datetime import timedelta
from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable
import os

### Includes

In [None]:
%run /utils/common_functions

### Set Configuration and Get Secrets

In [None]:
# SharePoint API Details
# NOTE(review): tenant_id is not referenced in the visible part of this notebook.
tenant_id = "deace5d6-717b-4f79-ab12-6357206c0c36"

# raw_adls_path is not defined in this notebook; presumably it is provided by
# the %run /utils/common_functions include -- TODO confirm.
# The storage account is the text between '@' and '.dfs.core.windows.net'.
match = re.search(r'@([^.]+)\.dfs\.core\.windows\.net', raw_adls_path)
if match is None:
    # Fail fast: continuing with storage_account=None would build
    # "abfss://...@None.dfs.core.windows.net/..." paths for every read below.
    raise ValueError(
        f"Could not derive the storage account from raw_adls_path: {raw_adls_path!r}"
    )
storage_account = match.group(1)
print(f"storage_account: {storage_account}")

### Set pipeline parameters

In [None]:
# Upper bound for candidate segmentation dates: current date + 1 day.
# This is a Spark Column expression, evaluated when a query using it runs.
tomorrow = F.date_add(F.current_date(), 1)

# Earliest calendar dates considered for monthly / weekly segmentation.
month_start_date = '2025-09-01'
week_start_date = '2025-11-01'
# The former full_refresh switch (reload from the start dates) is no longer used.


## Get Source data

In [None]:
# ADLS locations: the segmentation target table followed by every source input.
target_path = f"abfss://gold@{storage_account}.dfs.core.windows.net/CustomerSegmentation/CustomerSegmentationHistory/"
cust_path = f"abfss://gold@{storage_account}.dfs.core.windows.net/mParticle/CustomerMasterHistory/"
SalesUnion_path = f"abfss://gold@{storage_account}.dfs.core.windows.net/SalesUnion/"
hub_path = f"abfss://raw@{storage_account}.dfs.core.windows.net/Snowflake/ANALYTICS_PROD/DV_RDV/HUB_CUSTOMER/*.parquet"
exchange_path = f"abfss://gold@{storage_account}.dfs.core.windows.net/SAP/BW/FxRatesExtendedCalendarDay/"
custkey_path = f"abfss://silver@{storage_account}.dfs.core.windows.net/mParticle/CustomerKeyLookup/"
date_path = f"abfss://raw@{storage_account}.dfs.core.windows.net/Snowflake/ANALYTICS_PROD/ANALYTICS_DATA/DIM_CALENDAR/*.parquet"
material_path = f"abfss://gold@{storage_account}.dfs.core.windows.net/SAP/BW/Material/"
hierarchy_path = f"abfss://gold@{storage_account}.dfs.core.windows.net/SAP/BW/ProductHierarchyLevel2/"
site_path = f"abfss://gold@{storage_account}.dfs.core.windows.net/SAP/BW/Site/"
upc_path = f"abfss://raw@{storage_account}.dfs.core.windows.net/SAP/BW/ZUPC/"

# Sources read as Delta tables.
ex_df = spark.read.load(exchange_path, format="delta")
custkey_df = spark.read.load(custkey_path, format="delta")
material_df = spark.read.load(material_path, format="delta")
prod_hierarchy_df = spark.read.load(hierarchy_path, format="delta")
site_df = spark.read.load(site_path, format="delta")

# Sources read as plain Parquet.
hub_df = spark.read.parquet(hub_path)
cust_df = spark.read.parquet(cust_path)
sales_df = spark.read.parquet(SalesUnion_path)
date_df = spark.read.parquet(date_path)
upc_df = spark.read.parquet(upc_path)

# Log the main locations for troubleshooting.
print(f"exchange_path: {exchange_path}")
print(f"hub_path: {hub_path}")
print(f"cust_path: {cust_path}")
print(f"custkey_path: {custkey_path}")
print(f"SalesUnion_path: {SalesUnion_path}")
print(f"target_path: {target_path}")

# Register the frames that the SQL cells below query by name.
for _view_name, _frame in (("salesunion", sales_df), ("site", site_df)):
    _frame.createOrReplaceTempView(_view_name)

## Prep

### Get incremental Dates

In [None]:
# Latest segmentation dates already stored in the target table.
# Both stay None when the target is not (yet) a Delta table.
max_month = max_week = None

if DeltaTable.isDeltaTable(spark, target_path):
    target_df = spark.read.format("delta").load(target_path)
    _latest_date = F.max("Segmentation_date")
    max_month = (
        target_df.where(F.col("period") == "monthly").agg(_latest_date).collect()[0][0]
    )
    max_week = (
        target_df.where(F.col("period") == "weekly").agg(_latest_date).collect()[0][0]
    )

print(f"max_month: {max_month}")
print(f"max_week: {max_week}")

### Dates

In [None]:
# Keep only calendar_date from the calendar dimension.
date_df = date_df.select(F.col("calendar_date").alias("calendar_date"))

_cal_date = F.col("calendar_date")

# Monthly candidates: month-end dates between month_start_date and tomorrow.
month_conditions = (
    (_cal_date == F.last_day(_cal_date))
    & (_cal_date <= tomorrow)
    & (_cal_date >= month_start_date)
)
if max_month is not None:
    # Incremental run: only dates after the latest monthly date already loaded.
    month_conditions = month_conditions & (_cal_date > max_month)

monthly_df2 = (
    date_df.where(month_conditions)
    .withColumn("segmentation_date", _cal_date)
    .withColumn("period", F.lit("monthly"))
)
# Process at most the 12 earliest outstanding month-ends per run.
monthly_df = monthly_df2.orderBy("segmentation_date").limit(12)

# Weekly candidates: rows flagged "Mon" between week_start_date and tomorrow.
# NOTE(review): weekday_name is not among the columns selected above; this
# filter relies on Spark still resolving it from the calendar source -- confirm.
Week_conditions = (
    (F.col("weekday_name") == "Mon")
    & (_cal_date <= tomorrow)
    & (_cal_date >= week_start_date)
)
if max_week is not None:
    # Incremental run: only dates after the latest weekly date already loaded.
    Week_conditions = Week_conditions & (_cal_date > max_week)

weekly_df2 = (
    date_df.where(Week_conditions)
    .withColumn("segmentation_date", _cal_date)
    .withColumn("period", F.lit("weekly"))
)
# Process at most the 52 earliest outstanding weeks per run.
weekly_df = weekly_df2.orderBy("segmentation_date").limit(52)

# Monthly and weekly dates together drive every segmentation query below.
segmentation_periods_df = monthly_df.unionByName(weekly_df)
segmentation_periods_df.createOrReplaceTempView("segmentation_periods")

### Materials

In [None]:
# One alias of the product-hierarchy lookup per hierarchy level, so the same
# table can be joined three times.
ph1_df = prod_hierarchy_df.alias("ph1")
ph2_df = prod_hierarchy_df.alias("ph2")
ph3_df = prod_hierarchy_df.alias("ph3")

# Materials enriched with the text (TXTMD) of hierarchy levels 1-3.
material2_df = material_df.alias("main")
material2_df = material2_df.join(
    ph1_df,
    on=F.col("main.ProductHierarchylevel1") == F.col("ph1.PROD_HIER"),
    how="left",
)
material2_df = material2_df.join(
    ph2_df,
    on=F.col("main.ProductHierarchylevel2") == F.col("ph2.PROD_HIER"),
    how="left",
)
material2_df = material2_df.join(
    ph3_df,
    on=F.col("main.ProductHierarchylevel3") == F.col("ph3.PROD_HIER"),
    how="left",
)
material2_df = material2_df.select(
    "main.*",
    F.col("ph1.TXTMD").alias("ProductHierarchy1Text"),
    F.col("ph2.TXTMD").alias("ProductHierarchy2Text"),
    F.col("ph3.TXTMD").alias("ProductHierarchy3Text"),
)

material2_df.createOrReplaceTempView("material")

# UPC level: the enriched materials joined to the ZUPC extract on material
# number, adding grid value and UPC; exact duplicate rows are removed.
material3_df = material2_df.alias("m").join(
    upc_df,
    on=F.col("m.Material") == F.col("ZMATNUM"),
    how="left",
)
material3_df = material3_df.select(
    "m.*",
    F.col("AF_GRDVAL").alias("gridvalue"),
    F.col("ZUPC").alias("upc"),
).distinct()
material3_df.createOrReplaceTempView("upc")


### Customer

In [None]:
# Expose the customer master history to Spark SQL as "cust".
cust_df.createOrReplaceTempView("cust")

# Build one validity window [validfrom, validto) per customer version:
#   * CTE c maps CustomerType codes 0/1/2 to Registered/New/Returning and
#     numbers the rows of each (mparticleuserkey, validfrom) by validto; only
#     the first row (R = 1) is kept.
#   * The earliest validfrom of each customer is replaced by '1900-01-01'.
#   * validto is the next validfrom of the same customer, or 9999-12-31 when
#     there is no later version.
# NOTE(review): the '1900-01-01' branch is a string literal while validto uses
# TO_TIMESTAMP -- confirm the resulting validfrom type is what the range join
# in "Orders with Segmentation Dates" expects.
customer_df = spark.sql("""
with c as 
(
    select mparticleuserkey,
    GlobalCustomerKey,
    Email,
    case when CustomerType = 0 then 'Registered'
        when CustomerType = 1 then 'New'
        when CustomerType = 2 then 'Returning'
        end as CustomerType,
    EffectiveFrom,
    (validfrom) as validfrom,
    row_number() over(partition by mparticleuserkey, (validfrom) order by validto) R
    from cust

)
SELECT  distinct
  mparticleuserkey,
  GlobalCustomerKey,
  Email,
  CustomerType,
  EffectiveFrom as cust_EffectiveFrom,
  CASE 
    WHEN validfrom = MIN(validfrom) OVER (PARTITION BY mparticleuserkey) THEN ('1900-01-01')
    ELSE validfrom
  END AS validfrom,
  COALESCE(
    LEAD(validfrom) OVER (
      PARTITION BY mparticleuserkey
      ORDER BY validfrom, EffectiveFrom
    ), 
    TO_TIMESTAMP('9999-12-31')
  ) AS validto
FROM c
where r = 1

""")

# Registered as "customers" for the segmentation joins below.
customer_df.createOrReplaceTempView("customers")


### Order Rank

In [None]:
# Order-level fact with a per-customer sequence number.
#   * CTE sales: every salesunion line plus transaction_type ('Sales' when
#     IsReturn is NULL/0, otherwise 'Returns') and descriptive attributes from
#     the material view (joined on OldMaterialNumber) and the site view.
#   * CTE orders: one row per Tenant, MparticleUserID, OrderId, OrderDate,
#     StoreName, OrderDateTime, transaction_type, Currency, IsReturn and
#     customertype. Promo ids, categories, sub-categories and item descriptions
#     are collected into sorted, de-duplicated delimited strings; line counts
#     and GBP sums are split by PriceType (MD, FP, anything ending in POS).
#   * Final select: price_type is the price type with the highest line count
#     whose GBP sum is also the greatest among the types tied on that count
#     (NULL when no branch matches).
#     ORDER_SEQ numbers the orders of a customer chronologically, separately
#     for sales and returns; LAST_ORDER_FLAG marks the most recent one.
# NOTE(review): ORDER_SEQ breaks OrderDateTime ties on OrderId with non-digits
# stripped, LAST_ORDER_FLAG on the raw OrderId -- confirm the difference is
# intended.
order_rank_df = spark.sql("""
WITH sales AS (
    SELECT 
        s.* ,
        CASE WHEN coalesce(IsReturn,0) = 0 THEN 'Sales' ELSE 'Returns' END AS transaction_type,
        m.LongDescription as ProductDescription,
        m.colorDescription as Color,
        m.ProductHierarchy2Text as Category,
        m.ProductHierarchy3Text as SubCategory,
        i.MediumDescription as StoreName
    FROM salesunion s 
    left join material m on s.product = m.OldMaterialNumber
    left join site i on i.site = s.store
),
orders AS (
    SELECT 
        Tenant, 
        MparticleUserID,
        max(TransactionChannel) as TransactionChannel,
        OrderId,
        OrderDate,
        (OrderDateTime) as OrderDateTime,
        transaction_type,
        coalesce(IsReturn,0) as IsReturn,
        Currency,
        max(store) as store,
        StoreName,
        concat_ws('|', sort_array(array_distinct(collect_list(PromoId)))) AS PromoId,
        max(ShippingCountryCode) as ShippingCountryCode,
        coalesce(customertype,'Registered') as DerivedCustomerType,
        concat_ws(';', sort_array(array_distinct(collect_list(Category)))) AS OrderCategories,
        concat_ws(';', sort_array(array_distinct(collect_list(SubCategory)))) AS OrderSubCategories,
        count(*) as lines,
        sum( case when discountflag = 'Y' then 1 else 0 end ) as DiscountedLines,
        concat_ws(';', sort_array(array_distinct(collect_list(ProductDescription)))) AS OrderItems,
        SUM(orderquantity) AS orderquantity,
        sum(salesquantity) AS salesquantity,
        SUM(CASE WHEN PriceType = 'MD' THEN 1 ELSE 0 END) AS no_of_md,
        SUM(CASE WHEN PriceType = 'FP' THEN 1 ELSE 0 END) AS no_of_fp,
        SUM(CASE WHEN PriceType LIKE '%POS' THEN 1 ELSE 0 END) AS no_of_pos,
        SUM(CASE WHEN PriceType = 'MD' THEN COALESCE(order_value_incl_tax_gbp, sales_value_incl_tax_gbp, 0) END) AS sum_of_md,
        SUM(CASE WHEN PriceType = 'FP' THEN COALESCE(order_value_incl_tax_gbp, sales_value_incl_tax_gbp, 0) END) AS sum_of_fp,
        SUM(CASE WHEN PriceType LIKE '%POS' THEN COALESCE(order_value_incl_tax_gbp, sales_value_incl_tax_gbp, 0) END) AS sum_of_pos,
        SUM(COALESCE(order_value_incl_tax, sales_value_incl_tax, 0)) AS order_value_incl_tax,
        SUM(COALESCE(order_value_incl_tax_usd, sales_value_incl_tax_usd, 0)) AS order_value_incl_tax_usd,
        SUM(COALESCE(order_value_incl_tax_gbp, sales_value_incl_tax_gbp, 0)) AS order_value_incl_tax_gbp,
        SUM(COALESCE(sales_value_incl_tax, 0)) AS sales_value_incl_tax,
        SUM(COALESCE(sales_value_incl_tax_usd, 0)) AS sales_value_incl_tax_usd,
        SUM(COALESCE(sales_value_incl_tax_gbp, 0)) AS sales_value_incl_tax_gbp,
        max(EffectiveFrom) as order_EffectiveFrom
    FROM sales
    GROUP BY 
        Tenant, MparticleUserID,  OrderId, OrderDate,StoreName,
        (OrderDateTime), transaction_type, Currency,  coalesce(IsReturn,0), customertype
)
    SELECT *,

        CASE 
            WHEN no_of_md = GREATEST(no_of_md, no_of_fp, no_of_pos) AND sum_of_md = 
                GREATEST(
                CASE WHEN GREATEST(no_of_md, no_of_fp, no_of_pos) = no_of_md THEN sum_of_md ELSE 0 END,
                CASE WHEN GREATEST(no_of_md, no_of_fp, no_of_pos) = no_of_fp THEN sum_of_fp ELSE 0 END,
                CASE WHEN GREATEST(no_of_md, no_of_fp, no_of_pos) = no_of_pos THEN sum_of_pos ELSE 0 END
                )
            THEN 'MD'

            WHEN no_of_fp = GREATEST(no_of_md, no_of_fp, no_of_pos) AND sum_of_fp = 
                GREATEST(
                CASE WHEN GREATEST(no_of_md, no_of_fp, no_of_pos) = no_of_md THEN sum_of_md ELSE 0 END,
                CASE WHEN GREATEST(no_of_md, no_of_fp, no_of_pos) = no_of_fp THEN sum_of_fp ELSE 0 END,
                CASE WHEN GREATEST(no_of_md, no_of_fp, no_of_pos) = no_of_pos THEN sum_of_pos ELSE 0 END
                )
            THEN 'FP'

            WHEN no_of_pos = GREATEST(no_of_md, no_of_fp, no_of_pos) AND sum_of_pos = 
                GREATEST(
                CASE WHEN GREATEST(no_of_md, no_of_fp, no_of_pos) = no_of_md THEN sum_of_md ELSE 0 END,
                CASE WHEN GREATEST(no_of_md, no_of_fp, no_of_pos) = no_of_fp THEN sum_of_fp ELSE 0 END,
                CASE WHEN GREATEST(no_of_md, no_of_fp, no_of_pos) = no_of_pos THEN sum_of_pos ELSE 0 END
                )
            THEN 'POS'
        END AS price_type,

        ROW_NUMBER() OVER(PARTITION BY MparticleUserID, IsReturn ORDER BY OrderDateTime, regexp_replace(OrderId, '[^0-9]', '')) AS ORDER_SEQ,
        CASE WHEN ROW_NUMBER() OVER(PARTITION BY MparticleUserID, IsReturn ORDER BY OrderDateTime DESC, OrderId Desc) = 1 THEN 1 ELSE 0 END AS LAST_ORDER_FLAG

    FROM orders F

""")

# Registered under the same name as the DataFrame variable.
order_rank_df.createOrReplaceTempView("order_rank_df")

### Orders with Segmentation Dates

In [None]:
# Orders repeated per segmentation date: each order is matched to the customer
# version whose [validfrom, validto) window contains OrderDateTime, then paired
# with every segmentation date later than OrderDateTime.
# EffectiveFrom prefers the order-level value over the customer-level one.
# NOTE(review): the join to segmentation_periods is an inner join on
# o.OrderDateTime, so customer rows without a matching order (o.* all NULL
# after the left join) cannot satisfy it and are dropped -- confirm intended.
orders_df = spark.sql("""
		select
           s.segmentation_date,
           s.period,
           o.*,
           c.CustomerType,
           coalesce(o.order_EffectiveFrom,c.cust_EffectiveFrom) as EffectiveFrom
     from customers c
     left join order_rank_df o
        on c.MparticleUserKey = o.MparticleUserId
        and (o.OrderDateTime) >= (c.validfrom)
        and (o.OrderDateTime) <  (c.validto)
     join segmentation_periods s
       on (o.OrderDateTime) < (s.segmentation_date)
     
""")

##orders_df.show(2)
orders_df.createOrReplaceTempView("Orders_with_segmentation")

### Segmentation Period

In [None]:
# Distinct (period, segmentation_date) pairs that actually occur in
# Orders_with_segmentation, i.e. only dates with at least one qualifying order.
period_df = spark.sql("""
		select distinct period, segmentation_date from Orders_with_segmentation
""")

##orders_df.show(2)
period_df.createOrReplaceTempView("seg_periods")

### Order Lines with Segmentation

In [None]:
# Line-level equivalent of Orders_with_segmentation: every non-return
# salesunion line is repeated for each segmentation date later than its
# OrderDateTime. Product attributes come from the upc view (matched on
# barcode), the store name from the site view.
# LeggingSize is the grid value, populated only when the level-2 hierarchy
# text is 'Leggings'.
# NOTE(review): presumably the upc view holds at most one row per upc;
# otherwise this join multiplies order lines -- verify.
order_lines_df = spark.sql("""
		select
           s.segmentation_date,
           s.period,
           o.*,
           m.colorDescription as Color,
        m.ProductHierarchy2Text as Category,
        m.ProductHierarchy3Text as SubCategory,
        m.ConsumerTerritory as hero,
        CASE WHEN m.ProductHierarchy2Text= 'Leggings' THEN m.GridValue end as LeggingSize,
        i.MediumDescription as StoreName
     from salesunion o
     join segmentation_periods s
       on o.OrderDateTime < s.segmentation_date
    left join upc m on o.barcode = m.upc
    left join site i on i.site = o.store
    where o.isreturn = 0 
       """)
order_lines_df.createOrReplaceTempView("order_line_segmentation")

## Segmentation

### Item Level Derivations

In [None]:
# Per customer and segmentation date: number of distinct barcodes and number
# of distinct categories among the order lines before that date.
order_items_df = spark.sql("""
		select
           o.segmentation_date,
           o.period,
           o.MparticleUserId,
           count(distinct barcode ) as total_distinct_items,
           count(distinct Category) as NumberOfCategories
     from order_line_segmentation o
        group by 
            o.segmentation_date,
            o.period,
            o.MparticleUserId
       """)
order_items_df.createOrReplaceTempView("order_items")

### First Order

In [None]:
# First sales order of each customer (ORDER_SEQ = 1, IsReturn = 0).
#   * cte_RETURNs: returned quantity of the customer's first return
#     (ORDER_SEQ = 1 within IsReturn = 1), keyed by OrderId and customer.
#   * cte_sales: attributes of the first sales order; FPorSale is 'Sale' when
#     more than half of the order's lines are discounted, else 'Full Price'.
# returnorderquantity is therefore non-zero only when the first return carries
# the same OrderId as the first sales order.
first_order_df = spark.sql("""
WITH cte_RETURNs AS 
(
    select MparticleUserID,
            OrderId,
            sum(coalesce(r.orderquantity,r.salesquantity,0)) as returnorderquantity
        from order_rank_df r
    where IsReturn = 1 and ORDER_SEQ = 1 
    group by OrderId, MparticleUserID
)
, cte_sales as 
(
SELECT 
    F.MparticleUserID,
    F.OrderId,
    F.TransactionChannel AS FirstOrderSource,
    F.OrderDateTime AS FirstOrderDate,
    F.Currency as FirstOrderCurrency,
    f.store,
    f.StoreName,
    f.PromoId,
    coalesce(f.orderquantity,f.salesquantity,0) as orderquantity,
    f.salesquantity,
    f.ShippingCountryCode,
    COALESCE(F.order_value_incl_tax, F.sales_value_incl_tax) AS Firstordervalue,
    COALESCE(F.order_value_incl_tax_usd, F.sales_value_incl_tax_usd) AS Firstordervalueusd,
    COALESCE(F.order_value_incl_tax_gbp, F.sales_value_incl_tax_gbp) AS Firstordervaluegbp,
    f.PRICE_TYPE AS FIRST_ORDER_PRICE_TYPE,
    F.OrderItems,
    CASE WHEN DiscountedLines/coalesce(lines,1) > 0.5 then 'Sale' else 'Full Price' end as FPorSale,
    F.OrderCategories
FROM order_rank_df F
WHERE F.ORDER_SEQ = 1 AND F.ISReturn = 0
)
SELECT 
s.*,coalesce(r.returnorderquantity,0) as returnorderquantity
from cte_sales s 
left join cte_returns r on s.OrderId = r.OrderId and s.MparticleUserID = r.MparticleUserID

""")

##first_order_df.show(2)
first_order_df.createOrReplaceTempView("first_order")

### First Order Price Type

In [None]:
# Price-type classification (V2) of each customer's first sales order.
# The lines of the first order (ORDER_SEQ = 1, IsReturn = 0) are summed into
# full-price (FP%), markdown (MD%) and promo (non-empty PROMOID) GBP amounts.
# A class is assigned when it makes up at least 70% of the order's GBP total,
# checked in the order FP, MD, PROMO; everything else is 'MIXED'.
#
# Fix: the shares used to divide GBP amounts by TOTAL_AMOUNT, which is in the
# order's local currency, so non-GBP orders were scaled by the FX rate and
# misclassified. The denominator is now the GBP total (TOTAL_AMOUNT_GBP).
# TOTAL_AMOUNT stays in the output unchanged and the column list is the same.
# NOTE(review): order_line_segmentation repeats a line once per segmentation
# date, so the amounts are multiples of the real order value; the shares are
# unaffected because numerator and denominator scale together.
first_order_pt_df = spark.sql("""
with sales as
(
    SELECT
        F.MparticleUserID,
        F.OrderId,
        F.OrderDateTime,
        F.Currency,
        SUM(COALESCE(F.order_value_incl_tax, F.sales_value_incl_tax)) AS TOTAL_AMOUNT,
        SUM(COALESCE(F.order_value_incl_tax_gbp, F.sales_value_incl_tax_gbp)) AS TOTAL_AMOUNT_GBP,
        SUM(CASE WHEN F.pricetype LIKE 'FP%' THEN COALESCE(F.order_value_incl_tax_gbp, F.sales_value_incl_tax_gbp) ELSE 0 END) AS FP_AMOUNT,
        SUM(CASE WHEN F.pricetype LIKE 'MD%' THEN COALESCE(F.order_value_incl_tax_gbp, F.sales_value_incl_tax_gbp) ELSE 0 END) AS MD_AMOUNT,
        SUM(CASE WHEN F.PROMOID <> '' and F.PROMOID is not null  THEN COALESCE(F.order_value_incl_tax_gbp, F.sales_value_incl_tax_gbp) ELSE 0 END) AS PROMO_AMOUNT
    FROM order_rank_df o
    join order_line_segmentation F on o.orderid = f.orderid and o.mparticleuserid = f.mparticleuserid and f.OrderDateTime = o.OrderDateTime
    WHERE o.ORDER_SEQ = 1 AND o.ISReturn = 0
    and o.MPARTICLEUSERID IS NOT NULL
    GROUP BY F.MparticleUserID,
        F.OrderId,
        F.OrderDateTime,
        F.Currency
)
SELECT
    s.MparticleUserID,
    s.OrderId,
    s.OrderDateTime,
    s.Currency,
    s.TOTAL_AMOUNT,
    s.FP_AMOUNT,
    s.MD_AMOUNT,
    s.PROMO_AMOUNT,
    CASE WHEN s.TOTAL_AMOUNT_GBP = 0 THEN 'MIXED'
         WHEN 100 * s.FP_AMOUNT / s.TOTAL_AMOUNT_GBP >= 70 THEN 'FP'
         WHEN 100 * s.MD_AMOUNT / s.TOTAL_AMOUNT_GBP >= 70 THEN 'MD'
         WHEN 100 * s.PROMO_AMOUNT / s.TOTAL_AMOUNT_GBP >= 70 THEN 'PROMO'
         ELSE 'MIXED'
    END CUSTOMER_FIRST_ORDER_PRICE_TYPE_V2
from sales s

""")

##first_order_df.show(2)
first_order_pt_df.createOrReplaceTempView("first_order_pt_df")

### Customer 12M Rolling Price Type

In [None]:
# Rolling 12-month price-type classification per customer and segmentation
# date. Non-return lines ordered within the 12 months before the segmentation
# date are summed into full-price (FP%), markdown (MD%) and promo (non-empty
# PROMOID) GBP amounts. A class is assigned when it makes up at least 70% of
# the GBP total, checked in the order FP, MD, PROMO; a zero/NULL total gives
# 'INACTIVE', anything else 'MIXED'.
#
# Fix: the shares used to divide GBP amounts by TOTAL_AMOUNT, a sum of
# local-currency values (potentially across several currencies), which skews
# the percentages for non-GBP customers. The denominator is now the GBP total
# (TOTAL_AMOUNT_GBP). TOTAL_AMOUNT stays in the output unchanged and the
# column list is the same.
cust_12m_rolling_pricetype_df = spark.sql("""
with sales as
(
    SELECT
        F.MparticleUserID,
        F.segmentation_date,
        F.period,
        min(F.OrderDateTime) as min_OrderDateTime,
        SUM(COALESCE(F.order_value_incl_tax, F.sales_value_incl_tax)) AS TOTAL_AMOUNT,
        SUM(COALESCE(F.order_value_incl_tax_gbp, F.sales_value_incl_tax_gbp)) AS TOTAL_AMOUNT_GBP,
        SUM(CASE WHEN F.pricetype LIKE 'FP%' THEN COALESCE(F.order_value_incl_tax_gbp, F.sales_value_incl_tax_gbp) ELSE 0 END) AS FP_AMOUNT,
        SUM(CASE WHEN F.pricetype LIKE 'MD%' THEN COALESCE(F.order_value_incl_tax_gbp, F.sales_value_incl_tax_gbp) ELSE 0 END) AS MD_AMOUNT,
        SUM(CASE WHEN F.PROMOID <> '' and F.PROMOID is not null  THEN COALESCE(F.order_value_incl_tax_gbp, F.sales_value_incl_tax_gbp) ELSE 0 END) AS PROMO_AMOUNT
    FROM order_line_segmentation F
    WHERE F.ISReturn = 0
    and F.MPARTICLEUSERID IS NOT NULL
    and F.orderdatetime > add_months(segmentation_date, -12)
    GROUP BY F.MparticleUserID,
            F.segmentation_date,
            F.period
)
SELECT
    s.MparticleUserID,
    s.segmentation_date,
    s.period,
    s.min_OrderDateTime,
    s.TOTAL_AMOUNT,
    s.FP_AMOUNT,
    s.MD_AMOUNT,
    s.PROMO_AMOUNT,
    CASE WHEN coalesce(s.TOTAL_AMOUNT_GBP,0) = 0 THEN 'INACTIVE'
         WHEN 100 * s.FP_AMOUNT / s.TOTAL_AMOUNT_GBP >= 70 THEN 'FP'
         WHEN 100 * s.MD_AMOUNT / s.TOTAL_AMOUNT_GBP >= 70 THEN 'MD'
         WHEN 100 * s.PROMO_AMOUNT / s.TOTAL_AMOUNT_GBP >= 70 THEN 'PROMO'
         ELSE 'MIXED'
    END CUSTOMER_12M_ROLLING_PRICETYPE
from sales s

""")

##first_order_df.show(2)
cust_12m_rolling_pricetype_df.createOrReplaceTempView("cust_12m_rolling_pricetype_df")

### Second Order

In [None]:
# Second sales order of each customer (ORDER_SEQ = 2, IsReturn = 0).
#   * RETURNs: returned quantity of the customer's second return
#     (ORDER_SEQ = 2 within IsReturn = 1), keyed by OrderId and customer.
#   * sales: attributes of the second sales order.
# returnorderquantity is therefore non-zero only when the second return
# carries the same OrderId as the second sales order.
second_order_df = spark.sql("""
WITH RETURNs AS 
(
    select MparticleUserID,
            OrderId,
            sum(coalesce(r.orderquantity,r.salesquantity,0)) as returnorderquantity
        from order_rank_df r
    where IsReturn = 1 and ORDER_SEQ = 2 
    group by OrderId, MparticleUserID
)
, sales as 
(
SELECT 
    S.MparticleUserID,
    s.OrderId,
    S.Currency as SecondOrderCurrency,
    S.TransactionChannel AS SecondOrderSource,
    S.OrderDateTime AS SecondOrderDate,
    s.store,
    s.StoreName,
    s.PromoId,
    coalesce(s.orderquantity,s.salesquantity,0) as orderquantity,
    s.salesquantity,
    COALESCE(S.order_value_incl_tax, S.sales_value_incl_tax) AS Secondordervalue,
    COALESCE(S.order_value_incl_tax_usd, S.sales_value_incl_tax_usd) AS Secondordervalueusd,
    COALESCE(S.order_value_incl_tax_gbp, S.sales_value_incl_tax_gbp) AS Secondordervaluegbp,
    s.PRICE_TYPE AS Second_ORDER_PRICE_TYPE,
    s.OrderItems,
    S.OrderCategories
FROM order_rank_df S 
WHERE S.ORDER_SEQ = 2 AND S.ISReturn = 0
)
SELECT 
s.*,coalesce(r.returnorderquantity,0) as returnorderquantity
from sales s 
left join returns r on s.OrderId = r.OrderId and s.MparticleUserID = r.MparticleUserID

""")

##second_order_df.show(2)
second_order_df.createOrReplaceTempView("second_order")

### Last Order

In [None]:
# Most recent sales order per customer, period and segmentation date.
#   * base_data: order-level rows from Orders_with_segmentation with summed
#     values and quantities.
#   * last_order: ranks the rows of each customer/period/segmentation date,
#     separately for sales and returns, newest first (orderid DESC as tie-break).
#   * final_data: the newest sale and the newest return (rn = 1).
# The final select keeps the newest sale and attaches the quantity of the
# newest return only when that return has the same orderid; otherwise
# returnorderquantity is 0.
last_order_df = spark.sql("""
WITH base_data AS (
    SELECT 
        o.MparticleUserId,
        o.segmentation_date,
        o.period,
        o.orderid,
        o.orderdatetime as LastOrderDateTime,
        o.TransactionChannel as LastOrderSource,
        o.store,
        o.StoreName,
        o.PromoId,
        o.IsReturn,
        o.customertype,
        o.DerivedCustomerType,
        o.currency as LastOrderCurrency,
        o.price_type as Last_order_price_type,
        SUM(COALESCE(o.order_value_incl_tax_gbp, o.sales_value_incl_tax_gbp)) AS LastordervalueGBP,
        SUM(COALESCE(o.order_value_incl_tax, o.sales_value_incl_tax)) AS Lastordervalue,
        SUM(COALESCE(o.order_value_incl_tax_usd, o.sales_value_incl_tax_usd)) AS Lastordervalueusd,
        SUM(COALESCE(o.orderquantity, o.salesquantity,0)) AS orderquantity,
        concat_ws(';', sort_array(array_distinct(collect_list(o.OrderItems)))) as OrderItems
    FROM Orders_with_segmentation o
    GROUP BY 
        o.MparticleUserId,
        o.segmentation_date,
        o.period,
        o.orderid,
        o.orderdatetime,
        o.TransactionChannel,
        o.store,
        o.StoreName,
        o.PromoId,
        o.currency,
        o.price_type,
        o.IsReturn,
        o.customertype,
        o.DerivedCustomerType
),
last_order AS (
    SELECT *,
        ROW_NUMBER() OVER (
            PARTITION BY MparticleUserId, period, segmentation_date, IsReturn
            ORDER BY LastOrderDateTime DESC, orderid DESC
        ) AS rn
    FROM base_data
)
, final_data as (
SELECT * 
FROM last_order
WHERE rn = 1
)
select s.* , coalesce(r.orderquantity,0) as returnorderquantity
from final_data s 
left join final_data r on s.orderid = r.orderid 
                and s.segmentation_Date = r.segmentation_date 
                and s.period = r.period                 
                and r.isReturn = 1
                and s.MparticleUserId = r.MparticleUserId
where s.isreturn = 0 
""")

last_order_df.createOrReplaceTempView("last_order")

### Total Orders

In [None]:
# Lifetime totals per customer, period and segmentation date, computed over
# all orders before the segmentation date.
# Naming convention in this query: "Order" measures use COALESCE(order_*,
# sales_*), "Sales" measures use the sales_* column only.
#
# Fix: the two quantity aliases were swapped relative to that convention
# (TotalSalesQuantity was built from orderquantity and TotalOrderQuantity from
# salesquantity). They now match their names; column positions are unchanged.
# NOTE(review): the quantity totals and number_of_orders_last_12_month are not
# filtered on IsReturn, unlike the value totals -- confirm intended.
# NOTE(review): TotalOrderCount and TotalSalesCount are the same expression.
total_order_df = spark.sql("""
    SELECT
        o.MparticleUserId,
        o.segmentation_date,
        o.period,
        SUM(case when ISReturn = 0 then COALESCE(o.order_value_incl_tax_gbp, o.sales_value_incl_tax_gbp,0) else 0 end) AS TotalordervalueGBP,
        SUM(case when ISReturn = 0 then COALESCE(o.order_value_incl_tax_usd, o.sales_value_incl_tax_usd,0)  else 0 end) AS Totalordervalueusd,
        SUM(case when ISReturn = 0 then COALESCE(o.sales_value_incl_tax_gbp,0) else 0 end) AS TotalSalesvalueGBP,
        SUM(case when ISReturn = 0 then COALESCE(o.sales_value_incl_tax_usd,0) else 0 end) AS TotalSalesvalueusd,
        SUM(o.salesquantity) AS TotalSalesQuantity,
        SUM(COALESCE(o.orderquantity, o.salesquantity)) AS TotalOrderQuantity,
        SUM(case when ISReturn = 0 then COALESCE(o.order_value_incl_tax_gbp, o.sales_value_incl_tax_gbp,0) else 0 end) AS CustomerLifeTimeValueGBP,
        SUM(case when ISReturn = 0 then COALESCE(o.order_value_incl_tax_usd, o.sales_value_incl_tax_usd,0) else 0 end) AS CustomerLifeTimeValueUSD,
        sum(case when ISReturn = 1 then coalesce(order_value_incl_tax_gbp,sales_value_incl_tax_gbp,0) end) as totalreturnamountGBP,
        sum(case when ISReturn = 1 then coalesce(order_value_incl_tax_usd,sales_value_incl_tax_usd,0) end) as totalreturnamountUSD,
        COUNT(DISTINCT CASE WHEN orderdatetime >= DATEADD(YEAR, -1, segmentation_date) THEN orderid END) AS number_of_orders_last_12_month,
        COUNT(DISTINCT case when ISReturn = 0 then orderid end) AS TotalOrderCount,
        COUNT(DISTINCT case when isreturn = 0 then orderid end) AS TotalSalesCount,
        COUNT(DISTINCT case when lower(TransactionChannel) = 'retail' then orderid end) AS TotalOrderCountRetail,
        COUNT(DISTINCT case when lower(TransactionChannel) = 'digital' then orderid end) AS TotalOrderCountWeb,
        COUNT(DISTINCT case when ISReturn = 0 and lower(TransactionChannel) = 'retail' then orderid end) AS TotalOrderCountRetailSales,
        COUNT(DISTINCT case when ISReturn = 0 and lower(TransactionChannel) = 'digital' then orderid end) AS TotalOrderCountWebSales,
        min(case when lower(TransactionChannel) = 'retail' then cast(orderdatetime as date) end) as firstretailorderdate,
        min(case when lower(TransactionChannel) = 'digital' then cast(orderdatetime as date) end) as firstweborderdate,
        max(EffectiveFrom) as EffectiveFrom


    FROM Orders_with_segmentation o
    GROUP BY
        o.MparticleUserId,
        o.segmentation_date,
        o.period
""")

total_order_df.createOrReplaceTempView("total_order")

### Percentage discount

In [None]:
# Share of discounted order lines per customer, period and segmentation date.
# A line counts as discounted when its original RRP value exceeds its POS RRP
# value (both GBP, incl. tax). One row is returned per customer/period/
# segmentation date (the earliest order line, rn = 1) carrying the counts and
# the ratio.
#
# Fix: the two window counts were partitioned by MparticleUserId only. Because
# Order_line_segmentation repeats each line for every later segmentation date,
# the counts mixed the lines of all segmentation dates, so every snapshot got
# the same blended ratio, including orders placed after its own date. The
# counts are now taken within the same customer/period/segmentation_date
# partition as rn.
pct_disc_df = spark.sql("""
WITH base_data AS (
  SELECT
    MparticleUserId,
    segmentation_date,
    period,
    orderdatetime,
    sales_original_rrp_value_incl_tax_gbp,
    sales_pos_rrp_value_incl_tax_gbp,
    CASE
      WHEN sales_original_rrp_value_incl_tax_gbp - sales_pos_rrp_value_incl_tax_gbp > 0 THEN 1
      ELSE 0
    END AS is_discounted
  FROM Order_line_segmentation
),
windowed_data AS (
  SELECT *,
         SUM(is_discounted) OVER (
           PARTITION BY MparticleUserId, period, segmentation_date
         ) AS numberOfDiscountedItems,
         COUNT(is_discounted) OVER (
           PARTITION BY MparticleUserId, period, segmentation_date
         ) AS numberOfItems,
         ROW_NUMBER() OVER (
           PARTITION BY MparticleUserId, period, segmentation_date
           ORDER BY orderdatetime ASC
         ) AS rn
  FROM base_data
)
SELECT
  MparticleUserId,
  segmentation_date,
  period,
  orderdatetime,
  sales_original_rrp_value_incl_tax_gbp,
  sales_pos_rrp_value_incl_tax_gbp,
  is_discounted,
  numberOfDiscountedItems,
  numberOfItems,
  numberOfDiscountedItems / COALESCE(numberOfItems, 1) AS ratio
FROM windowed_data
WHERE rn = 1
""")

pct_disc_df.createOrReplaceTempView("pct_disc")

### Fav Category

In [None]:
# Favourite category per customer, period and segmentation date over the 12
# months before the segmentation date: highest total sales quantity, then
# highest total GBP order value, then most recent purchase, then category name.
#
# Fix: orderdatetime was part of the GROUP BY, so quantities were summed per
# order timestamp and the "favourite" was the category of the single largest
# order rather than of the whole window. Lines are now grouped per category;
# orderdatetime in the output is the latest purchase of that category and
# still serves as the recency tie-break. Output columns are unchanged.
fav_cat_df = spark.sql("""
WITH ranked_data AS (
  SELECT
    MparticleUserID,
    segmentation_date,
    period,
    category,
    MAX(orderdatetime) AS orderdatetime,
    SUM(salesquantity) as sales_quntity,
    SUM(order_value_incl_tax_gbp) AS sum_of_order_value,
    COUNT(orderid) AS cnt_order,
    ROW_NUMBER() OVER (
      PARTITION BY MparticleUserID, period, segmentation_date
      ORDER BY
        SUM(salesquantity) DESC,
        SUM(order_value_incl_tax_gbp) DESC,
        MAX(orderdatetime) DESC,
        category DESC
    ) AS rn
  FROM order_line_segmentation
  WHERE category IS NOT NULL
  and orderdatetime > add_months(segmentation_date, -12)
  GROUP BY
    MparticleUserID,
    segmentation_date,
    period,
    category
)
SELECT *
FROM ranked_data
WHERE rn = 1
""")

fav_cat_df.createOrReplaceTempView("fav_cat")

### Fav SubCategory

In [None]:
# Favourite sub-category per customer, period and segmentation date over the
# 12 months before the segmentation date: highest total sales quantity, then
# highest total GBP order value, then most recent purchase, then name.
#
# Fix: orderdatetime was part of the GROUP BY, so quantities were summed per
# order timestamp and the "favourite" was the sub-category of the single
# largest order rather than of the whole window. Lines are now grouped per
# sub-category; orderdatetime in the output is the latest purchase of that
# sub-category and still serves as the recency tie-break. Output columns are
# unchanged.
fav_subcat_df = spark.sql("""
WITH ranked_data AS (
  SELECT
    MparticleUserID,
    segmentation_date,
    period,
    Subcategory,
    MAX(orderdatetime) AS orderdatetime,
    SUM(salesquantity) as sales_quntity,
    SUM(order_value_incl_tax_gbp) AS sum_of_order_value,
    COUNT(orderid) AS cnt_order,
    ROW_NUMBER() OVER (
      PARTITION BY MparticleUserID, period, segmentation_date
      ORDER BY
        SUM(salesquantity) DESC,
        SUM(order_value_incl_tax_gbp) DESC,
        MAX(orderdatetime) DESC,
        Subcategory DESC
    ) AS rn
  FROM order_line_segmentation
  WHERE Subcategory IS NOT NULL
  and orderdatetime > add_months(segmentation_date, -12)
  GROUP BY
    MparticleUserID,
    segmentation_date,
    period,
    Subcategory
)
SELECT *
FROM ranked_data
WHERE rn = 1
""")

fav_subcat_df.createOrReplaceTempView("fav_subcat")

### Fav Colour

In [None]:
# Favourite colour per customer, period and segmentation date over the 12
# months before the segmentation date: highest total sales quantity, then
# highest total GBP order value, then most recent purchase, then colour name.
#
# Fix: orderdatetime was part of the GROUP BY, so quantities were summed per
# order timestamp and the "favourite" was the colour of the single largest
# order rather than of the whole window. Lines are now grouped per colour;
# orderdatetime in the output is the latest purchase of that colour and still
# serves as the recency tie-break. Output columns are unchanged.
fav_color_df = spark.sql("""
WITH ranked_data AS (
  SELECT
    MparticleUserID,
    segmentation_date,
    period,
    Color,
    MAX(orderdatetime) AS orderdatetime,
    SUM(salesquantity) as sales_quntity,
    SUM(order_value_incl_tax_gbp) AS sum_of_order_value,
    COUNT(orderid) AS cnt_order,
    ROW_NUMBER() OVER (
      PARTITION BY MparticleUserID, period, segmentation_date
      ORDER BY
        SUM(salesquantity) DESC,
        SUM(order_value_incl_tax_gbp) DESC,
        MAX(orderdatetime) DESC,
        Color DESC
    ) AS rn
  FROM order_line_segmentation
  WHERE Color IS NOT NULL
  and orderdatetime > add_months(segmentation_date, -12)
  GROUP BY
    MparticleUserID,
    segmentation_date,
    period,
    Color
)
SELECT *
FROM ranked_data
WHERE rn = 1
""")

fav_color_df.createOrReplaceTempView("fav_color")

### Fav Hero

In [None]:
# Favourite "hero" value per customer, period and segmentation date over the
# 12 months before the segmentation date: highest total sales quantity, then
# highest total GBP order value, then most recent purchase, then hero value.
#
# Fix: orderdatetime was part of the GROUP BY, so quantities were summed per
# order timestamp and the "favourite" was the hero value of the single largest
# order rather than of the whole window. Lines are now grouped per hero value;
# orderdatetime in the output is the latest purchase of that value and still
# serves as the recency tie-break. Output columns are unchanged.
fav_hero_df = spark.sql("""
WITH ranked_data AS (
  SELECT
    MparticleUserID,
    segmentation_date,
    period,
    hero,
    MAX(orderdatetime) AS orderdatetime,
    SUM(salesquantity) as sales_quntity,
    SUM(order_value_incl_tax_gbp) AS sum_of_order_value,
    COUNT(orderid) AS cnt_order,
    ROW_NUMBER() OVER (
      PARTITION BY MparticleUserID, period, segmentation_date
      ORDER BY
        SUM(salesquantity) DESC,
        SUM(order_value_incl_tax_gbp) DESC,
        MAX(orderdatetime) DESC,
        hero DESC
    ) AS rn
  FROM order_line_segmentation
  WHERE hero IS NOT NULL
  and orderdatetime > add_months(segmentation_date, -12)
  GROUP BY
    MparticleUserID,
    segmentation_date,
    period,
    hero
)
SELECT *
FROM ranked_data
WHERE rn = 1
""")

fav_hero_df.createOrReplaceTempView("fav_hero")

### Fav Legging Size

In [None]:
# Favourite legging size per customer, period and segmentation date over the
# 12 months before the segmentation date: highest total sales quantity, then
# highest total GBP order value, then most recent purchase, then size value.
#
# Fix: orderdatetime was part of the GROUP BY, so quantities were summed per
# order timestamp and the "favourite" was the size of the single largest order
# rather than of the whole window. Lines are now grouped per size;
# orderdatetime in the output is the latest purchase of that size and still
# serves as the recency tie-break. Output columns are unchanged.
fav_leg_df = spark.sql("""
WITH ranked_data AS (
  SELECT
    MparticleUserID,
    segmentation_date,
    period,
    LeggingSize,
    MAX(orderdatetime) AS orderdatetime,
    SUM(salesquantity) as sales_quntity,
    SUM(order_value_incl_tax_gbp) AS sum_of_order_value,
    COUNT(orderid) AS cnt_order,
    ROW_NUMBER() OVER (
      PARTITION BY MparticleUserID, period, segmentation_date
      ORDER BY
        SUM(salesquantity) DESC,
        SUM(order_value_incl_tax_gbp) DESC,
        MAX(orderdatetime) DESC,
        LeggingSize DESC
    ) AS rn
  FROM order_line_segmentation
  WHERE LeggingSize IS NOT NULL
  and orderdatetime > add_months(segmentation_date, -12)
  GROUP BY
    MparticleUserID,
    segmentation_date,
    period,
    LeggingSize
)
SELECT *
FROM ranked_data
WHERE rn = 1
""")

fav_leg_df.createOrReplaceTempView("fav_leg")

### Fav source

In [None]:
# Favourite transaction channel per customer, period and segmentation date
# over the 12 months before the segmentation date: highest total sales
# quantity, then highest total GBP order value, then most recent purchase,
# then channel name.
#
# Fix: orderdatetime was part of the GROUP BY, so quantities were summed per
# order timestamp and the "favourite" was the channel of the single largest
# order rather than of the whole window. Lines are now grouped per channel;
# orderdatetime in the output is the latest purchase through that channel and
# still serves as the recency tie-break. Output columns are unchanged.
fav_source_df = spark.sql("""
WITH ranked_data AS (
  SELECT
    MparticleUserID,
    segmentation_date,
    period,
    TransactionChannel,
    MAX(orderdatetime) AS orderdatetime,
    SUM(salesquantity) as sales_quntity,
    SUM(order_value_incl_tax_gbp) AS sum_of_order_value,
    COUNT(orderid) AS cnt_order,
    ROW_NUMBER() OVER (
      PARTITION BY MparticleUserID, period, segmentation_date
      ORDER BY
        SUM(salesquantity) DESC,
        SUM(order_value_incl_tax_gbp) DESC,
        MAX(orderdatetime) DESC,
        TransactionChannel DESC
    ) AS rn
  FROM order_line_segmentation
  WHERE TransactionChannel IS NOT NULL
  and orderdatetime > add_months(segmentation_date, -12)
  GROUP BY
    MparticleUserID,
    segmentation_date,
    period,
    TransactionChannel
)
SELECT *
FROM ranked_data
WHERE rn = 1
""")

fav_source_df.createOrReplaceTempView("fav_source")

### Fav Store

In [None]:
# Favourite store per customer, period and segmentation date over the 12
# months before the segmentation date: highest total sales quantity, then
# highest total GBP order value, then most recent purchase, then store code.
#
# Fix: orderdatetime was part of the GROUP BY, so quantities were summed per
# order timestamp and the "favourite" was the store of the single largest
# order rather than of the whole window. Lines are now grouped per store
# (Store, StoreName); orderdatetime in the output is the latest purchase at
# that store and still serves as the recency tie-break. Output columns are
# unchanged.
fav_store_df = spark.sql("""
WITH ranked_data AS (
  SELECT
    MparticleUserID,
    segmentation_date,
    period,
    Store,
    StoreName,
    MAX(orderdatetime) AS orderdatetime,
    SUM(salesquantity) as sales_quntity,
    SUM(order_value_incl_tax_gbp) AS sum_of_order_value,
    COUNT(orderid) AS cnt_order,
    ROW_NUMBER() OVER (
      PARTITION BY MparticleUserID, period, segmentation_date
      ORDER BY
        SUM(salesquantity) DESC,
        SUM(order_value_incl_tax_gbp) DESC,
        MAX(orderdatetime) DESC,
        Store DESC
    ) AS rn
  FROM order_line_segmentation
  WHERE Store IS NOT NULL
  and orderdatetime > add_months(segmentation_date, -12)
  GROUP BY
    MparticleUserID,
    segmentation_date,
    period,
    Store,
    StoreName
)
SELECT *
FROM ranked_data
WHERE rn = 1
""")

fav_store_df.createOrReplaceTempView("fav_store")

### FP or Sale

In [None]:
# Full-price vs. sale flag of the earliest order per customer, period and
# segmentation date. Non-return lines are grouped by orderdatetime; a group is
# 'Sale' when more than half of its lines have discountflag = 'Y', otherwise
# 'Full price'. Only the earliest group (rn = 1) is kept.
fp_sale_df = spark.sql("""
SELECT *
FROM (
    SELECT 
        MparticleUserId,
        segmentation_date,
        period,
        orderdatetime,
        COUNT(*) AS lines,
        CASE 
            WHEN SUM(CASE WHEN discountflag = 'Y' THEN 1 ELSE 0 END) / COALESCE(COUNT(*), 1) > 0.5 THEN 'Sale'
            ELSE 'Full price'
        END AS fp_or_sale,
        ROW_NUMBER() OVER (
            PARTITION BY MparticleUserId, period, segmentation_date
            ORDER BY orderdatetime ASC
        ) AS rn
    FROM order_line_segmentation
    WHERE orderid IS NOT NULL
    and isreturn = 0
    GROUP BY MparticleUserId, segmentation_date, period, orderdatetime
) tmp
WHERE rn = 1
""")

fp_sale_df.createOrReplaceTempView("fp_sale")


### Segmentation Final Joins

In [None]:
# Final segmentation join: one row per customer version (customers c) and
# segmentation period (seg_periods o) where the segmentation date falls
# inside the customer's validfrom/validto range. Every metric view is
# LEFT JOINed so customers without orders are kept.
#
# Join keys:
#   - First_Order, second_order, first_order_pt_df: customer key plus
#     "order date <= segmentation_date".
#   - all other views: customer key + segmentation_date + period.
#
# Fixes relative to the previous version:
#   - SecondOrderStoreName now comes from the second order (s.StoreName);
#     it previously repeated the first order's store name (F.StoreName).
#     Assumes second_order exposes StoreName like First_Order and last_order
#     do - confirm.
#   - FirstOrderNumberOfItemsNet / SecondOrderNumberOfItemsNet now coalesce a
#     NULL return quantity to 0, matching LastOrderNumberOfItemsNet, so an
#     order without a return quantity no longer yields a NULL net count.
segmentation_prep_df = spark.sql("""
SELECT   
        O.Segmentation_date,
        O.period,
        c.MparticleUserKey,
        c.GlobalCustomerKey,
        c.Email,
        l.customertype,
        case when coalesce(t.TotalOrderCount,0) = 0 then 'Registered'
                when t.TotalOrderCount = 1 then 'New'
                when t.TotalOrderCount > 1 then 'Returning'
          end as       DerivedCustomerType,
        FirstOrderSource,
        cast(FirstOrderDate as timestamp) as FirstOrderDate,
        FirstOrderCurrency,
        Firstordervalue,
        Cast(Firstordervalueusd as decimal(38,3)) as Firstordervalueusd,
        Firstordervaluegbp,
        FIRST_ORDER_PRICE_TYPE as FirstOrderPriceType,
        f.OrderQuantity as FirstOrderNumberOfItemsGross,
        f.OrderQuantity - coalesce(abs(f.returnorderquantity),0) as FirstOrderNumberOfItemsNet,
        f.OrderItems as FirstOrderItems,
        F.OrderCategories AS FirstOrderCategories,
        f.Store as FirstOrderStore,
        cast(F.StoreName as varchar(200)) as FirstOrderStoreName,
        f.ShippingCountryCode,

        SecondOrderSource,
        cast(SecondOrderDate as timestamp) as SecondOrderDate,
        SecondOrderCurrency,
        Secondordervalue,
        Cast(Secondordervalueusd as decimal(38,3)) as Secondordervalueusd,
        Secondordervaluegbp,
        Second_ORDER_PRICE_TYPE as SecondOrderPriceType,
        s.store as SecondOrderStore,
        cast(s.StoreName as varchar(200)) as SecondOrderStoreName,
        s.OrderQuantity as SecondOrderNumberOfItemsGross,
        s.OrderQuantity - coalesce(abs(s.returnorderquantity),0) as SecondOrderNumberOfItemsNet,
        s.OrderItems as SecondOrderItems,
        s.OrderCategories AS SecondOrderCategories,

        LastOrderSource,
        cast(LastOrderDateTime as timestamp) as LastOrderDate,
        LastOrderCurrency,
        Lastordervalue,
        Cast(Lastordervalueusd as decimal(38,3)) as Lastordervalueusd,
        Lastordervaluegbp,
        Last_ORDER_PRICE_TYPE as LastOrderPriceType,
        l.store as LastOrderStore,
        cast(L.StoreName as varchar(200)) as LastOrderStoreName,
        l.OrderQuantity as LastOrderNumberOfItemsGross,
        l.OrderQuantity - coalesce(abs(l.returnorderquantity),0) as LastOrderNumberOfItemsNet,
        l.OrderItems as LastOrderItems,

        t.TotalordervalueGBP,
        Cast(t.Totalordervalueusd as decimal(38,3)) as Totalordervalueusd,
        t.TotalSalesvalueGBP,
        Cast(t.TotalSalesvalueusd as decimal(38,3)) as TotalSalesvalueusd,
        t.TotalSalescount,
        t.TotalOrdercount,
        t.totalreturnamountGBP,
        t.totalreturnamountUSD,
        t.number_of_orders_last_12_month as NumberOfOrdersLast12Months,
        coalesce(t.CustomerLifeTimeValueGBP,0) as CustomerLifeTimeValueGBP,
        Cast(coalesce(t.CustomerLifeTimeValueUSD,0) as decimal(38,3)) as CustomerLifeTimeValueUSD,
        cast((coalesce(t.TotalSalesvalueGBP,0) + coalesce(t.totalreturnamountGBP,0)) as decimal(38,3)) as NetLifeTimeValueGBP,
        cast((coalesce(t.TotalSalesvalueUSD,0) + coalesce(t.totalreturnamountUSD,0)) as decimal(38,3)) as NetLifeTimeValueUSD,
        cast(round(t.TotalordervalueGBP/t.TotalOrdercount,3) as decimal(38,3)) as AverageOrderValueGBP,
        cast(round(t.TotalordervalueUSD/t.TotalOrdercount,3) as decimal(38,3)) as AverageOrderValueUSD,

        t.TotalOrderCountRetail,
        cast(t.firstretailorderdate as date) as firstretailorderdate,
        cast(t.firstweborderdate as date) as firstweborderdate,

        case when  coalesce(t.TotalSalesCount,0) > 0 then  cast(TotalOrderCountRetail/t.TotalSalesCount as numeric(38,2)) end as PercetageStoreOrders,
        CASE 
                WHEN LastOrderDateTime BETWEEN add_months(o.segmentation_date, -12) AND o.segmentation_date THEN 'Active'
                WHEN LastOrderDateTime BETWEEN add_months(o.segmentation_date, -24) AND add_months(o.segmentation_date, -12) THEN 'Lapsed'
                WHEN LastOrderDateTime < add_months(o.segmentation_date, -24) THEN 'Dormant'
        END AS recency,
        case when TotalOrderCount = 1 then 'Single'
                when TotalOrderCount > 1 then 'Multi'
        end as frequency,
        case when TotalOrderCountRetailSales > 0 and TotalOrderCountWebSales = 0 then 'Retail'
                 when TotalOrderCountRetailSales = 0 and TotalOrderCountWebSales > 0 then 'Digital'
                 when TotalOrderCountRetailSales > 0 and TotalOrderCountWebSales > 0 then 'Omnichannel'
                 else 'No orders/unknown'
            end as ChannelsShopped,
        
        fps.fp_or_sale as NewCustomerAcquiredFPorSale,

        cat.Category as FavouriteCategory ,
        scat.SubCategory as FavouriteSubCategory,
        color.color as FavouriteColor,
        fs.TransactionChannel as FavouriteSource,
        i.total_distinct_items as TotalDistinctItems,
        i.NumberOfCategories,
        cast(pd.ratio as decimal(38,2)) as PercentageDiscountItems,
        st.Store as FavouriteStore,
        st.StoreName as FavouriteStoreName,
        
        f.PromoId as FirstOrderPromoId,
        cast(null as varchar(200))  as FirstOrderStyles,
        s.PromoId as SecondOrderPromoID,
        cast(null as varchar(200))  as SecondOrderStyles,
        l.PromoId  as LastOrderPromoID,
        cast(null as varchar(200))  as LastOrderStyles,
        cast(null as varchar(200))  as FavouriteColourType,
        cast(h.hero as varchar(200))  as FavouriteHero,
        cast(leg.LeggingSize as varchar(200))  as FavouriteLeggingSize,
        fp.CUSTOMER_FIRST_ORDER_PRICE_TYPE_V2 ,
        coalesce(pt.CUSTOMER_12M_ROLLING_PRICETYPE,'INACTIVE') as  CUSTOMER_12M_ROLLING_PRICETYPE,
        t.EffectiveFrom

FROM customers c
join seg_periods o  on o.segmentation_date between to_timestamp(c.validfrom) and to_timestamp(c.validto)
LEFT JOIN First_Order f on c.MparticleUserKey = f.MParticleUserId and f.FirstOrderDate <= o.segmentation_date
LEFT JOIN second_order s on c.MparticleUserKey = s.MParticleUserId and s.SecondOrderDate <= o.segmentation_date
LEFT JOIN last_order l on c.MparticleUserKey = l.MparticleUserId and l.segmentation_date = o.segmentation_date and o.period = l.period
LEFT JOIN total_order t on c.MparticleUserKey = t.MparticleUserId and t.segmentation_date = o.segmentation_date and o.period = t.period
LEFT JOIN fav_cat cat on c.MparticleUserKey = cat.MparticleUserId and cat.segmentation_date = o.segmentation_date and o.period = cat.period
LEFT JOIN fav_subcat scat on c.MparticleUserKey = scat.MparticleUserId and scat.segmentation_date = o.segmentation_date and o.period = scat.period
LEFT JOIN fav_color color on c.MparticleUserKey = color.MparticleUserId and color.segmentation_date = o.segmentation_date and o.period = color.period
LEFT JOIN fav_hero h on c.MparticleUserKey = h.MparticleUserId and h.segmentation_date = o.segmentation_date and o.period = h.period
LEFT JOIN fav_leg leg on c.MparticleUserKey = leg.MparticleUserId and leg.segmentation_date = o.segmentation_date and o.period = leg.period
LEFT JOIN fav_source fs on c.MparticleUserKey = fs.MparticleUserId and fs.segmentation_date = o.segmentation_date and o.period = fs.period
LEFT JOIN order_items i on c.MparticleUserKey = i.MparticleUserId and i.segmentation_date = o.segmentation_date and o.period = i.period
LEFT JOIN pct_disc pd on c.MparticleUserKey = pd.MparticleUserId and pd.segmentation_date = o.segmentation_date and o.period = pd.period
LEFT JOIN fav_store st on c.MparticleUserKey = st.MparticleUserId and st.segmentation_date = o.segmentation_date and o.period = st.period
LEFT JOIN fp_sale fps on c.MparticleUserKey = fps.MparticleUserId and fps.segmentation_date = o.segmentation_date and o.period = fps.period
LEFT JOIN first_order_pt_df fp on c.MparticleUserKey = fp.MparticleUserId and fp.OrderDateTime <= o.segmentation_date
LEFT JOIN cust_12m_rolling_pricetype_df pt on c.MparticleUserKey = pt.MparticleUserId and pt.segmentation_date = o.segmentation_date and o.period = pt.period
""")

# Register the joined result as temp view "segmentation_prep" for the
# aggregate-derivation step.
segmentation_prep_df.createOrReplaceTempView("segmentation_prep")

### Aggregate Derivations

In [None]:
# Adds derived columns on top of every column of segmentation_prep:
#   - lifecycle_stage: recency and frequency joined by a single space.
#     NOTE(review): CONCAT presumably yields NULL when either input is NULL
#     (e.g. customers with no orders) - confirm that is acceptable.
#   - average_order_items: TotalDistinctItems / TotalSalescount, only when
#     TotalSalescount is greater than 0 (otherwise NULL).
#   - RR_<N>Days_Flag: 1 when SecondOrderDate is at most N days after
#     FirstOrderDate (N = 30, 90, 180, 360), else 0. A missing second order
#     makes the comparison not true, so the flag falls to the ELSE 0 branch.
segmentation_prep2_df = spark.sql("""
SELECT       
    *,
    CONCAT(recency, ' ', frequency) AS lifecycle_stage,
    cast(case when coalesce(o.TotalSalescount,0) > 0 then o.TotalDistinctItems / o.TotalSalescount  end as decimal(38,2))  as average_order_items,
    CASE WHEN DATEDIFF(SecondOrderDate, FirstOrderDate) <= 30 THEN 1 ELSE 0 END as RR_30Days_Flag,
    CASE WHEN DATEDIFF(SecondOrderDate, FirstOrderDate) <= 90 THEN 1 ELSE 0 END as RR_90Days_Flag,
    CASE WHEN DATEDIFF(SecondOrderDate, FirstOrderDate) <= 180 THEN 1 ELSE 0 END as RR_180Days_Flag,
    CASE WHEN DATEDIFF(SecondOrderDate, FirstOrderDate) <= 360 THEN 1 ELSE 0 END as RR_360Days_Flag
FROM segmentation_prep o
""")
# Register the result as temp view "segmentation_prep2" for the hashdiff step.
segmentation_prep2_df.createOrReplaceTempView("segmentation_prep2")

### hashdiff

In [None]:
# Adds customermetricshashdiff: the md5 of the listed metric columns joined
# with '||'. Each column is cast to string, trimmed and upper-cased; an empty
# or NULL value is replaced by the placeholder '^^' so that it still occupies
# its position in the concatenation.
# Not part of the hash: MparticleUserKey, GlobalCustomerKey,
# Segmentation_date, period, EffectiveFrom and the RR_*_Flag columns.
# NOTE(review): presumably used for change detection between loads - confirm
# against the consumer of this column.
segmentation_df = spark.sql("""
SELECT *,
       CAST(
         md5(
           concat_ws('||',
				COALESCE(NULLIF(UPPER(TRIM(CAST(Email AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(customertype AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(DerivedCustomerType AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderSource AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderDate AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderCurrency AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(Firstordervalue AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(Firstordervalueusd AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(Firstordervaluegbp AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderPriceType AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderNumberOfItemsGross AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderNumberOfItemsNet AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderItems AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(NewCustomerAcquiredFPorSale AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderCategories AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderStore AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderStoreName AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(ShippingCountryCode AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderSource AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderDate AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderCurrency AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(Secondordervalue AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(Secondordervalueusd AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(Secondordervaluegbp AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderPriceType AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderStore AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderStoreName AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderNumberOfItemsGross AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderNumberOfItemsNet AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderItems AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderCategories AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderSource AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderDate AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderCurrency AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(Lastordervalue AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(Lastordervalueusd AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(Lastordervaluegbp AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderPriceType AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderStore AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderStoreName AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderNumberOfItemsGross AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderNumberOfItemsNet AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderItems AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(TotalordervalueGBP AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(Totalordervalueusd AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(TotalSalesvalueGBP AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(TotalSalesvalueusd AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(TotalSalescount AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(TotalOrdercount AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(totalreturnamountGBP AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(totalreturnamountUSD AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(NumberOfOrdersLast12Months AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(CustomerLifeTimeValueGBP AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(CustomerLifeTimeValueUSD AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(NetLifeTimeValueGBP AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(NetLifeTimeValueUSD AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(AverageOrderValueGBP AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(AverageOrderValueUSD AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(TotalOrderCountRetail AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(firstretailorderdate AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(firstweborderdate AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(PercetageStoreOrders AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(recency AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(frequency AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(ChannelsShopped AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FavouriteCategory AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FavouriteSubCategory AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FavouriteColor AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FavouriteSource AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(TotalDistinctItems AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(NumberOfCategories AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(PercentageDiscountItems AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FavouriteStore AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FavouriteStoreName AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderPromoId AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FirstOrderStyles AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderPromoID AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(SecondOrderStyles AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderPromoID AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(LastOrderStyles AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FavouriteColourType AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FavouriteHero AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(FavouriteLeggingSize AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(lifecycle_stage AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(average_order_items AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(CUSTOMER_FIRST_ORDER_PRICE_TYPE_V2 AS STRING))), ''), '^^'),
				COALESCE(NULLIF(UPPER(TRIM(CAST(CUSTOMER_12M_ROLLING_PRICETYPE AS STRING))), ''), '^^')
           )
         ) AS STRING
       ) AS customermetricshashdiff
FROM segmentation_prep2
""")

### Write result into Gold Delta Table

In [None]:
# Append the segmentation rows to the Delta location in target_path,
# partitioned by period and Segmentation_date.
(
    segmentation_df.write
    .format("delta")
    .mode("append")
    .partitionBy("period", "Segmentation_date")
    .save(target_path)
)

## Exit

In [None]:
# End the notebook run here and hand "0" back as the exit value.
# NOTE(review): the cells below are ad hoc helpers; presumably they are not
# meant to run as part of the pipeline execution - confirm.
mssparkutils.notebook.exit("0")

## Miscellaneous ad hoc code cells

In [None]:
from pyspark.sql.functions import min, max, count

#segmentation_periods_df.select(min("segmentation_date").alias("min_date"), max("segmentation_date").alias("max_date")).show()

# Ad hoc check: number of last_order_df rows per (period, Segmentation_date).
rows_per_partition = last_order_df.groupBy("period", "Segmentation_date").agg(
    count("*").alias("total_count")
)
rows_per_partition.show()

## cust_df
## result_df
## segmentation_periods_df
## order_rank_df
## orders_df
## total_order_df

In [None]:
# Ad hoc check: show the first-order rows of a single customer.
# The filter column is taken from first_order_df itself; the previous version
# referenced last_order_df["MparticleUserId"], a column of a different
# DataFrame than the one being filtered.
df = first_order_df.filter(first_order_df["MparticleUserId"] == "-110968071520021894")
df.show(15, truncate=False)

In [None]:
# Ad hoc check: list the order_rank_df rows of one customer and number them
# (ORDER_SEQ2) by OrderDateTime cast to timestamp, then by OrderId with every
# non-digit character removed.
# NOTE(review): the stripped OrderId is still a string, so the tie-break is
# presumably lexical rather than numeric - confirm if that matters.
df = spark.sql("""
    SELECT *,
            ROW_NUMBER() OVER(PARTITION BY MparticleUserid ORDER BY cast(OrderDateTime as timestamp), regexp_replace(OrderId, '[^0-9]', '')) AS ORDER_SEQ2
    FROM order_rank_df
    WHERE MparticleUserid = '752429565253543102'
""")
df.show(30,truncate=False)

In [None]:
# Ad hoc check: register the final DataFrame as temp view "segmentation_df"
# and list the keys, segmentation date/period and hashdiff of one customer,
# ordered by segmentation_date.
segmentation_df.createOrReplaceTempView("segmentation_df")

df = spark.sql("""
    SELECT MparticleUserKey,
            GlobalCustomerKey,
            Segmentation_date,
            Period,
            customermetricshashdiff
    FROM segmentation_df c
    WHERE c.MparticleUserKey = '2827407266731948222'
    order by segmentation_date
""")
df.show(30,truncate=False)

In [None]:
##segmentation_df.createOrReplaceTempView("segmentation_df")

# Ad hoc check: for one customer, show which customer version
# (ValidFrom/ValidTo) matches each segmentation period, using the same
# customers-to-seg_periods join condition as the final segmentation query.
df = spark.sql("""
    SELECT MparticleUserKey,
            GlobalCustomerKey,
            Segmentation_date,
            Period,
            ValidFrom,
            ValidTo
    FROM customers c
    join seg_periods o  on o.segmentation_date between to_timestamp(c.validfrom) and to_timestamp(c.validto)
    WHERE c.MparticleUserKey = '2827407266731948222'
    order by segmentation_date, ValidFrom
""")
df.show(30,truncate=False)

In [None]:
# Ad hoc check: row count per (Segmentation_date, Period), newest date first.
# NOTE(review): the temp view "segmentation_df" is registered here, but the
# query below reads segmentation_prep2 - confirm which one was intended.
segmentation_df.createOrReplaceTempView("segmentation_df")

df = spark.sql("""
    SELECT 
            Segmentation_date,
            Period,
            count(*)
    FROM segmentation_prep2 c
    group by Segmentation_date,
            Period       
    order by segmentation_date desc
""")
df.show(30,truncate=False)

In [None]:
# Ad hoc check: latest OrderDateTime present in the salesunion view.
df = spark.sql("""
SELECT 
    max(OrderDateTime)
FROM    salesunion
""")
df.show(10, truncate = False)

In [None]:
# Ad hoc check: rolling 12-month price type rows of a single customer.
rolling_pt_for_user = cust_12m_rolling_pricetype_df.filter(
    col("MparticleUSerId") == '-1000354369157909164'
)
rolling_pt_for_user.selectExpr(
    'MparticleUserId',
    'min_OrderDateTime',
    'CUSTOMER_12M_ROLLING_PRICETYPE',
).show(truncate=False)

### delete delta partition

In [None]:
# Destructive ad hoc statement: deletes the rows with period = 'weekly' and
# segmentation_date > '2025-07-01' from the Delta table at the path below.
# NOTE(review): the path names one storage account literally, whereas
# target_path at the top of the notebook builds the same location from
# storage_account - confirm the target before running this anywhere else.
spark.sql("""DELETE FROM delta.`abfss://gold@azwwwnonproddevadapadls.dfs.core.windows.net/CustomerSegmentation/CustomerSegmentationHistory/`
where period = 'weekly' and segmentation_date > '2025-07-01'
""")