## 1. Imports and Environment

In [0]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.functions import col, concat_ws
from functools import reduce

# Root abfss:// URI of the shared data container; the table paths in this notebook are built from it.
# NOTE(review): the storage account name suggests a production environment — confirm before
# pointing any write at this root.
env_path = "abfss://d3-shared-data@cdlprdadl2weu.dfs.core.windows.net"

## 2. Loading the Data

In [0]:
# Source locations under the shared container.
# PRODUCT is stored as a Parquet dataset.
product_path = f"{env_path}/30_datamart_D3/PDN/NACR/PRODUCT/PRODUCT.parquet"
# PRODUCT_REF_MAIN is stored as a Delta table.
product_ref_main_path = f"{env_path}/30_datamart_D3/PDN/PRODUCT_REF/PRODUCT_REF_MAIN"

In [0]:
# Read both PDN sources: PRODUCT_REF_MAIN (Delta) and PRODUCT (Parquet)
product_main_df = spark.read.load(product_ref_main_path, format="delta")
product_df = spark.read.parquet(product_path)

## 3. Comparing the Tables

In [0]:
# Attribute columns carried from PRODUCT into the result. Listed once so the initial
# projection and the final output cannot drift apart.
product_attr_cols = [
    "INVENTORY_IND", "BUSINESS_CATEG_CODE", "BUSINESS_LINE_CODE",
    "COMPETITOR_FLAG", "PRODUCT_CATEG_CODE", "PRODUCT_TYPE",
    "RETREAD_FLAG", "GL_PRODUCT_CODE",
]

# Project PRODUCT down to the key plus attributes; rows with a null MSPN_NBR are dropped
# because they cannot be compared by key.
product_sel = product_df.select("MSPN_NBR", *product_attr_cols).dropna(subset=["MSPN_NBR"])

# Non-null keys present in PRODUCT_REF_MAIN
product_main_keys = product_main_df.select("MSPN").dropna()

# Records in PRODUCT whose MSPN_NBR has no equal MSPN in PRODUCT_REF_MAIN.
# A left_anti join returns only the left-hand columns of the unmatched rows.
missing_known_df = product_sel.join(
    product_main_keys,
    product_sel["MSPN_NBR"] == product_main_keys["MSPN"],
    how="left_anti",
)

# No "enrichment" join back to PRODUCT_REF_MAIN is performed here: every row that survives
# the anti join has, by construction, no matching MSPN in that table, so such a join could
# only ever add all-null columns (and none of them were part of the output anyway).

# Final result: key renamed to MSPN, followed by the PRODUCT attribute columns
missing_known_final = missing_known_df.select(
    col("MSPN_NBR").alias("MSPN"),
    *product_attr_cols,
)


In [0]:
# Disabled one-off row count, preview and Spark CSV export, kept for reference.
# NOTE(review): if the write below is re-enabled as-is it will fail when the target path
# already exists, because DataFrameWriter defaults to save mode "errorifexists"; add
# .mode("overwrite") if a re-run is meant to replace an earlier export.
# missing_known_final: records in PRODUCT not in PRODUCT_REF_MAIN (expected to be missing)
# missing_count = missing_known_final.count()
# print(f"Number of missing records: {missing_count}")
# display(missing_known_final)
# missing_known_final.write.csv("/mnt/data/missing_known.csv", header=True)

In [0]:
# Inspect the Spark CSV export location; the path is a directory holding the part file(s)
# and commit markers, not a single CSV file.
missing_known_csv_dir = "dbfs:/mnt/data/missing_known.csv"
display(dbutils.fs.ls(missing_known_csv_dir))

path,name,size,modificationTime
dbfs:/mnt/data/missing_known.csv/_SUCCESS,_SUCCESS,0,1748980678000
dbfs:/mnt/data/missing_known.csv/_committed_77592665756723204,_committed_77592665756723204,112,1748980678000
dbfs:/mnt/data/missing_known.csv/_started_77592665756723204,_started_77592665756723204,0,1748980677000
dbfs:/mnt/data/missing_known.csv/part-00000-tid-77592665756723204-b7c5fee5-0fb5-4455-ae7b-9a2313f9dc67-7555-1-c000.csv,part-00000-tid-77592665756723204-b7c5fee5-0fb5-4455-ae7b-9a2313f9dc67-7555-1-c000.csv,19393701,1748980678000


In [0]:
# Export the result as one plain CSV file under FileStore.
# toPandas() collects every row onto the driver, so this only suits results that fit in
# driver memory.
missing_known_pdf = missing_known_final.toPandas()
missing_known_pdf.to_csv("/dbfs/FileStore/missing_known.csv", index=False)