# Revision History

In [None]:
# Change_date         revision_number     change_description                           author
# 02/16/2024          1                   initial check-in                             Kranthi

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType,DateType
# Session-level setting: partition overwrite mode is DYNAMIC for this notebook.
# NOTE(review): the snapshot write in this notebook also passes an explicit
# replaceWhere option - confirm which of the two is meant to govern that write.
spark.conf.set("spark.sql.sources.partitionOverwriteMode","DYNAMIC")
from pyspark.sql.window import *
#from pyspark.sql.functions import row_number

In [None]:
%run /utils/common_functions

# define file schema

In [None]:
# Column layout applied when reading the AS400/INVDATA extract, in file order.
# Every field is nullable. DateReceived and SnapshotDate are read as integers
# and are converted to dates later with the 'yyyyMMdd' pattern.
# PurchasingDocumentNumber / PurchasingDocumentItem are declared here and are
# re-derived from PoNbr when the Gold frame is built.
_invdata_fields = [
    ("Warehouse", StringType()),
    ("Plant", StringType()),
    ("CartonNbr", StringType()),
    ("PoNbr", StringType()),
    ("PoLine", StringType()),
    ("Material", StringType()),
    ("Size", StringType()),
    ("Width", StringType()),
    ("UPC", StringType()),
    ("CaseQuantity", DoubleType()),
    ("DateReceived", IntegerType()),
    ("InventoryLockCode", StringType()),
    ("InventoryLockCode2", StringType()),
    ("InventoryLockCode3", StringType()),
    ("InventoryLockCode4", StringType()),
    ("InventoryLockCode5", StringType()),
    ("SnapshotDate", IntegerType()),
    ("PurchasingDocumentNumber", StringType()),
    ("PurchasingDocumentItem", StringType()),
]
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _invdata_fields]
)

# Validate the number of files and date

In [None]:
from datetime import datetime

class InvalidInputError(Exception):
    """Raised when the landing folder fails the file-count / file-date check."""

# Pre-load validation of the landing folder.
# Processing may continue only when exactly four non-empty entries are present
# and every one of them carries today's date in the token before the first
# underscore of its name (yyyymmdd). Anything else raises InvalidInputError.
# NOTE(review): datetime.today() is the driver's naive local date - confirm it
# matches the calendar used upstream when the file names are stamped.
today_dt = datetime.today().strftime('%Y-%m-%d')
print("today_dt::",today_dt)

# Entries with size 0 are skipped (this is how the archive folder is ignored).
# strptime raises ValueError if a name does not start with a valid yyyymmdd.
file_dates = [
    datetime.strptime(entry.name.split('_')[0], '%Y%m%d').strftime('%Y-%m-%d')
    for entry in mssparkutils.fs.ls(f'{raw_adls_path}AS400/INVDATA')
    if entry.size > 0
]
file_cnt = len(file_dates)
distinct_dates = set(file_dates)
print('cnt::',file_cnt, 'file_date::',file_dates)
print("set to string date::",''.join(distinct_dates)) ## convert set to string

# The distinct dates are compared as a set rather than as a joined string:
# the check passes only when the single distinct date is today's.
if file_cnt != 4 or distinct_dates != {today_dt}:
    failure = InvalidInputError("Incorrect date or # of files")
    print("Error::", str(failure))
    raise failure
print('count is 4 and all dates belong to Today - continue processing')

# Move the data to Gold layer - original code

In [None]:
# Read the INVDATA landing folder as headerless CSV using the explicit schema.
df_raw = (
    spark.read.format('csv')
    .option("header", "false")
    .schema(schema)
    .load(f"{raw_adls_path}AS400/INVDATA/")
)

# Build the Gold frame:
#   - rows without a SnapshotDate are dropped;
#   - PurchasingDocumentNumber / PurchasingDocumentItem are cut out of the
#     PoNbr column (characters 1-10 and 11-15 respectively);
#   - Material is trimmed;
#   - DateReceived / SnapshotDate are converted from yyyyMMdd integers to dates.
# The two substring expressions reference the PoNbr *column*. They are written
# as single literals so they cannot be mistaken for a quoted 'PoNbr' string.
df = df_raw.filter('SnapshotDate is not null').selectExpr(
    "Warehouse",
    "Plant",
    "CartonNbr",
    "PoNbr",
    "substring(PoNbr,1,10) as PurchasingDocumentNumber",
    "substring(PoNbr,11,5) as PurchasingDocumentItem",
    "PoLine",
    "trim(Material) as Material",
    "Size",
    "Width",
    "UPC",
    "CaseQuantity",
    "to_date(cast(DateReceived as string),'yyyyMMdd') as DateReceived",
    "InventoryLockCode",
    "InventoryLockCode2",
    "InventoryLockCode3",
    "InventoryLockCode4",
    "InventoryLockCode5",
    "to_date(cast(SnapshotDate as string),'yyyyMMdd') as SnapshotDate",
)

# Write the snapshot to the Gold Delta table, partitioned by SnapshotDate.
# The overwrite carries a replaceWhere predicate on today's SnapshotDate, and
# mergeSchema is enabled for the write.
snapshot_writer = (
    df.repartition('SnapshotDate')
    .write.format("delta")
    .mode("overwrite")
    .option("path", f"{gold_adls_path}AS400/INVDATA/")
    .option("replaceWhere", f"SnapshotDate='{today_dt}'")
    .option("mergeSchema", "true")
    .partitionBy('SnapshotDate')
)
snapshot_writer.saveAsTable('lakedb_gold.pfas_snapshotdata')
    
# After the load, move every non-empty entry of the landing folder into its
# archive sub-folder, overwriting a same-named file that is already there.
if file_cnt == 4:
    landing_dir = f"{raw_adls_path}AS400/INVDATA"
    for entry in mssparkutils.fs.ls(landing_dir):
        if entry.size <= 0:
            # zero-size entries are left where they are
            continue
        print('moving ', entry.name, ' to archive')
        mssparkutils.fs.mv(
            f"{landing_dir}/{entry.name}",
            f"{landing_dir}/archive/{entry.name}",
            overwrite=True,
        )

# Move the material lookup and material relabel files to Gold

In [None]:
# Columns of PFAS_Materials_Flag.csv, in file order.
# Every column is read as a nullable string.
_lkp_column_names = [
    "WWWSeason",
    "Brand",
    "VendorNumber",
    "Vendor",
    "Pattern",
    "Material",
    "Pairs",
    "CurrentXFDate",
    "SAPPO",
    "SAPPOItem",
    "CorporateRegion",
    "PriorToF23orders",
    "PriorToF24Gore",
    "YKKZippersUsed",
    "PriorToS25BOA",
    "Yokota1stOrdersinBD",
    "AgentProduct",
    "PFASUNDER50PPMShipment",
    "VibranOutsoleUsed",
    "PFASUNDER20PPMShipment",
    "PFASUNDER20to50PPMShipment",
    "WpOrGoreTexCertified",
    "NonZSKUs",
    "PocGtnReceiptDate",
    "FactoryGroup",
]
lkp_up_schema = StructType(
    [StructField(col_name, StringType(), True) for col_name in _lkp_column_names]
)

# Load the PFAS material flag lookup (CSV with a header row) using the
# explicit all-string schema, show it, and overwrite the Gold lookup table.
# The file is read once; the frame is not re-created after the write.
df_lkp = (
    spark.read.format('csv')
    .option("header", "true")
    .schema(lkp_up_schema)
    .load(f"{raw_adls_path}AS400/PFAS_Materials_Flag.csv")
)

display(df_lkp)

(
    df_lkp.write.format("delta")
    .mode("overwrite")
    .option("path", f"{gold_adls_path}AS400/po_lkp")
    .option("mergeSchema", "true")
    .saveAsTable('lakedb_gold.pfas_po_lkp')
)

# Columns of PFASMaterialRelabel.csv, in file order.
# Every column is read as a nullable string.
_relabel_column_names = [
    "Brand",
    "Material",
    "IsRelabelingRequired",
    "NewMaterialCreated",
    "NewMaterialNumber",
    "Status",
]
relabel_schema = StructType(
    [StructField(col_name, StringType(), True) for col_name in _relabel_column_names]
)

# Load the material relabel file (CSV with a header row) using the explicit
# all-string schema, show it, and overwrite the Gold relabel table.
df_relabel = (
    spark.read.format('csv')
    .option("header", "true")
    .schema(relabel_schema)
    .load(f"{raw_adls_path}AS400/PFASMaterialRelabel.csv")
)

display(df_relabel)

relabel_writer = (
    df_relabel.write.format("delta")
    .mode("overwrite")
    .option("path", f"{gold_adls_path}AS400/material_relabel")
    .option("mergeSchema", "true")
)
relabel_writer.saveAsTable('lakedb_gold.pfas_material_relabel')
     


# Validate the end result

In [None]:
%%sql
-- Post-load spot check: return every row of the Gold snapshot table.
-- The drop statement and the date filter below are intentionally left disabled.
--drop table lakedb_gold.pfas_snapshotdata;
select * from lakedb_gold.pfas_snapshotdata;
-- where snapshotdate = date_format(CURRENT_DATE,'yyyy-MM-dd')

# Check the raw data

In [None]:
# Create (only if missing) a CSV-backed table located at the INVDATA archive
# folder, so the archived extracts can be queried with SQL.
archive_location = f"{raw_adls_path}AS400/INVDATA/archive"
spark.sql(
    f"create table if not exists raw.pfs_raw_data USING CSV LOCATION '{archive_location}'"
)


In [None]:
%%sql
-- Return every row of raw.pfs_raw_data, the CSV table created over the INVDATA archive folder.
select * from raw.pfs_raw_data;