In [None]:
# Change_date         revision_number     change_description                           author
# 09/01/2023           1                   initial check-in                             Kranthi/Chad
# 01/10/2024           2                   run optimize,vacuum for all folders          Kranthi
# 01/15-22/2024        3                   Optimize code                                Chad
# 04/29/2024           4                   Add bronze table                             Chad
# 14Feb2025     KETTNECH Save output to bronze versus raw container
# 2025-06-17 KETTNECH Add more folders/paths and trimmed text output drastically

In [None]:
import concurrent.futures
import threading

from delta import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
from pyspark.sql.functions import *

# Session-level Spark settings used by this notebook:
#  - disable Delta's retention-duration safety check for VACUUM
#  - write Parquet datetimes with the CORRECTED rebase mode
spark_session_settings = {
    "spark.databricks.delta.retentionDurationCheck.enabled": "false",
    "spark.sql.parquet.datetimeRebaseModeInWrite": "CORRECTED",
}
for conf_key, conf_value in spark_session_settings.items():
    spark.conf.set(conf_key, conf_value)

In [None]:
%run /utils/common_functions

In [None]:
# Maps a Delta table's folder name to the column used in its ZORDER BY clause.
# Folders that are not listed here are optimized without Z-ordering.
zorder_dict = {
    'AccountsReceivable': 'documentdate',
    'InventoryAging': 'calendarday',
    'InTransitsbyFicalPeriod': 'calendarday',
    'plmProjectsCurrent': 'calendarday',
    'RetailInventoryMovements': 'calendarday',
}

In [None]:
# Schema of the Delta inventory: one row per table processed by optimizeDeltaTable.
# Every column is nullable.
inventory_columns = [
    ('id', StringType()),
    ('location', StringType()),
    ('table', StringType()),
    ('createdAt', StringType()),
    ('lastModified', StringType()),
    ('numFiles', IntegerType()),
    ('partitionedBy', StringType()),
    ('sizeInBytes', LongType()),
]
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in inventory_columns
])

# Start from an empty DataFrame; rows are appended as tables are processed.
deltas_df = spark.createDataFrame([], schema)

In [None]:
# Optimize (compact many small files into fewer large ones) and vacuum a Delta table.

# optimizeDeltaTable is run on several threads at once; the read-union-reassign
# of the shared deltas_df is not atomic, so it is serialized with this lock.
_deltas_df_lock = threading.Lock()


def optimizeDeltaTable(delta_table_path, duration=24):
  """Optimize and vacuum one Delta table and record its details in deltas_df.

  Paths that are not Delta tables are skipped silently. Tables with 0 or 1
  files are not optimized but are still vacuumed. Tables whose folder name
  appears in zorder_dict are optimized with ZORDER BY on the mapped column.

  Args:
    delta_table_path: Storage path of the candidate Delta table.
    duration: VACUUM retention in hours. Any falsy value (None, 0) falls back
      to 24.

  Returns:
    None. On success a summary is printed and one row is appended to the
    global deltas_df. If OPTIMIZE/VACUUM/DESCRIBE raises, the error is printed
    and no row is appended.
  """
  global deltas_df

  if not DeltaTable.isDeltaTable(spark, delta_table_path):
    return

  # Apply the retention default up front so it also covers tables that skip
  # the optimize step; otherwise a falsy value would reach the VACUUM statement.
  if not duration:
    duration = 24

  # Strip a trailing slash so the last path segment is the folder name, not ''.
  folder_name = delta_table_path.rstrip('/').split('/')[-1]
  p_zorder = zorder_dict.get(folder_name, '')

  output_str = delta_table_path
  try:
    df = spark.sql(f'DESCRIBE DETAIL delta.`{delta_table_path}`')
    # DESCRIBE DETAIL yields a single row; collect it once and reuse it.
    detail = df.first()
    num_files = detail['numFiles']

    if num_files == 0:
      output_str = output_str+' .. no files, no need to optimize.'
    elif num_files == 1:
      output_str = output_str+' .. only 1 file, no need to optimize.'
    else:
      # showString(20, 0, True) renders the row vertically without truncation.
      output_str2 = ' .. optimizing details:\n'+df.select('partitionColumns','numFiles','sizeInBytes')._jdf.showString(20, 0, True)

      if p_zorder:
        optimize_cmd = f'optimize delta.`{delta_table_path}` ZORDER BY ({p_zorder})'
      else:
        optimize_cmd = f'optimize delta.`{delta_table_path}`'

      after_optimze_stats = spark.sql(optimize_cmd)
      if after_optimze_stats.select('metrics.numFilesAdded').first()[0] > 0:
        output_str = output_str+output_str2+after_optimze_stats.select('metrics.numFilesAdded','metrics.numFilesRemoved',
                                'metrics.partitionsOptimized','metrics.zOrderStats')._jdf.showString(20, 0, True)

        # Re-read the details so the printed summary and the inventory row
        # reflect the table after compaction.
        df = spark.sql(f'DESCRIBE DETAIL delta.`{delta_table_path}`')
        detail = df.first()
        output_str = output_str+df.select('numFiles','sizeInBytes')._jdf.showString(20, 0, True)
      else:
        output_str = output_str+' .. no changes after optimization.'

    spark.sql(f'VACUUM delta.`{delta_table_path}` RETAIN {duration} HOURS')

    print(output_str)
  except Exception as e:
    # Best effort: report the failure and move on to the next table.
    print('Other exception::',delta_table_path,str(e))
    return

  # Add latest details to deltas_df
  newRow = spark.createDataFrame([(
    detail['id'],
    detail['location'],
    folder_name,
    str(detail['createdAt']),
    str(detail['lastModified']),
    detail['numFiles'],
    detail['partitionColumns'],
    detail['sizeInBytes'],
  )], schema)
  with _deltas_df_lock:
    deltas_df = deltas_df.union(newRow)

In [None]:
def _list_subfolders(base_path, relative_dirs):
    """Return the paths of the zero-size entries found directly under each directory.

    Each directory is listed as f"{base_path}{relative_dir}", in the order given.
    Only entries whose reported size is 0 are kept (presumably folders; files
    with content are dropped).
    """
    subfolder_paths = []
    for relative_dir in relative_dirs:
        for entry in mssparkutils.fs.ls(f"{base_path}{relative_dir}"):
            if entry.size == 0:
                subfolder_paths.append(entry.path)
    return subfolder_paths


# Directories (relative to each container root) whose immediate children are
# candidate Delta tables. An empty string means the container root itself.
gold_dirs = [
    "", "AS400", "b2b2c", "cco", "GA4", "mParticle",
    "SAP/BW", "TransportationInsights", "vi3",
]
silver_dirs = [
    "EMWBIS", "mParticle",
    "NewStore/merrell", "NewStore/saucony", "NewStore/sweatybetty", "NewStore/wolverineusa",
    "SAP/AFS", "SAP/BW", "StoreTech",
]
bronze_dirs = [
    "",
    "NewStore/merrell", "NewStore/saucony", "NewStore/sweatybetty", "NewStore/wolverineusa",
    "SAP/AFS", "SAP/BW", "SAP/Retail",
    "StoreTech", "TransportationInsights", "vi3",
]
raw_dirs = ["SAP/BW/"]

folder_path_list_gold = _list_subfolders(gold_adls_path, gold_dirs)
folder_path_list_silver = _list_subfolders(silver_adls_path, silver_dirs)
folder_path_list_bronze = _list_subfolders(bronze_adls_path, bronze_dirs)
folder_path_list_raw = _list_subfolders(raw_adls_path, raw_dirs)

folder_path_list = (
    folder_path_list_gold
    + folder_path_list_silver
    + folder_path_list_bronze
    + folder_path_list_raw
)

# Process the candidate folders on a pool of 8 worker threads; list() drains the
# iterator so any exception raised by a worker surfaces here.
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    results = list(executor.map(optimizeDeltaTable, folder_path_list))

In [None]:
# Persist the collected inventory as a Delta table in the bronze container,
# replacing both the data and the schema of any previous snapshot.
delta_inventory_path = f'{bronze_adls_path}Synapse/DeltaInventory'
(
    deltas_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(delta_inventory_path)
)