In [0]:
# List the contents of the silver-layer SalesLT directory; the returned
# FileInfo entries are shown as the cell output.
dbutils.fs.ls('mnt/silver/SalesLT/')

[FileInfo(path='dbfs:/mnt/silver/SalesLT/Address/', name='Address/', size=0, modificationTime=1735035863000),
 FileInfo(path='dbfs:/mnt/silver/SalesLT/Customer/', name='Customer/', size=0, modificationTime=1735035873000),
 FileInfo(path='dbfs:/mnt/silver/SalesLT/CustomerAddress/', name='CustomerAddress/', size=0, modificationTime=1735035876000),
 FileInfo(path='dbfs:/mnt/silver/SalesLT/Product/', name='Product/', size=0, modificationTime=1735035878000),
 FileInfo(path='dbfs:/mnt/silver/SalesLT/ProductCategory/', name='ProductCategory/', size=0, modificationTime=1735035881000),
 FileInfo(path='dbfs:/mnt/silver/SalesLT/ProductDescription/', name='ProductDescription/', size=0, modificationTime=1735035883000),
 FileInfo(path='dbfs:/mnt/silver/SalesLT/ProductModel/', name='ProductModel/', size=0, modificationTime=1735035885000),
 FileInfo(path='dbfs:/mnt/silver/SalesLT/ProductModelProductDescription/', name='ProductModelProductDescription/', size=0, modificationTime=1735035887000),
 FileInf

In [0]:
# Read the silver-layer Address table (Delta format) into a DataFrame.
df_address = spark.read.format('delta').load('dbfs:/mnt/silver/SalesLT/Address/')
# Preview only the first 10 rows instead of rendering the whole table.
df_address.limit(10).display()

AddressID,AddressLine1,AddressLine2,City,StateProvince,CountryRegion,PostalCode,rowguid,ModifiedDate
9,8713 Yosemite Ct.,,Bothell,Washington,United States,98011,268af621-76d7-4c78-9441-144fd139821a,2006-07-01
11,1318 Lasalle Street,,Bothell,Washington,United States,98011,981b3303-aca2-49c7-9a96-fb670785b269,2007-04-01
25,9178 Jumping St.,,Dallas,Texas,United States,75201,c8df3bd9-48f0-4654-a8dd-14a67a84d3c6,2006-09-01
28,9228 Via Del Sol,,Phoenix,Arizona,United States,85004,12ae5ee1-fc3e-468b-9b92-3b970b169774,2005-09-01
32,26910 Indela Road,,Montreal,Quebec,Canada,H1Y 2H5,84a95f62-3ae8-4e7e-bbd5-5a6f00cd982d,2006-08-01
185,2681 Eagle Peak,,Bellevue,Washington,United States,98004,7bccf442-2268-46cc-8472-14c44c14e98c,2006-09-01
297,7943 Walnut Ave,,Renton,Washington,United States,98055,52410da4-2778-4b1d-a599-95746625ce6d,2006-08-01
445,6388 Lake City Way,,Burnaby,British Columbia,Canada,V5A 3A6,53572f25-9133-4a8b-a065-102ff35416ee,2006-09-01
446,52560 Free Street,,Toronto,Ontario,Canada,M4B 1V7,801a1dfc-5125-486b-aa84-ccbd2ec57ca4,2005-08-01
447,22580 Free Street,,Toronto,Ontario,Canada,M4B 1V7,88cee379-dbb8-433b-b84e-a35e09435500,2006-08-01


### Let's create a function to change column names from PascalCase/camelCase to snake_case

In [0]:
from pyspark.sql.functions import col

def _to_snake_case(name):
    """Convert one PascalCase/camelCase identifier to snake_case."""
    pieces = []
    for idx, char in enumerate(name):
        starts_new_word = (
            char.isupper()
            and idx > 0                         # never prefix the first character
            and not name[idx - 1].isupper()     # keep runs of capitals ("ID") together
            and name[idx - 1] != "_"            # don't double an existing separator
        )
        if starts_new_word:
            pieces.append("_")
        pieces.append(char.lower())
    # Leading underscores already present in the source name are dropped.
    return "".join(pieces).lstrip("_")


def rename_columns_to_snake_case(df):
    """
    Convert column names from PascalCase or camelCase to snake_case in a PySpark DataFrame.

    An underscore is inserted before every uppercase letter that follows a
    non-uppercase, non-underscore character, and the whole name is lowercased.
    Consecutive uppercase letters are treated as one word, so "ProductID"
    becomes "product_id" and "HTTPResponse" becomes "httpresponse". Names that
    already contain underscores are not given a second one ("Order_ID" becomes
    "order_id"), and leading underscores are removed. Only top-level column
    names are considered.

    Args:
        df (DataFrame): The input DataFrame with columns to be renamed.

    Returns:
        DataFrame: A DataFrame with column names converted to snake_case. If
        every name is already in its converted form, the input is returned
        unchanged.

    Raises:
        ValueError: If two columns would end up with the same name.
    """
    old_names = list(df.columns)

    new_names = []
    seen = set()  # set membership keeps the duplicate check linear
    for old_col_name in old_names:
        new_col_name = _to_snake_case(old_col_name)

        # Avoid renaming to a name another column has already claimed
        if new_col_name in seen:
            raise ValueError(f"Duplicate column name found after renaming: '{new_col_name}'")

        seen.add(new_col_name)
        new_names.append(new_col_name)

    # Nothing to rename: skip adding a pointless projection to the plan
    if new_names == old_names:
        return df

    # Rename every column positionally in a single projection rather than
    # stacking one withColumnRenamed call per column.
    return df.toDF(*new_names)

In [0]:
# Apply the renaming helper to the Address DataFrame and preview the first
# 10 rows to confirm the column headers are now snake_case.
df_address = rename_columns_to_snake_case(df_address)
df_address.limit(10).display()

address_id,address_line1,address_line2,city,state_province,country_region,postal_code,rowguid,modified_date
9,8713 Yosemite Ct.,,Bothell,Washington,United States,98011,268af621-76d7-4c78-9441-144fd139821a,2006-07-01
11,1318 Lasalle Street,,Bothell,Washington,United States,98011,981b3303-aca2-49c7-9a96-fb670785b269,2007-04-01
25,9178 Jumping St.,,Dallas,Texas,United States,75201,c8df3bd9-48f0-4654-a8dd-14a67a84d3c6,2006-09-01
28,9228 Via Del Sol,,Phoenix,Arizona,United States,85004,12ae5ee1-fc3e-468b-9b92-3b970b169774,2005-09-01
32,26910 Indela Road,,Montreal,Quebec,Canada,H1Y 2H5,84a95f62-3ae8-4e7e-bbd5-5a6f00cd982d,2006-08-01
185,2681 Eagle Peak,,Bellevue,Washington,United States,98004,7bccf442-2268-46cc-8472-14c44c14e98c,2006-09-01
297,7943 Walnut Ave,,Renton,Washington,United States,98055,52410da4-2778-4b1d-a599-95746625ce6d,2006-08-01
445,6388 Lake City Way,,Burnaby,British Columbia,Canada,V5A 3A6,53572f25-9133-4a8b-a065-102ff35416ee,2006-09-01
446,52560 Free Street,,Toronto,Ontario,Canada,M4B 1V7,801a1dfc-5125-486b-aa84-ccbd2ec57ca4,2005-08-01
447,22580 Free Street,,Toronto,Ontario,Canada,M4B 1V7,88cee379-dbb8-433b-b84e-a35e09435500,2006-08-01


### Applying this transformation to all tables

In [0]:
# Collect the table names from the silver-layer directory listing.
# Each entry name ends with '/' (e.g. 'Address/'), so keep only the part
# before the first slash.
table_name = [
    entry.name.split('/')[0]
    for entry in dbutils.fs.ls('mnt/silver/SalesLT')
]

table_name

['Address',
 'Customer',
 'CustomerAddress',
 'Product',
 'ProductCategory',
 'ProductDescription',
 'ProductModel',
 'ProductModelProductDescription',
 'SalesOrderDetail',
 'SalesOrderHeader']

In [0]:
# Copy every silver-layer table to the gold layer with snake_case column names.
for table in table_name:
    print(f"Renaming columns in {table}")

    # Source Delta table in the silver layer
    path = f"dbfs:/mnt/silver/SalesLT/{table}/"
    # Destination Delta table in the gold layer
    output_path = f"/mnt/gold/SalesLT/{table}/"

    # Load the table and rename its columns in one step
    df = rename_columns_to_snake_case(spark.read.format('delta').load(path))

    # 'overwrite' mode replaces any data already present at the destination
    df.write.format('delta').mode('overwrite').save(output_path)

Renaming columns in Address
Renaming columns in Customer
Renaming columns in CustomerAddress
Renaming columns in Product
Renaming columns in ProductCategory
Renaming columns in ProductDescription
Renaming columns in ProductModel
Renaming columns in ProductModelProductDescription
Renaming columns in SalesOrderDetail
Renaming columns in SalesOrderHeader
