# Imports

In [10]:
#Read the GL account list CSV from raw, conform it to a Delta table in bronze, then write it to silver
import delta #from delta import *
import pyspark.sql.functions  #from pyspark.sql.functions import *
import pyspark.sql.types #from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType, LongType
import datetime
import requests, json
import azure.storage.blob #from azure.storage.blob import BlobServiceClient
import notebookutils
from pyspark.sql import SparkSession
from py4j.java_gateway import JavaObject

# Includes

In [11]:
%run /utils/environment

# Bronze - Conform to Delta

In [12]:
# Load the raw GL account-list CSV into a DataFrame
file_path = f'{raw_adls_path}/ExternalData/GL/accountlist1.csv'
print(file_path)

# All reader settings are passed in one place: the first row is the header,
# column types are inferred, and the double quote is used both as the quote
# character and as the escape character.
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
)

# Quick sanity check of what was loaded
df.printSchema()
df.show(5)

# Cleaning Column Names

In [13]:
import re


def clean_column_names(columns):
    """Return sanitized, unique column names for ``columns``.

    Each name is processed in order:
      * spaces become underscores, then every character that is not a letter,
        digit or underscore is removed;
      * a name that starts with a digit is prefixed with ``col_``;
      * a name that ends up empty becomes ``column_<position>``;
      * a name that collides (ignoring case) with an earlier result gets a
        numeric suffix (``_1``, ``_2``, ...), so the returned list never
        contains duplicates.

    Args:
        columns: iterable of original column-name strings.

    Returns:
        list[str]: cleaned names, same length and order as ``columns``.
    """
    cleaned = []
    taken = set()  # lower-cased names already handed out
    for position, raw_name in enumerate(columns):
        # Replace spaces with underscores and remove special characters
        name = re.sub(r'[^\w]', '', raw_name.replace(' ', '_'))

        # Ensure the name doesn't start with a number (add prefix if needed)
        if name and name[0].isdigit():
            name = 'col_' + name

        # Handle empty string case
        if not name:
            name = 'column_' + str(position)

        # Stripping characters can make two different headers identical
        # (e.g. "P&L" and "PL"); keep the first and suffix later ones.
        candidate = name
        suffix = 1
        while candidate.lower() in taken:
            candidate = f'{name}_{suffix}'
            suffix += 1

        taken.add(candidate.lower())
        cleaned.append(candidate)
    return cleaned


# Get the current column names and compute their cleaned counterparts
old_columns = df.columns
new_columns = clean_column_names(old_columns)

# Rename the columns in the dataframe
df = df.toDF(*new_columns)

# Print the before and after column names for verification
print("Original columns:", old_columns)
print("Cleaned columns:", new_columns)

df.printSchema()

# Writing data to delta table and saving in Bronze

In [14]:
# Persist the cleaned DataFrame to the bronze layer as a Delta table
bronze_delta_table_path = f'{bronze_adls_path}/ExternalData/GL/AccountList'
print(bronze_delta_table_path)

# Every run replaces the table contents; overwriteSchema is set so the stored
# schema is replaced along with the data.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(bronze_delta_table_path)
)

In [15]:
# Prints the DataFrame object itself (its repr), not its rows; df.show() is
# what displays data.
print(df)

# Silver Processing

In [16]:
# Column helper import (not needed by the pass-through below)
from pyspark.sql.functions import col

# Load the bronze Delta table from the same location the bronze write used
bronze_path = f"{bronze_adls_path}/ExternalData/GL/AccountList"
bronze_df = spark.read.load(bronze_path, format="delta")

# Show the bronze schema for reference
print("Bronze schema:")
bronze_df.printSchema()

# Silver is currently a straight pass-through of bronze: no columns are cast
# or dropped here (an earlier variant was: bronze_df.drop("_c8")).
silver_df = bronze_df

# Show the silver schema (identical to bronze while no transform is applied)
print("Silver schema:")
silver_df.printSchema()

# Target location of the silver Delta table, used by the write cell below
silver_path = f"{silver_adls_path}/ExternalData/GL/AccountList"


# Writing to Silver

In [17]:

# Persist silver_df to the silver layer as a Delta table, replacing both the
# existing data and the stored schema.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(silver_path)
)

print(f"Silver Delta table successfully written to: {silver_path}")

In [18]:
# End the notebook run here with exit value 0.
# NOTE(review): the cells below look like ad-hoc setup/inspection helpers that
# are presumably meant to be run by hand only — confirm.
notebookutils.mssparkutils.notebook.exit(0)

In [None]:
# Unused draft of an explicit schema for the account-list CSV, kept for
# reference only; the CSV read in this notebook uses inferSchema instead.
# Field names here are the pre-cleaning spellings: the column-cleaning cell
# strips characters such as '&', so e.g. 'WWW_P&L' would become 'WWW_PL'.
# schema = pyspark.sql.types.StructType([
#     pyspark.sql.types.StructField('Key', pyspark.sql.types.StringType(), True),
#     pyspark.sql.types.StructField('Global_GL_Account', pyspark.sql.types.StringType(), True),
#     pyspark.sql.types.StructField('WWW_P&L', pyspark.sql.types.StringType(), True),
#     pyspark.sql.types.StructField('CorporateSortOrder', pyspark.sql.types.StringType(), True),
#     pyspark.sql.types.StructField('CorporateFormat_String', pyspark.sql.types.LongType(), True),
#     pyspark.sql.types.StructField('Retail_P&L', pyspark.sql.types.LongType(), True),
#     pyspark.sql.types.StructField('RetailFormatString', pyspark.sql.types.StringType(), True),
#     pyspark.sql.types.StructField('RetailSortOrder', pyspark.sql.types.LongType(), True),
#     pyspark.sql.types.StructField('Sweaty_Betty_P&L', pyspark.sql.types.StringType(), True),
#     pyspark.sql.types.StructField('SweatyBettyFormat', pyspark.sql.types.StringType(), True),
#     pyspark.sql.types.StructField('SweatyBettyOrder', pyspark.sql.types.LongType(), True),
#     pyspark.sql.types.StructField('Other_SB_P&L', pyspark.sql.types.LongType(), True),
#     pyspark.sql.types.StructField('OtherSPFormat', pyspark.sql.types.StringType(), True),
#     pyspark.sql.types.StructField('OtherSPOrder', pyspark.sql.types.LongType(), True)
# ])

In [None]:
from pyspark.sql.functions import col

# Spot-check a few account keys in the silver data, showing columns 1, 4 and 8.
# The column names are taken from silver_df itself (not from `df`, which a
# later cell rebinds to a different DataFrame), so the filter and the
# selection always refer to the frame being queried.
key_column, fourth_column, eighth_column = (silver_df.columns[i] for i in (0, 3, 7))

silver_df.filter(col(key_column).isin('0000700276_B', '0000700276_D', '0000700276_V', '0000700276_W')) \
  .select(key_column, fourth_column, eighth_column) \
  .show()


# Check for folder

In [None]:
# Function to tell us if a folder exists
def folder_exists(abfss_path):
    """Report whether ``abfss_path`` can be listed.

    Args:
        abfss_path: storage path to probe (passed straight to
            ``mssparkutils.fs.ls``).

    Returns:
        bool: True if listing the path succeeds, False if listing raises an
        ordinary error (any ``Exception``).

    KeyboardInterrupt and SystemExit are not treated as "folder missing";
    they propagate so the notebook can still be interrupted or stopped.
    """
    from notebookutils import mssparkutils
    try:
        mssparkutils.fs.ls(abfss_path)
    except Exception:
        # Any listing failure is reported as "does not exist"
        return False
    return True

# Create folders

In [None]:
#Creating a folder:
# Goes through the notebookutils module imported at the top of the notebook
# (as the notebook-exit cell does) instead of relying on a bare
# `mssparkutils` name being available in the session.
notebookutils.mssparkutils.fs.mkdirs(f'{raw_adls_path}/ExternalData/GL')

In [None]:
#Creating remaining folders for medallion architecture for Dev:
# One mkdirs call per layer root; uses the imported notebookutils module
# rather than a bare `mssparkutils` name.
for layer_root in (bronze_adls_path, silver_adls_path, gold_adls_path):
    notebookutils.mssparkutils.fs.mkdirs(f'{layer_root}/ExternalData/GL')
#silver and gold do not have ExternalData folders

In [None]:
#Creating raw and bronze GL folders in ExternalData for Test
# NOTE(review): the Test storage account is hard-coded here rather than taken
# from the environment notebook — confirm this is intentional.
for container in ('raw', 'bronze'):
    notebookutils.mssparkutils.fs.mkdirs(
        f'abfss://{container}@azwwwnonprodtestadapadls.dfs.core.windows.net/ExternalData/GL'
    )



In [None]:
#Creating raw and bronze GL folders in ExternalData for Prod
# NOTE(review): the Prod storage account is hard-coded here rather than taken
# from the environment notebook — confirm this is intentional.
for container in ('raw', 'bronze'):
    notebookutils.mssparkutils.fs.mkdirs(
        f'abfss://{container}@azwwwprodprdadapadls.dfs.core.windows.net/ExternalData/GL'
    )

In [None]:
# Load the silver AccountList Delta table and expose it to SQL as the
# temporary view "delta_table".
# NOTE(review): the path is hard-coded to the dev storage account instead of
# being built from silver_adls_path, and this rebinds `df`, which earlier
# cells also use — confirm both are intended.
df = spark.read.load(
    "abfss://silver@azwwwnonproddevadapadls.dfs.core.windows.net/ExternalData/GL/AccountList",
    format="delta",
)
df.createOrReplaceTempView("delta_table")

In [None]:
from pyspark.sql.functions import col

# Show the bronze DataFrame as-is; no row filter or column selection is applied
# here, and the `col` import above is not used by this cell.
bronze_df.show()
