In [0]:
import sys

# Make the ETL project folder importable from this notebook (the cells below
# import from `config.utils`).
# The membership check keeps the cell idempotent: re-running it does not add
# the same entry to sys.path again.
PROJECT_ROOT = "/Workspace/Users/mayur10594@gmail.com/ETL_project"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [0]:
# NOTE(review): the two wildcard imports below put every public name of
# pyspark.sql.functions / pyspark.sql.types into the notebook namespace, which
# can shadow Python builtins (e.g. sum, min, max, round). Prefer explicit
# imports once it is confirmed which names are actually needed.
from pyspark.sql.functions import *
from pyspark.sql.types import *
from config.utils import get_logger
# Logger shared by all cells below, obtained from the project helper under the name "bronze".
logger = get_logger("bronze")

In [0]:
# Declare the notebook parameters as text widgets (empty default), then read
# the current value of each one into a variable of the same name.
for _widget_name in ("fileList", "taxYear", "clientId", "env"):
    dbutils.widgets.text(_widget_name, "")

fileList, taxYear, clientId, env = (
    dbutils.widgets.get(_widget_name)
    for _widget_name in ("fileList", "taxYear", "clientId", "env")
)
logger.info(f"fileList: {fileList}, taxYear: {taxYear}, clientId: {clientId}, env: {env}")


In [0]:
import json, yaml, os
from config.utils import load_config,create_tables,load_all_schemas

In [0]:
# Load all schemas through the project helper and show which keys are available
# for the lookups done when reading the CSV files.
schemas_map = load_all_schemas()
loaded_schema_keys = schemas_map.keys()
print("All schemas loaded:", loaded_schema_keys)

In [0]:
# Load the main ETL YAML for the selected environment; load_config returns two
# values, kept as env_config and tables_config.
path = "/Workspace/Users/mayur10594@gmail.com/ETL_project/config/etl_main.yaml"
loaded_config = load_config(path, env)
env_config, tables_config = loaded_config

In [0]:
# Resolve the input and output locations from the environment configuration.
in_path = env_config['incoming_path']
bronze_path = env_config['bronze_path']

# Log the effective run settings, one line per setting.
for _message in (
    f"Loading data from {in_path}",
    f"bronze_path is set to {bronze_path}",
    f"env is set to {env}",
    f"taxYear is set to {taxYear}",
    f"clientId is set to {clientId}",
):
    logger.info(_message)

In [0]:
# Build the list of file names from the comma-separated `fileList` widget value.
# Each name is stripped of surrounding whitespace and empty entries (for example
# from a trailing comma) are dropped, so "a.csv, b.csv," yields ["a.csv", "b.csv"].
files = [name.strip() for name in fileList.split(",") if name.strip()]

if not files:
    # Nothing to process: stop here with the same exit value the missing-file
    # check uses, instead of carrying an empty file name through the notebook.
    logger.error("fileList is empty, Please check & re-run again")
    dbutils.notebook.exit("Missing File")
elif len(files) > 1:
    logger.info(f"files to be processed: {fileList}")
else:
    logger.info(f"only single file to be processed: {fileList}")

In [0]:
# The map name is the part of each file name before its first underscore
# (the whole name when there is no underscore).
maps = [file_name.partition("_")[0] for file_name in files]
logger.info(f"maps to be processed are {maps}")

In [0]:
# NOTE(review): this cell is identical to the earlier config-loading cell: same
# path, same load_config(path, env) call, same target names. It rebinds
# env_config/tables_config and looks redundant — confirm load_config has no
# required side effects before removing it.
path="/Workspace/Users/mayur10594@gmail.com/ETL_project/config/etl_main.yaml"
env_config,tables_config=load_config(path,env)

In [0]:
# NOTE(review): in_path was already assigned from the same 'incoming_path' key
# in an earlier cell; this repeats that assignment.
in_path=env_config['incoming_path']

In [0]:
# Check that every requested file exists in the incoming folder; otherwise exit
# the notebook with the value "Missing File".
# All files are checked before exiting, so a single run reports every missing
# file rather than only the first one.
in_files = [f.name for f in dbutils.fs.ls(in_path)]
available_names = set(in_files)  # set lookup instead of scanning the list per file
missing_files = []
for file in files:
    if file in available_names:
        logger.info(f"File {file} exists")
    else:
        logger.error(f"File {file} does not exist in incoming folder, Please check & re-run again")
        missing_files.append(file)

if missing_files:
    dbutils.notebook.exit("Missing File")
        

In [0]:
# Read one CSV per map into a DataFrame and keep them in `dfs`, keyed by map name.
# NOTE(review): the path is rebuilt as "<map_name>_data.csv" rather than taken
# from the validated entries in `files` — confirm incoming files always follow
# that naming.
dfs = {}

for map_name in maps:
    source_csv = f"{in_path}/{map_name}_data.csv"
    # Schemas are looked up under the key "<map_name>.<map_name>".
    map_schema = schemas_map[f"{map_name}.{map_name}"]
    # Explicit schema, first line of the file treated as the header row.
    dfs[map_name] = (
        spark.read
        .schema(map_schema)
        .option("header", True)
        .csv(source_csv)
    )
    logger.info(f"df_{map_name} is created and stored in dfs['{map_name}']")


In [0]:
from datetime import date

# Today's date as YYYY-MM-DD (ISO 8601), used as the last folder of the bronze output path.
date_str = date.today().isoformat()

In [0]:
# Bronze output folder layout: <bronze_path>/<taxYear>/<clientId>/<date_str>.
bronze_out_path = "/".join(
    str(part) for part in (bronze_path, taxYear, clientId, date_str)
)

In [0]:
# Write each DataFrame to the bronze folder as parquet and record its row count.
# `count` is created once, before the loop, so it ends up holding one entry per
# map; it is later returned in the notebook exit payload.
count = {}
for name, df in dfs.items():
    target_path = f"{bronze_out_path}/{name}_data"
    # mode('overwrite') replaces any output already present at this path.
    df.write.format("parquet").options(mergeSchema=True).mode('overwrite').save(target_path)
    count[name] = df.count()
    logger.info(f"{df} saved as parquet file at {bronze_out_path}/{name}_data")



In [0]:
# Finish the notebook and hand a JSON summary back to the caller: status,
# message, per-map row counts, the bronze output folder and the processed maps.
logger.info("exiting the bronze notebook")
exit_payload = {
    "status": "OK",
    "message": "All files processed successfully",
    "counts": count,
    "bronze_path": bronze_out_path,
    "maps": maps,
}
dbutils.notebook.exit(json.dumps(exit_payload))