In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Standard-library / third-party modules used by the cells below.
import sys,json,yaml,os
# Put the ETL project root on sys.path; presumably this is what lets the
# `config` and `gold_transformations` imports below resolve, so it must stay
# ahead of them.
sys.path.append("/Workspace/Users/mayur10594@gmail.com/ETL_project")
from config.utils import get_logger,load_config,load_all_schemas,load_sqls
# Notebook-wide logger.
logger = get_logger("dataloadinitiate")
# Project root used to build config and SQL paths. Note the trailing slash:
# callers that add their own "/" end up with a (harmless) double slash.
base_path='/Workspace/Users/mayur10594@gmail.com/ETL_project/'
# NOTE(review): star import - the names it brings into scope are not visible
# from this notebook; confirm which ones later cells rely on.
from gold_transformations.debit_transactions_extract import *
from config.utils import run_scd_type2

In [0]:
# Declare the notebook parameters (all default to an empty string) and read
# the values supplied for this run.
dbutils.widgets.text("fileList", "")
dbutils.widgets.text("taxYear", "")
dbutils.widgets.text("clientId", "")
dbutils.widgets.text("env","")
dbutils.widgets.text("bronze_path", "")
dbutils.widgets.text("maps","")
fileList = dbutils.widgets.get("fileList")
taxYear = dbutils.widgets.get("taxYear")
clientId = dbutils.widgets.get("clientId")
env = dbutils.widgets.get("env")
bronze_path = dbutils.widgets.get("bronze_path")

# `maps` arrives as a JSON string. The widget default is "", which json.loads
# rejects with an opaque "Expecting value" error, so fail with a message that
# names the offending parameter instead.
maps_raw = dbutils.widgets.get("maps")
if not maps_raw.strip():
    raise ValueError("Widget 'maps' is empty; expected a JSON collection of map names")
try:
    maps = json.loads(maps_raw)
except json.JSONDecodeError as exc:
    # JSONDecodeError is itself a ValueError, so existing handlers still match.
    raise ValueError(f"Widget 'maps' is not valid JSON: {maps_raw!r}") from exc
logger.info(f"fileList: {fileList}, taxYear: {taxYear}, clientId: {clientId}, env: {env}, bronze_path: {bronze_path}, maps to be loaded: {maps}")

In [0]:
# Source DataFrames keyed by map name: one entry per item in `maps`.
dfs = {}
for map_name in maps:
    # Each map is read from its client-scoped table in cp_database.
    dfs[map_name] = spark.read.table(f"cp_database.{clientId}_{map_name}")
    logger.info(f"df_{map_name} is created and stored in dfs['{map_name}']")

In [0]:
# Load the main ETL config and the gold-layer config for the selected `env`.
# Both files are read through load_config, in this order.
env_config, sqls_config = (
    load_config(f"{base_path}/config/{config_file}", env)
    for config_file in ("etl_main.yaml", "gold_config.yaml")
)

In [0]:
# Plain YAML parse of the gold config file (no `env` argument involved).
# NOTE(review): `sqls_cfg` is not referenced by any visible cell - confirm it
# is still needed alongside `sqls_config`.
config_path = '/Workspace/Users/mayur10594@gmail.com/ETL_project/config/gold_config.yaml'
with open(config_path, "r") as gold_cfg_stream:
    sqls_cfg = yaml.safe_load(gold_cfg_stream)

In [0]:
from pathlib import Path
import importlib.util

In [0]:
# Absolute folder that holds the Python transformation scripts. The SQL files
# are loaded from the same folder via `base_path`.
path = '/Workspace/Users/mayur10594@gmail.com/ETL_project/gold_transformations'
extracts = []   # names of every extract DataFrame, in creation order
df_ext={}       # extract name -> DataFrame


def _load_transformation(script_path):
    """Import a transformation script and return its entry-point function.

    The function is expected to have the same name as the file (without the
    ``.py`` suffix).

    Raises:
        ImportError: if no import spec/loader can be built for ``script_path``.
        AttributeError: if the module has no attribute named after the file.
    """
    spec = importlib.util.spec_from_file_location(script_path.stem, script_path)
    if spec is None or spec.loader is None:
        # spec_from_file_location returns None when it cannot pick a loader.
        raise ImportError(f"Cannot load transformation script: {script_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, script_path.stem)


def _write_extract(extract_df, map_name, file_name):
    """Overwrite the parquet extract for ``map_name``/``file_name`` under the gold path."""
    extract_df.write.format("parquet").mode('overwrite').save(f"{env_config['gold_path']}extracts/{map_name}/{file_name}")


# Register every input DataFrame as a temp view before running any extract;
# presumably so a SQL file for one map can reference another map's view.
for map_name in maps:
    dfs[map_name].createOrReplaceTempView(map_name)
    print(f"✅ Temp view created for {map_name}")

for map_name in maps:
    print(f"▶️ Started extract for {map_name}")

    map_cfg = sqls_config[map_name]
    # `or []` also covers keys that are present in the YAML but set to null.
    sqls = map_cfg.get('sqls') or []
    transformations = map_cfg.get('transformations') or []
    trans_inputs = map_cfg.get('trans_inputs') or []
    # Positional arguments handed to every transformation function of this map.
    input_dfs = [dfs[name] for name in trans_inputs]

    for sql_file in sqls:
        df_name = f"{map_name}_{sql_file.split('.')[0]}"   # string name
        extracts.append(df_name)
        print(f"▶️ Creating {df_name} dataframe")
        print(f"▶️ Running SQL for {map_name}: {sql_file}")
        # Load the SQL text
        sql_query = load_sqls(f"{base_path}gold_transformations", sql_file)
        print(sql_query)
        # Run SQL and store DF in dictionary
        df_ext[df_name] = spark.sql(sql_query)
        _write_extract(df_ext[df_name], map_name, sql_file)

    for trans_file in transformations:
        df_name = f"{map_name}_{trans_file.split('.')[0]}"   # string name
        extracts.append(df_name)
        print(f"▶️ Creating {df_name} dataframe")
        print(f"▶️ Running transformation for {map_name}: {trans_file}")
        # Resolve the script against the absolute folder rather than the
        # current working directory, matching how the SQL files are located.
        func = _load_transformation(Path(path) / trans_file)
        df_ext[df_name] = func(*input_dfs)
        _write_extract(df_ext[df_name], map_name, trans_file)

print("✅ Extracts created:", extracts)


In [0]:
# Append the rows in `df_new` to the users extract before it is loaded to gold.
# NOTE(review): `df_new` is not defined in any visible cell - presumably it
# comes from the star import or from a cell that is not shown; confirm.
# NOTE: DataFrame.union matches columns by position, not by name, and
# re-running this cell unions `df_new` in again.
users_extract_key = 'users_users_extract'
if users_extract_key in df_ext:
    df_ext[users_extract_key]=df_ext[users_extract_key].union(df_new)
else:
    # The key only exists when this run's `maps` produced a users extract.
    print(f"Skipping df_new union: {users_extract_key} was not produced in this run")

In [0]:
# Route each extract to its gold table based on the first two "_"-separated
# tokens of its name (e.g. "cards_debit_..." -> CardsDebit).
# (first token, second token) -> (gold table suffix, Spark write mode)
gold_targets = {
    ('cards', 'debit'): ("CardsDebit", "overwrite"),
    ('cards', 'credit'): ("CardsCredit", "overwrite"),
    ('transactions', 'debit'): ("TransDataDebit", "append"),
    ('transactions', 'credit'): ("TransDataCredit", "append"),
    ('transactions', 'transactions'): ("TransData", "append"),
}

for name, df in df_ext.items():
    # Parse the name once; partition() also tolerates a name without "_".
    first_token, _sep, remainder = name.partition("_")
    second_token = remainder.split("_")[0]
    target = gold_targets.get((first_token, second_token))

    if target is not None:
        table_suffix, write_mode = target
        print(f"this is for {table_suffix} table")
        df.write.mode(write_mode).saveAsTable(f"gold_database.{clientId}_{table_suffix}")
    elif first_token == 'users':
        # Users are not written directly; they are handed to run_scd_type2
        # together with the SCD config file.
        print("this is for Users table")
        run_scd_type2(
            spark,
            df_source=df,
            target_table_path=f"gold_database.{clientId}_users",
            config_path="/Workspace/Users/mayur10594@gmail.com/ETL_project/config/scd_config.yaml"
            )
    else:
        print("this is for something else, can not proceed for this dataframe")
print("All dataframes are written to gold_database")