In [0]:
# Import necessary libraries
import yaml
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, StructType, StructField, StringType, LongType
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import os
from libs.logger import log_execution

# --- 1. Environment Configuration and Config Loading ---

# Target environment: read from the "env_name" widget. Any failure to read it
# (for example `dbutils` not being defined in this session) falls back to the
# default environment instead of aborting the notebook.
try:
    ENV = dbutils.widgets.get("env_name")
except Exception:
    ENV = 'TEST' # Default environment

# Load YAML Configuration
# The path is relative to the notebook's working directory; adjust it if the
# config file lives elsewhere.
CONFIG_PATH = '../../config/config.yaml'
try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as file:
        full_config = yaml.safe_load(file)
except FileNotFoundError:
    print("ERROR: 'config.yaml' file not found! Check the path.")
    raise

# yaml.safe_load returns None for an empty document and may return a list or a
# scalar; only a mapping of environment name -> settings can be used below.
if not isinstance(full_config, dict):
    raise ValueError(
        f"'{CONFIG_PATH}' must contain a top-level mapping of environments, "
        f"got {type(full_config).__name__}."
    )

# Settings of the selected environment; a missing or empty section is an error.
CFG = full_config.get(ENV)
if not CFG:
    raise ValueError(f"Configuration not found for environment: {ENV} in YAML file.")

# --- 2. Data Path Definition ---
# Catalog, schema and volume names come from the selected environment's config.
catalog_name, schema_name, volume_name = (
    CFG[key] for key in ("catalog_name", "schema_name", "volume_name")
)

# Rounding precision; the same constant is used in the Silver layer.
PRECISION = 4

# All three locations live under the same volume root.
_volume_root = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}"
silver_table_path = f"{_volume_root}/yfinance_silver_data"
gold_table_path = f"{_volume_root}/yfinance_gold_data"
LOGS_PATH = f"{_volume_root}/gold_execution_logs/"

# Echo the resolved source/target so the run output shows where data moves.
print(f"Source Path (Silver): {silver_table_path}")
print(f"Target Path (Gold): {gold_table_path}")
print("-" * 50)

In [0]:
# --- 3. Process Initialization ---
# Aggregates the Silver table to one row per (Ticket, company_name, Year, Month),
# overwrites the Gold Delta table with the result, then applies CHECK
# constraints and runs OPTIMIZE / VACUUM. The run is bracketed by
# STARTED / SUCCESS / FAILED entries written through log_execution.

JOB_NAME = "03_GOLD_TRANSFORMATION"

# CHECK constraints expected on the Gold table: constraint name -> SQL predicate.
GOLD_CONSTRAINTS = {
    "gold_month_check": "Month >= 1 AND Month <= 12",
    "gold_price_check": "Monthly_Max_Close >= Monthly_Min_Close",
    "gold_ticket_not_null": "Ticket IS NOT NULL",
}

try:
    print("--- STARTING GOLD LAYER PROCESS (Mode: Monthly Aggregation) ---")
    log_execution(spark, JOB_NAME, "STARTED", LOGS_PATH)

    # --- 4. Read Silver ---
    df_silver = spark.read.format("delta").load(silver_table_path)

    # --- 5. Transformation (Business Logic) ---
    # NOTE(review): FloatType is 32-bit, so the return rounded to PRECISION
    # decimals may not be stored exactly; DoubleType would avoid that but
    # changes the Gold schema - confirm with consumers before switching.
    df_gold = (
        df_silver
        .groupBy("Ticket", "company_name", "Year", "Month")
        .agg(
            F.max(F.col("Close")).alias("Monthly_Max_Close"),
            F.min(F.col("Close")).alias("Monthly_Min_Close"),
            F.round(F.avg(F.col("Volume")), 0).cast(LongType()).alias("Monthly_Avg_Volume"),
            F.sum(F.col("Volume")).alias("Monthly_Total_Volume"),
            F.round(F.avg(F.col("Daily_Return_Pct")), PRECISION).cast(FloatType()).alias("Monthly_Avg_Daily_Return_Pct")
        )
    )

    # --- 6. Write to Gold with Optimization ---
    # Full refresh: the whole table (and its schema) is replaced on every run.
    (
        df_gold.write
        .format("delta")
        .mode("overwrite")
        .partitionBy("Ticket")
        .option("overwriteSchema", "true")
        .save(gold_table_path)
    )

    # --- 7. Post-Write: Constraints & Optimization ---

    # Constraints are looked up in the table's Delta properties (keys of the
    # form delta.constraints.<name>) instead of being inferred from whether the
    # table existed before the write. A run that failed between the write and
    # the ALTER TABLE statements is therefore repaired by the next run, and
    # re-adding an existing constraint is never attempted.
    detail_row = (
        spark.sql(f"DESCRIBE DETAIL delta.`{gold_table_path}`")
        .select("properties")
        .first()
    )
    existing_props = {key.lower() for key in (detail_row["properties"] or {})}
    missing_constraints = {
        name: predicate
        for name, predicate in GOLD_CONSTRAINTS.items()
        if f"delta.constraints.{name}" not in existing_props
    }

    if missing_constraints:
        print("Applying Quality Constraints to Gold Table...")
        for name, predicate in missing_constraints.items():
            spark.sql(
                f"ALTER TABLE delta.`{gold_table_path}` "
                f"ADD CONSTRAINT {name} CHECK ({predicate})"
            )

    # Optimization
    print("Optimizing Gold table (Z-ORDER)...")
    spark.sql(f"OPTIMIZE delta.`{gold_table_path}` ZORDER BY (Year, Month)")

    # VACUUM (168 hours = 7 days of retained history)
    print("Running VACUUM...")
    spark.sql(f"VACUUM delta.`{gold_table_path}` RETAIN 168 HOURS")

    log_execution(spark, JOB_NAME, "SUCCESS", LOGS_PATH)
    print("✅ SUCCESS: Gold layer processing finished.")

except Exception as e:
    # Only the first 500 characters of the error are sent to the execution log.
    error_msg = str(e)[:500]
    try:
        log_execution(spark, JOB_NAME, "FAILED", LOGS_PATH, message=error_msg)
    except Exception as log_error:
        # A failure while logging must not hide the error that caused it.
        print(f"WARNING: could not write FAILED status to execution log: {log_error}")
    print(f"FATAL ERROR in Gold process: {e}")
    # Bare raise re-raises the original exception with its traceback intact.
    raise