In [0]:
# Import necessary libraries
import yaml
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import os

# --- 1. Environment Configuration and Config Loading ---

# The target environment is read from the "env_name" widget (set by the Job or
# interactively). Any failure to read it (widget missing, dbutils unavailable)
# falls back to the default environment.
try:
    ENV = dbutils.widgets.get("env_name")
except Exception:
    ENV = 'TEST' # Default environment

# Load YAML Configuration
try:
    # Adjust '../../config/config.yaml' path to the actual location if needed
    with open('../../config/config.yaml', 'r') as file:
        full_config = yaml.safe_load(file)
except FileNotFoundError:
    print("ERROR: 'config.yaml' file not found! Check the path.")
    raise

# yaml.safe_load returns None for an empty file (and a scalar/list for a
# non-mapping document); fail with a clear message instead of an
# AttributeError on the .get() call below.
if not isinstance(full_config, dict):
    raise ValueError("Configuration file 'config.yaml' is empty or is not a mapping of environments.")

CFG = full_config.get(ENV)
if not CFG:
    raise ValueError(f"Configuration not found for environment: {ENV} in YAML file.")

# Report every missing key at once, with the environment name, rather than
# failing on the first bare KeyError.
missing_keys = [key for key in ('catalog_name', 'schema_name', 'volume_name') if key not in CFG]
if missing_keys:
    raise KeyError(f"Missing configuration key(s) for environment {ENV}: {', '.join(missing_keys)}")

catalog_name = CFG['catalog_name']
schema_name = CFG['schema_name']
volume_name = CFG['volume_name']

# Number of decimal places used when rounding return metrics
# (same precision constant as used in the Silver layer).
PRECISION = 4

# --- 2. Data Path Definition ---

# Both layers live in the same Unity Catalog volume; only the leaf folder differs.
silver_table_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/yfinance_silver_data"
gold_table_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/yfinance_gold_data"

print(f"Source Path (Silver): {silver_table_path}")
print(f"Target Path (Gold): {gold_table_path}")
print("-" * 50)

In [0]:
# --- 3. Load Data from Silver Layer ---

# The Silver data is re-read from its persisted Delta location (not reused from
# another notebook's in-memory DataFrame) so this pipeline stage can run on its own.
try:
    df_silver = spark.read.load(silver_table_path, format="delta")
    # count() triggers a Spark job, so a broken/unreadable table is detected here.
    print(f"Successfully loaded data from the Silver layer. Row count: {df_silver.count()}")
except Exception as exc:
    # Nothing downstream can work without the Silver data: report and abort.
    print(f"CRITICAL ERROR: Failed to load data from Silver. Gold pipeline cannot proceed. Details: {exc}")
    raise exc

In [0]:
# --- 4. Monthly Aggregation for the Gold Layer ---

# Collapse the Silver rows into one row per (Ticket, Year, Month).
# NOTE(review): "Ticket" is the column name as it exists in Silver; presumably
# it holds the ticker symbol -- confirm upstream before renaming anything.
df_gold = (
    df_silver
    .groupBy("Ticket", "Year", "Month")
    .agg(
        # Price Metrics
        F.max(F.col("Close")).alias("Monthly_Max_Close"),
        F.min(F.col("Close")).alias("Monthly_Min_Close"),

        # Volume Metrics
        # The rounded average is stored as a 64-bit integer: a 32-bit IntegerType
        # tops out at 2,147,483,647, which high-volume instruments can exceed;
        # an out-of-range cast either fails (ANSI mode) or yields a wrong value.
        F.round(F.avg(F.col("Volume")), 0).cast("long").alias("Monthly_Avg_Volume"),
        F.sum(F.col("Volume")).alias("Monthly_Total_Volume"),

        # Return Metrics (using the predefined PRECISION)
        F.round(F.avg(F.col("Daily_Return_Pct")), PRECISION).cast(FloatType()).alias("Monthly_Avg_Daily_Return_Pct")
    )
    .orderBy("Ticket", "Year", "Month")
)

print("Successfully created the aggregated DataFrame for the Gold layer (Monthly Aggregation).")

In [0]:
# --- 5. Write Data to the Gold Layer ---

# Best-effort pre-creation of the target directory. A failure here is reported
# but not fatal: the write below is still attempted and will fail loudly if the
# location is genuinely unusable.
try:
    dbutils.fs.mkdirs(gold_table_path)
except Exception as mkdirs_error:
    print(f"WARNING: Could not pre-create Gold directory '{gold_table_path}': {mkdirs_error}")

# Write in 'overwrite' mode with partitioning.
# overwriteSchema lets a changed column set/type replace the existing table schema.
(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Ticket", "Year") # Partitioning for optimized analytical queries
    .option("overwriteSchema", "true")
    .save(gold_table_path)
)

print("-" * 50)
print(f"✅ SUCCESS: Gold layer data saved to: {gold_table_path}")
print("Gold pipeline finished.")