In [0]:
import yaml
from pyspark.sql import SparkSession
import os
from pyspark.sql.types import IntegerType, FloatType, DateType
from pyspark.sql.functions import col
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from libs.logger import log_execution
from pyspark.sql import types as T


# --- 1. Environment Configuration ---
# ENV selects which top-level section of config.yaml is used below.
try:
    # Get environment variable from the Databricks widget "env_name"
    ENV = dbutils.widgets.get("env_name")
except Exception:
    # Widget (or dbutils itself) is not available: fall back to the default
    ENV = 'TEST'

# --- 2. Load Configuration ---
try:
    # Load configuration from the YAML file (relative to the working directory)
    with open('../../config/config.yaml', 'r') as file:
        full_config = yaml.safe_load(file)
except FileNotFoundError:
    print("ERROR: 'config.yaml' file not found! Check the path.")
    raise

# safe_load returns None for an empty document and may return a list or a
# scalar; fail with a clear message instead of an AttributeError on `.get`.
if not isinstance(full_config, dict):
    raise ValueError("Configuration file 'config.yaml' is empty or is not a mapping of environments.")

CFG = full_config.get(ENV)
if not CFG or not isinstance(CFG, dict):
    raise ValueError(f"Configuration not found for environment: {ENV} in YAML file.")

# Every key below is required to build the volume paths; report all missing
# keys at once rather than failing on the first lookup.
missing_keys = [
    key for key in ('catalog_name', 'schema_name', 'volume_name') if key not in CFG
]
if missing_keys:
    raise KeyError(f"Missing required configuration keys for environment {ENV}: {missing_keys}")

catalog_name = CFG['catalog_name']
schema_name = CFG['schema_name']
volume_name = CFG['volume_name']

# Paths: all locations live under the same volume root
volume_root = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}"
LOGS_PATH = f"{volume_root}/silver_execution_logs/"
base_bronze_path = f"{volume_root}/yfinance_bronze_data"
silver_table_path = f"{volume_root}/yfinance_silver_data"

In [0]:
# Schema of the bronze input: every column is declared nullable.
bronze_input_schema = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Date", T.DateType()),
        ("Open", T.DoubleType()),
        ("High", T.DoubleType()),
        ("Low", T.DoubleType()),
        ("Close", T.DoubleType()),
        ("Volume", T.LongType()),
        ("Ticket", T.StringType()),
        ("company_name", T.StringType()),
    )
])
# Silver layer job: read the bronze Delta data, normalise column types, run
# data-quality checks, add derived features and fully overwrite the silver
# Delta table. Each stage outcome is reported through log_execution; any
# failure is logged as FAILED and then re-raised so the run is marked failed.
try:
    print("--- STARTING SILVER LAYER PROCESS (Mode: Full Overwrite) ---")
    log_execution(spark, "02_SILVER_TRANSFORMATION", "STARTED", LOGS_PATH)

    # --- 3. BRONZE LAYER: Read & Type Optimization ---
    # NOTE(review): a Delta table carries its own schema; confirm that passing
    # an explicit .schema() on a Delta read is supported on the target runtime.
    df_bronze = (
        spark.read.format("delta")
        .schema(bronze_input_schema)
        .load(base_bronze_path)
    )

    # Prices are narrowed to 32-bit floats to reduce storage. Volume stays a
    # 64-bit integer: values above 2,147,483,647 would wrap around silently
    # when cast to IntegerType in non-ANSI mode (and raise in ANSI mode).
    df_optimized = (
        df_bronze
        .withColumn("Date", col("Date").cast(DateType()))
        .withColumn("Open", col("Open").cast(FloatType()))
        .withColumn("High", col("High").cast(FloatType()))
        .withColumn("Low", col("Low").cast(FloatType()))
        .withColumn("Close", col("Close").cast(FloatType()))
        .withColumn("Volume", col("Volume").cast(T.LongType()))
    )

    # --- 4. DATA QUALITY CHECKS ---

    # 1. Critical Null Check: Date, Ticket and Close must always be present
    critical_null_count = df_optimized.filter(
        F.col("Date").isNull() | F.col("Ticket").isNull() | F.col("Close").isNull()
    ).count()

    if critical_null_count > 0:
        raise ValueError(f"QA ERROR: Found {critical_null_count} rows with critical NULLs. Pipeline halted.")

    # 2. Duplicate Check & Removal: keep one row per (Date, Ticket)
    total_rows = df_optimized.count()
    df_unique = df_optimized.dropDuplicates(subset=["Date", "Ticket"])
    unique_rows = df_unique.count()

    if total_rows != unique_rows:
        print(f"WARNING: Removed {total_rows - unique_rows} duplicates.")

    # --- 5. SILVER LAYER: Feature Engineering ---
    PRECISION = 4                # decimal places kept for Daily_Return_Pct
    SMA_PERIODS = [20, 50, 200]  # window lengths (in rows) for the moving averages
    window_spec = Window.partitionBy("Ticket").orderBy("Date")

    # A) Daily Returns: percentage change of Close versus the previous row of
    # the same ticker. The result is NULL for the first row of each ticker and
    # when the previous close is 0; the explicit guard keeps that NULL result
    # instead of a divide-by-zero error when ANSI mode is enabled.
    df_silver = df_unique.withColumn(
        "Previous_Close", F.lag(F.col("Close"), 1).over(window_spec)
    ).withColumn(
        "Daily_Return_Pct",
        F.when(
            F.col("Previous_Close") != 0,
            F.round(
                ((F.col("Close") - F.col("Previous_Close")) / F.col("Previous_Close")) * 100,
                PRECISION,
            ),
        ).cast(FloatType())
    ).drop("Previous_Close")

    # B) Time Dimensions derived from Date
    df_silver = (
        df_silver
        .withColumn("Year", F.year(F.col("Date")))
        .withColumn("Quarter", F.quarter(F.col("Date")))
        .withColumn("Month", F.month(F.col("Date")))
        .withColumn("WeekOfYear", F.weekofyear(F.col("Date")))
    )

    # C) Simple Moving Averages (SMA)
    # The frame covers the current row and up to N-1 preceding rows, so the
    # first rows of each ticker are averaged over fewer than N values.
    for N in SMA_PERIODS:
        window_sma = window_spec.rowsBetween(-(N - 1), 0)
        df_silver = df_silver.withColumn(
            f"SMA_{N}",
            F.avg(F.col("Close")).over(window_sma).cast(FloatType())
        )

    # --- 6. WRITE TO DELTA LAKE (Full Overwrite) ---
    (
        df_silver.write
        .format("delta")
        .mode("overwrite")
        .partitionBy("Ticket")  # Optimised for reads that target specific companies
        .option("overwriteSchema", "true")
        .save(silver_table_path)
    )

    # --- 7. POST-WRITE OPTIMIZATION & CONSTRAINTS ---
    # Compact files and co-locate rows by Date, then remove unreferenced files
    # older than 168 hours (7 days).
    spark.sql(f"OPTIMIZE delta.`{silver_table_path}` ZORDER BY (Date)")
    spark.sql(f"VACUUM delta.`{silver_table_path}` RETAIN 168 HOURS")
    print(f"Successfully saved data to Silver layer at: {silver_table_path}")
    log_execution(spark, "02_SILVER_TRANSFORMATION", "SUCCESS", LOGS_PATH)

except Exception as e:
    # Only the first 500 characters of the error text are passed to the log.
    error_msg = str(e)[:500]
    try:
        log_execution(spark, "02_SILVER_TRANSFORMATION", "FAILED", LOGS_PATH, message=error_msg)
    except Exception as log_error:
        # A failure while logging must not replace the original pipeline error.
        print(f"WARNING: Could not write FAILED status to execution log: {log_error}")
    print(f"FATAL ERROR in Silver process: {e}")
    raise