In [0]:
%pip install -r ../../requirements.txt
%restart_python

In [0]:
import yfinance as yf
from datetime import datetime, date
import os
import time
from pyspark.sql.functions import lit
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql import functions as F
from pyspark.sql.types import DateType, TimestampType
import yaml
from pyspark.sql.functions import to_date, col
# --- DELTA LAKE IMPORT ---
from delta.tables import DeltaTable
from pyspark.sql import types as T
#logger
from libs.logger import log_execution

# --- 1. Environment Configuration ---
# The target environment is read from the "env_name" widget (the standard way to
# receive Job parameters). If the widget cannot be read for any reason, the
# notebook falls back to 'TEST' so it can still be run interactively.
try:
    ENV = dbutils.widgets.get("env_name")
except Exception:
    ENV = 'TEST'
    _env_banner = f"Interactive/Manual execution. Using default ENV: {ENV}"
else:
    _env_banner = f"Environment configured by Job (ENV): {ENV}"
print(_env_banner)

# --- 2. Load Configuration ---
# config.yaml is looked up relative to the current working directory and is
# expected to be a mapping whose top-level keys are environment names (the
# value of ENV); CFG is the sub-mapping for the active environment.
try:
    # Adjust path to config.yaml if necessary
    with open('../../config/config.yaml', 'r') as file:
        # safe_load returns None for an empty document; normalise it to {} so the
        # lookup below raises the explicit ValueError rather than AttributeError.
        full_config = yaml.safe_load(file) or {}
except FileNotFoundError:
    print("ERROR: 'config.yaml' file not found! Check the path.")
    raise

# A non-mapping top level (e.g. a YAML list) has no .get(); treat it the same as
# a missing environment section.
CFG = full_config.get(ENV) if isinstance(full_config, dict) else None
if not CFG:
    raise ValueError(f"Configuration not found for environment: {ENV} in YAML file.")

# Define key configuration variables (a missing key raises KeyError)
catalog_name, schema_name, volume_name, sql_table = (
    CFG[key] for key in ('catalog_name', 'schema_name', 'volume_name', 'sql_table_name')
)

# Both locations share the same /Volumes/<catalog>/<schema>/<volume> root.
_volume_root = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}"
LOGS_PATH = f"{_volume_root}/execution_logs/"
BRONZE_TABLE_PATH = f"{_volume_root}/yfinance_bronze_data/"


In [0]:
# --- 3. Get Tickers from Databricks SQL Table ---
# Builds the ticker -> company-name mapping and the list of tickers to process.
# Any failure along the way is reported and leaves both collections empty.
try:
    sql_query = "SELECT Ticket, company_name FROM {}".format(sql_table)
    spark_df_tickets = spark.sql(sql_query)
    df_tickets_list = spark_df_tickets.collect()

    company_info_dict = {}
    for ticker_row in df_tickets_list:
        # A repeated Ticket keeps the company name of its last row.
        company_info_dict[ticker_row['Ticket']] = ticker_row['company_name']
    gpw_ticket_list = [*company_info_dict]

    print("Successfully created Spark DataFrame from SQL table.")
    print("Tickers to process:", gpw_ticket_list)
except Exception as e:
    print(f"Error reading from SQL table: {e}.")
    gpw_ticket_list, company_info_dict = [], {}

In [0]:
# Make sure the base directory of the Bronze table is present.
# A failure here is only reported; execution continues either way.
try:
    dbutils.fs.mkdirs(BRONZE_TABLE_PATH)
except Exception as e:
    print(f"Directory already exists or an error occurred during creation: {e}")
else:
    print(f"Successfully created directory for the SINGLE BRONZE TABLE: {BRONZE_TABLE_PATH}")

# Set end date for yfinance download.
# yfinance documents `end` as exclusive, so bars dated end_date itself are not
# requested by the download calls that receive this value.
end_date = date.today()

# No fallback data set exists in this notebook: with an empty ticker list the
# per-ticker ingestion loop simply has nothing to iterate over.
if not gpw_ticket_list:
    print("No company data to process. The ingestion loop will have nothing to download.")

In [0]:
# Helper function to check if a path is a Delta Table
def is_delta_table(path):
    """Report whether `path` looks like a Delta table.

    The check lists `<path>/_delta_log` through `dbutils.fs.ls`. A listing that
    completes means True; any exception raised while building the path or
    listing it (including "path does not exist") means False.
    """
    try:
        delta_log_dir = os.path.join(path, "_delta_log")
        dbutils.fs.ls(delta_log_dir)
    except Exception:
        return False
    else:
        return True

# Schema applied to the downloaded price data before the Ticket / company_name
# columns are added. Every field is nullable.
_BRONZE_FIELDS = (
    ("Date", T.DateType()),
    ("Open", T.DoubleType()),
    ("High", T.DoubleType()),
    ("Low", T.DoubleType()),
    ("Close", T.DoubleType()),
    ("Volume", T.LongType()),
)
bronze_schema = T.StructType(
    [T.StructField(field_name, field_type, True) for field_name, field_type in _BRONZE_FIELDS]
)

# --- Bronze ingestion: download each ticker's history and upsert it into ONE Delta table ---
# Flow per ticker:
#   1. If the table already exists, read the ticker's latest stored Date and decide
#      between a full download, an incremental download, or a skip.
#   2. Download from yfinance, align the pandas columns BY NAME to bronze_schema,
#      then convert to a Spark DataFrame and add Ticket / company_name.
#   3. Create the table on the first successful write, otherwise MERGE on (Date, Ticket).
# Per-ticker data/write errors are reported and collected without stopping the loop.
# Any other exception is logged as FAILED and re-raised.
try:
    print(f"Starting Bronze Ingestion for environment: {ENV}")
    log_execution(spark, "01_BRONZE_INGESTION", "STARTED", LOGS_PATH)

    is_full_table_existing = is_delta_table(BRONZE_TABLE_PATH)

    # Column names in schema order. spark.createDataFrame() applies an explicit
    # schema to a pandas frame by POSITION, so the pandas columns must be put in
    # exactly this order first (yfinance's column order/content varies by version).
    bronze_columns = [field.name for field in bronze_schema.fields]

    # Tickers whose data could not be prepared or written during this run.
    failed_tickers = []

    print("\nStarting data download from yfinance and saving to Delta Lake...")

    # --- 4. Main Ingestion Loop (Per Ticker) ---
    for t in gpw_ticket_list:
        yfinance_ticker = f"{t}.WA"  # Assumes tickers are for Warsaw Stock Exchange (.WA)
        print(f"\nProcessing stock: {yfinance_ticker}")

        # --- DOWNLOAD LOGIC ---
        start_date_for_update = None
        last_download_date = None

        if is_full_table_existing:
            try:
                # Latest stored date FOR THIS TICKER. The table is partitioned by
                # Ticket, so the filter restricts the read to one partition.
                max_date_df = spark.read.format("delta").load(BRONZE_TABLE_PATH) \
                                    .filter(F.col("Ticket") == t) \
                                    .select(F.max(F.col("Date").cast("date")).alias("max_date"))

                last_download_date = max_date_df.collect()[0]['max_date']

                if last_download_date:
                    # First day not yet stored for this ticker.
                    next_missing_day = (pd.to_datetime(last_download_date) + pd.Timedelta(days=1)).date()

                    # `end` is exclusive in yfinance, so a request is only useful when
                    # the window [next_missing_day, end_date) is non-empty.
                    if next_missing_day < end_date:
                        start_date_for_update = next_missing_day.strftime('%Y-%m-%d')
                        print(f"Data for '{yfinance_ticker}' requires update starting from date: {start_date_for_update}.")
                    else:
                        # No API call is made on this path, so no throttling sleep is needed.
                        print(f"Data for '{yfinance_ticker}' is up to date as of {last_download_date}. Skipping.")
                        continue
                else:
                    # max_date is None: the ticker has no rows in the table (or the table is empty).
                    print(f"Ticker '{yfinance_ticker}' not found in the main table. Full download initiated.")

            except Exception as e:
                print(f"An error occurred while reading/filtering the main Delta Table for '{yfinance_ticker}': {e}. Initiating full re-download for safety.")
                # Reset the state so the branch below really performs the full
                # download announced in the message; the MERGE de-duplicates on (Date, Ticket).
                last_download_date = None
                start_date_for_update = None

        # Choose the download mode for yfinance.
        data = None
        if not is_full_table_existing or not last_download_date:
            # Full history (table did not exist yet, or ticker is new / unreadable).
            print(f"Starting full download for '{yfinance_ticker}'.")
            data = yf.download(yfinance_ticker, period='max', end=end_date)
        elif start_date_for_update:
            # Incremental download starting the day after the last stored date.
            print(f"Downloading incremental data for '{yfinance_ticker}'.")
            data = yf.download(yfinance_ticker, start=start_date_for_update, end=end_date)
        else:
            # Safeguard: nothing to request for this ticker.
            continue

        # --- 5. Delta Lake Write/Merge Logic ---
        if data is not None and not data.empty:
            # Move the Date index into a regular column.
            df_new_data = data.reset_index()

            # yfinance may return (field, ticker) MultiIndex columns; keep the field level only.
            if isinstance(df_new_data.columns, pd.MultiIndex):
                df_new_data.columns = df_new_data.columns.map(lambda x: x[0])

            missing_columns = [c for c in bronze_columns if c not in df_new_data.columns]
            if missing_columns:
                print(f"ERROR: downloaded data for '{yfinance_ticker}' is missing columns {missing_columns}. Skipping ticker.")
                failed_tickers.append(t)
                time.sleep(0.5)  # Avoid overwhelming the API
                continue

            # Select by NAME in schema order (drops extras such as 'Adj Close') so the
            # positional schema application below cannot mislabel price columns.
            df_new_data = df_new_data[bronze_columns].copy()
            # Plain date objects match the DateType field of bronze_schema.
            df_new_data["Date"] = pd.to_datetime(df_new_data["Date"]).dt.date

            # Convert to Spark DataFrame and add the business columns.
            spark_new_data_df = spark.createDataFrame(df_new_data, schema=bronze_schema) \
                                     .withColumn("Ticket", lit(t)) \
                                     .withColumn("company_name", lit(company_info_dict.get(t, 'N/A')))

            # A) If table DOES NOT EXIST (first run), create a new Delta Lake table
            if not is_full_table_existing:
                try:
                    # Create the first version of the SINGLE MAIN Delta Table (partitioned by Ticket)
                    spark_new_data_df.write.format("delta").mode("overwrite").partitionBy("Ticket").save(BRONZE_TABLE_PATH)
                except Exception as e:
                    print(f"ERROR WRITING FULL DELTA TABLE for '{yfinance_ticker}': {e}")
                    failed_tickers.append(t)
                else:
                    # Flip the flag as soon as the data is saved. If it stayed False
                    # after a later failure, the next ticker would run
                    # mode("overwrite") again and wipe this ticker's rows.
                    is_full_table_existing = True
                    print(f"Downloaded and saved **first full batch** to the SINGLE Delta Table.")
                    try:
                        # constraints
                        spark.sql(f"ALTER TABLE delta.`{BRONZE_TABLE_PATH}` ADD CONSTRAINT date_not_null CHECK (Date IS NOT NULL)")
                        spark.sql(f"ALTER TABLE delta.`{BRONZE_TABLE_PATH}` ADD CONSTRAINT ticket_not_null CHECK (Ticket IS NOT NULL)")
                    except Exception as e:
                        # The data is already saved; a constraint problem must not
                        # cause the table to be treated as missing.
                        print(f"WARNING: table created, but adding CHECK constraints failed: {e}")

            # B) If the table EXISTS, use MERGE
            else:
                try:
                    # Use MERGE INTO for transactional update/insert in the SINGLE MAIN TABLE
                    deltaTable = DeltaTable.forPath(spark, BRONZE_TABLE_PATH)

                    deltaTable.alias("target") \
                        .merge(
                            source=spark_new_data_df.alias("source"),
                            # Join condition based on 'Date' and 'Ticket' (business key columns)
                            condition="target.Date = source.Date AND target.Ticket = source.Ticket"
                        ) \
                        .whenMatchedUpdateAll() \
                        .whenNotMatchedInsertAll() \
                        .execute()

                    print(f"Updated data for '{yfinance_ticker}' using **MERGE INTO** in the SINGLE Delta Lake Table.")
                except Exception as e:
                    print(f"ERROR MERGE INTO DELTA LAKE for '{yfinance_ticker}': {e}.")
                    failed_tickers.append(t)

        else:
            print(f"No new data to download or process for '{yfinance_ticker}'.")

        time.sleep(0.5)  # Avoid overwhelming the API

    if failed_tickers:
        # The terminal status is left as "SUCCESS" (unchanged for anything keyed on
        # it), but the per-ticker failures are now recorded with the log entry
        # instead of being visible only in the cell output.
        failure_summary = f"Tickers with failed processing: {failed_tickers}"[:500]
        print(f"\nWARNING: {failure_summary}")
        log_execution(spark, "01_BRONZE_INGESTION", "SUCCESS", LOGS_PATH, message=failure_summary)
    else:
        log_execution(spark, "01_BRONZE_INGESTION", "SUCCESS", LOGS_PATH)
    print("\n--- COMPLETED UPDATE/INSERT PROCESS TO THE SINGLE MAIN DELTA LAKE TABLE ---")
except Exception as e:
    # --- ERROR ---
    # Record the (truncated) error message, then re-raise with the original traceback.
    error_msg = str(e)[:500]
    log_execution(spark, "01_BRONZE_INGESTION", "FAILED", LOGS_PATH, message=error_msg)
    print(f"\nFATAL ERROR in Bronze process: {e}")
    raise