# In [0]:
# 01_Bronze_Ingestion.py - Extracts data from the API and saves to the Bronze Layer

import requests
import json
from pyspark.sql.functions import current_timestamp, lit, col
from pyspark.sql.types import StringType
from requests.exceptions import RequestException

# Persistence Configuration (ADJUST IF NEEDED)
# The target is addressed by a three-part name: <catalog>.<schema>.<table>.
CATALOG_NAME = "workspace"
SCHEMA_NAME = "default"
BRONZE_TABLE_NAME = "omdb_releases_bronze"
FULL_BRONZE_PATH = f"{CATALOG_NAME}.{SCHEMA_NAME}.{BRONZE_TABLE_NAME}"

# API Configuration
# The key is fetched from a secret scope so it never appears in the source.
API_KEY = dbutils.secrets.get(scope="omdb_scope", key="api_key")
# HTTPS: the API key is sent as a query parameter and must not travel in
# clear text.
BASE_URL = "https://www.omdbapi.com/"
search_term = "Star Wars"  # value sent as the "s" query parameter
MAX_PAGES = 3              # upper bound on result pages requested per run
current_page = 1           # pagination cursor, advanced after each good page
all_records = []           # accumulates the "Search" entries of every page

print(f"Starting API search for: {search_term}")

# --- 1. EXTRACTION (PYTHON) ---
# Page through the search endpoint and accumulate every hit in all_records.
# The loop ends after MAX_PAGES pages, when the API answers with
# Response != "True", or on the first failed request.
#
# NOTE(review): this file contains a second copy of the configuration and
# extraction loop further down that resets current_page / all_records, so the
# results collected by this pass are discarded - confirm which copy to keep.
while current_page <= MAX_PAGES:
    params = {"apikey": API_KEY, "s": search_term, "page": current_page}
    try:
        response = requests.get(BASE_URL, params=params, timeout=15)
        response.raise_for_status()
        data = response.json()
    except (RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        message = str(e)
        if API_KEY:
            # requests error messages can include the full request URL, which
            # carries the API key as a query parameter; mask it before logging.
            message = message.replace(API_KEY, "***")
        print(f" - Error during API request: {message}")
        break

    if data.get("Response") != "True":
        # Surface the API's own explanation instead of stopping silently.
        api_error = data.get("Error", "no error message provided")
        print(f" - API returned no results for page {current_page}: {api_error}")
        break

    all_records.extend(data.get("Search", []))
    current_page += 1

# 01_Bronze_Ingestion.py - Extracts data from the API and saves to the Bronze Layer

import requests
import json
from pyspark.sql.functions import current_timestamp, lit, col
from pyspark.sql.types import StringType
from requests.exceptions import RequestException

# Persistence Configuration (ADJUST IF NEEDED)
# The target is addressed by a three-part name: <catalog>.<schema>.<table>.
CATALOG_NAME = "workspace"
SCHEMA_NAME = "default"
BRONZE_TABLE_NAME = "omdb_releases_bronze"
FULL_BRONZE_PATH = f"{CATALOG_NAME}.{SCHEMA_NAME}.{BRONZE_TABLE_NAME}"

# API Configuration
# The key is fetched from a secret scope so it never appears in the source.
API_KEY = dbutils.secrets.get(scope="omdb_scope", key="api_key")
# HTTPS: the API key is sent as a query parameter and must not travel in
# clear text.
BASE_URL = "https://www.omdbapi.com/"
search_term = "Star Wars"  # value sent as the "s" query parameter
MAX_PAGES = 3              # upper bound on result pages requested per run
current_page = 1           # pagination cursor, advanced after each good page
all_records = []           # accumulates the "Search" entries of every page

print(f"Starting API search for: {search_term}")

# --- 1. EXTRACTION (PYTHON) ---
# Page through the search endpoint and accumulate every hit in all_records.
# The loop ends after MAX_PAGES pages, when the API answers with
# Response != "True", or on the first failed request.
while current_page <= MAX_PAGES:
    params = {"apikey": API_KEY, "s": search_term, "page": current_page}
    try:
        response = requests.get(BASE_URL, params=params, timeout=15)
        response.raise_for_status()
        data = response.json()
    except (RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        message = str(e)
        if API_KEY:
            # requests error messages can include the full request URL, which
            # carries the API key as a query parameter; mask it before logging.
            message = message.replace(API_KEY, "***")
        print(f" - Error during API request: {message}")
        break

    if data.get("Response") != "True":
        # Stops the loop if API response is 'False' or no more data, and
        # surfaces the API's own explanation instead of stopping silently.
        api_error = data.get("Error", "no error message provided")
        print(f" - API returned no results for page {current_page}: {api_error}")
        break

    all_records.extend(data.get("Search", []))
    current_page += 1

print("\n--- 2. LOAD TO BRONZE LAYER ---")

# Check if the list is empty BEFORE trying to create the DataFrame
if not all_records:
    # Nothing to load: report it and skip DataFrame creation and the table
    # write (a message is printed rather than calling dbutils.notebook.exit()).
    print("API returned no data. Exiting pipeline without writing data.")
else:
    # Convert the list of result dicts into a Spark DataFrame.
    df_bronze = spark.createDataFrame(all_records)

    # Keep the raw payload as strings and attach audit metadata: when the
    # rows were ingested and which search term produced them.
    df_bronze = (
        df_bronze.select(*(col(c).cast(StringType()) for c in df_bronze.columns))
        .withColumn("ingestion_timestamp", current_timestamp())
        .withColumn("search_query", lit(search_term))
    )

    # Persist as Delta Table (Overwrite the raw data on each run)
    df_bronze.write.format("delta").mode("overwrite").saveAsTable(FULL_BRONZE_PATH)

    # The DataFrame holds exactly one row per collected record (the select /
    # withColumn steps do not add or drop rows), so report the list length
    # instead of launching a second Spark action with count().
    record_count = len(all_records)
    print(f"Bronze Table '{FULL_BRONZE_PATH}' created with {record_count} records.")