In [0]:
import pandas as pd
import requests
from io import StringIO
import sys
import os

# --- TEST IMPORTS ---
# Adding the project root folder to sys.path so that imports work
sys.path.append(os.path.abspath('../../'))
from tests.meta_tests import verify_metadata_batch

# --- SYNC CONFIGURATION ---
# Identifier of the source file on Google Drive; interpolated into the download URLs.
FILE_ID = "1alcsznO40FQOQg68t2wXC0gPb09DbXvP"
# Fully qualified name of the table that receives the synchronized data.
TARGET_TABLE = "workspace.google_drive.gpw_data"
# Primary download endpoint: Google Sheets CSV export for FILE_ID.
URL = f"https://docs.google.com/spreadsheets/d/{FILE_ID}/export?format=csv"

# Plain string literal: the banner has no placeholders, so no f-prefix is needed.
print("--- STARTING METADATA SYNC ---")

# Sync pipeline: download the CSV, sanity-check it, validate it with the
# project's data-quality module, and only then replace TARGET_TABLE.
# Any failure is logged and re-raised so the job run is marked as failed.
try:
    # 1. Fetch data from Google Drive with timeout
    response = requests.get(URL, timeout=30)

    # Retry with the alternative download URL if the first attempt fails
    if response.status_code != 200:
        URL_ALT = f"https://docs.google.com/uc?export=download&id={FILE_ID}"
        response = requests.get(URL_ALT, timeout=30)

    if response.status_code != 200:
        raise RuntimeError(f"Download failed. Status: {response.status_code}")

    # 2. Basic content validation (ensure we didn't receive an HTML login page).
    # Inspect the Content-Type header and only the beginning of the body, so a
    # legitimate CSV that merely contains "<html" inside a cell is not rejected
    # and the full payload does not have to be lower-cased.
    content_type = response.headers.get("Content-Type", "").lower()
    body_head = response.text[:1024].lower()
    if "text/html" in content_type or "<html" in body_head:
        raise RuntimeError("Received HTML instead of CSV. Check Drive sharing permissions.")

    # 3. Load into Pandas for initial sanity check
    pdf = pd.read_csv(StringIO(response.text))

    if pdf.empty:
        raise RuntimeError("The downloaded CSV is empty. Preventing table overwrite.")

    print(f"Successfully fetched {len(pdf)} rows.")

    # 4. Convert to Spark DataFrame and normalize column names
    #    (lower-case, spaces replaced with underscores)
    spark_df = spark.createDataFrame(pdf)
    spark_df = spark_df.toDF(*[c.lower().replace(" ", "_") for c in spark_df.columns])

    # 5. Execute Data Quality Validation from external module
    is_valid, msg = verify_metadata_batch(spark_df)

    if not is_valid:
        # Halt execution if Data Quality requirements are not met.
        # The handler below logs the message once before re-raising.
        raise RuntimeError(
            f"DATA VALIDATION FAILED: {msg}. Aborting process to protect downstream pipeline."
        )

    print(f"✅ Metadata validation passed. Updating control table: {TARGET_TABLE}")

    # 6. Save data to Delta Lake as a single overwrite.
    # Do NOT drop the table first: a DROP followed by a write leaves a window
    # in which the table does not exist, and a failed write would lose the
    # previous data. overwriteSchema lets the same overwrite also replace the
    # schema if the Google Drive columns changed.
    (
        spark_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(TARGET_TABLE)
    )

    print(f"✅ SUCCESS: Table {TARGET_TABLE} synchronized.")

except Exception as e:
    print(f"❌ ERROR: {e}")
    # Re-raising ensures the Databricks Workflow captures the failure;
    # a bare raise keeps the original traceback untouched.
    raise