In [None]:
import json
import os
import re
from datetime import datetime
from ftplib import FTP_TLS, all_errors

# --- Notebook parameters ----------------------------------------------------
env = dbutils.widgets.get("environment")
execution_date = dbutils.widgets.get("execution_date_main")
# The widget delivers the run date as text; keep a parsed copy for comparisons.
execution_date_dt = datetime.strptime(execution_date, "%Y-%m-%d")


def _state_reporting_secret(key):
    """Return the value stored under *key* in the ``state_reporting`` secret scope."""
    return dbutils.secrets.get(scope="state_reporting", key=key)


# --- FTP credentials (each secret key is suffixed with the environment) ------
host = _state_reporting_secret(f"iowa_ftp_host_{env}")
username = _state_reporting_secret(f"iowa_ftp_user_{env}")
password = _state_reporting_secret(f"iowa_ftp_password_{env}")

# --- Watermark: newest createdAt already present in the bronze table ---------
# NOTE(review): the catalog name is fixed to `state_reporting_prd` and does not
# follow `env` -- confirm this is intended for non-production runs.
max_query = """
    SELECT MAX(createdAt) AS max_createdAt
    FROM state_reporting_prd.bronze.state_batch_customer_data_ia
"""

max_proccessed_bf = spark.sql(max_query)
_watermark_row = max_proccessed_bf.collect()[0]
reference_date = _watermark_row["max_createdAt"]

def _parse_intoxalock_file_date(file_name):
    """Return the date encoded in an ``Intoxalock_YYYYMMDD.zip`` file name.

    Args:
        file_name: A name taken from the FTP directory listing.

    Returns:
        The parsed ``datetime`` (midnight of the encoded day), or ``None`` when
        the name lacks the expected prefix/suffix or the part in between is
        not a valid ``%Y%m%d`` date.
    """
    prefix, suffix = "Intoxalock_", ".zip"
    if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
        return None
    try:
        return datetime.strptime(file_name[len(prefix):-len(suffix)], "%Y%m%d")
    except ValueError:
        # A stray file such as "Intoxalock_readme.zip" must not abort the run;
        # it is simply not a data file.
        return None


def _select_files_to_process(available_files, lower_bound, upper_bound):
    """Pick the file dates that fall in the window ``(lower_bound, upper_bound]``.

    Args:
        available_files: Iterable of file names from the FTP listing.
        lower_bound: Exclusive lower limit, or ``None`` for no lower limit.
        upper_bound: Inclusive upper limit.

    Returns:
        A list of ``YYYY-MM-DD`` strings, in listing order, one per file whose
        name parses to a date inside the window.
    """
    selected = []
    for file_name in available_files:
        file_date = _parse_intoxalock_file_date(file_name)
        if file_date is None or file_date > upper_bound:
            continue
        if lower_bound is not None and file_date <= lower_bound:
            continue
        selected.append(file_date.strftime("%Y-%m-%d"))
    return selected


# Only the FTP conversation is guarded, and only against FTP/network errors, so
# that a programming or data error is no longer reported as a connection failure.
try:
    with FTP_TLS(host) as ftp:
        ftp.login(username, password)
        ftp.prot_p()  # switch the data channel to TLS before listing
        available_files = ftp.nlst()  # Get file list from FTP
except all_errors as e:
    # Kept best-effort as before: report the failure and publish nothing.
    print(f"FTP connection failed: {e}")
else:
    # A reference_date of None (presumably: nothing loaded yet -- confirm) is
    # treated as "no lower bound" instead of raising on the comparison.
    # An empty listing now publishes "[]" rather than leaving the task value unset.
    files_to_process = _select_files_to_process(available_files, reference_date, execution_date_dt)
    dbutils.jobs.taskValues.set(key="files_to_process", value=json.dumps(files_to_process))
    print(files_to_process)  # list of "YYYY-MM-DD" strings