In [0]:
%pip install openpyxl bs4

# 0. Set up constants

In [0]:
# Resolve job parameters from Databricks widgets. If any widget lookup fails
# (e.g. the notebook is run interactively without widgets, or `dbutils` is not
# available), ALL constants fall back to the development defaults below.
try:
    LANDING_CATALOG = dbutils.widgets.get("LANDING_CATALOG")
    LANDING_SCHEMA = dbutils.widgets.get("LANDING_SCHEMA")
    LANDING_PATCH_VOLUME = dbutils.widgets.get("LANDING_PATCH_VOLUME")

    PENDING_FOLDER = dbutils.widgets.get("PENDING_FOLDER")
    PROCESSED_FOLDER = dbutils.widgets.get("PROCESSED_FOLDER")
    ERRORS_FOLDER = dbutils.widgets.get("ERRORS_FOLDER")
    POST_PATCH_SUBFOLDER = dbutils.widgets.get("POST_PATCH_SUBFOLDER")

    BRONZE_CATALOG = dbutils.widgets.get("BRONZE_CATALOG")
    BRONZE_SCHEMA = dbutils.widgets.get("BRONZE_SCHEMA")
    BRONZE_POST_PATCH_TABLE = dbutils.widgets.get("BRONZE_POST_PATCH_TABLE")

# `Exception` (not a bare `except`) so KeyboardInterrupt / SystemExit still
# propagate; a missing `dbutils` (NameError) or a missing widget is still caught.
except Exception:
    LANDING_CATALOG = "landing"
    LANDING_SCHEMA = "linkedin"
    LANDING_PATCH_VOLUME = "patch"

    PENDING_FOLDER = "pending"
    PROCESSED_FOLDER = "processed"
    ERRORS_FOLDER = "errors"
    POST_PATCH_SUBFOLDER = "posts"

    BRONZE_CATALOG = "bronze"
    BRONZE_SCHEMA = "linkedin"
    # Must use the same name as the widget branch: the ingestion cell builds
    # the target table name from BRONZE_POST_PATCH_TABLE.
    BRONZE_POST_PATCH_TABLE = "post_patch"


In [0]:
import pandas as pd
from pyspark.sql import Row
from delta.tables import DeltaTable

import pandas as pd
import re
import datetime
from pyspark.sql import Row

# 1. Derive the landing-volume folders and the bronze target table name
#    from the constants resolved in the setup cell.

# Root of the Unity Catalog volume that receives patch files (trailing slash kept).
source_volume = "/Volumes/{}/{}/{}/".format(
    LANDING_CATALOG, LANDING_SCHEMA, LANDING_PATCH_VOLUME
)

# One sub-folder per lifecycle stage, each ending with a slash so that a bare
# filename can be appended directly.
landing_pending_folder = "{}{}/{}/".format(
    source_volume, PENDING_FOLDER, POST_PATCH_SUBFOLDER
)
landing_processed_folder = "{}{}/{}/".format(
    source_volume, PROCESSED_FOLDER, POST_PATCH_SUBFOLDER
)
landing_errors_folder = "{}{}/{}/".format(
    source_volume, ERRORS_FOLDER, POST_PATCH_SUBFOLDER
)

# Three-part name (catalog.schema.table) of the table the patches are merged into.
bronze_post_patch_table = "{}.{}.{}".format(
    BRONZE_CATALOG, BRONZE_SCHEMA, BRONZE_POST_PATCH_TABLE
)

# 2. execute the ingestion
# Naive UTC timestamp captured once for this run (not read again in the
# visible part of the notebook).
ingestion_timestamp = datetime.datetime.utcnow()

# Collect (path, last-modified time as a UTC-aware datetime) for every entry
# currently in the pending folder; modificationTime is interpreted as epoch
# milliseconds.
patch_files_info = []
for entry in dbutils.fs.ls(landing_pending_folder):
    modified_at = pd.to_datetime(entry.modificationTime, unit='ms', utc=True)
    patch_files_info.append((entry.path, modified_at.to_pydatetime()))

# Ingest every pending file: valid .xlsx workbooks are upserted into the bronze
# table (keyed on post_url) and moved to the processed folder; anything else,
# or anything that fails, is moved to the errors folder.

# Columns every patch workbook must contain (extra columns are tolerated).
expected_columns = {"post_url", "true_url", "title", "content"}

# NOTE: file_timestamp is unpacked but not used in the visible code.
for file_path, file_timestamp in patch_files_info:

    # extract filename from file path
    filename = file_path.split('/')[-1]

    # Source and target paths rebuilt from the plain "/Volumes/..." folders.
    pending_path = landing_pending_folder + filename
    processed_path = landing_processed_folder + filename
    errors_path = landing_errors_folder + filename

    # Guard clause: anything that is not an .xlsx workbook goes straight to errors.
    if not filename.endswith('.xlsx'):
        try:
            print(f"Invalid filename: Moving {pending_path} to {errors_path}")
            dbutils.fs.mv(pending_path, errors_path)
        except Exception as e:
            print(f"Failed to move file {pending_path}: {e}")
        continue

    try:
        # Read the workbook through the POSIX-style volume path. The path
        # reported by dbutils.fs.ls may carry a "dbfs:" scheme prefix, which
        # pandas cannot open as a local file.
        df = pd.read_excel(pending_path)

        # Reject workbooks that lack any of the required columns.
        if not expected_columns.issubset(set(df.columns)):
            print(f"Schema mismatch in {file_path}. Expected columns: {expected_columns}, found: {set(df.columns)}")
            dbutils.fs.mv(pending_path, errors_path)
            continue

        # Convert pandas DataFrame to Spark DataFrame
        spark_df = spark.createDataFrame(df)

        if spark.catalog.tableExists(bronze_post_patch_table):
            # Upsert: rows whose post_url already exists are overwritten,
            # new post_urls are inserted.
            delta_table = DeltaTable.forName(spark, bronze_post_patch_table)
            delta_table.alias("t").merge(
                spark_df.alias("s"),
                "t.post_url = s.post_url"
            ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
        else:
            # First run: create the table from this file's contents.
            spark_df.write.format("delta").saveAsTable(bronze_post_patch_table)

        print(f"Processed: Moving {pending_path} to {processed_path}")
        dbutils.fs.mv(pending_path, processed_path)

    except Exception as e:
        print(e)
        print(f"Errors encountered: Moving {pending_path} to {errors_path}")
        # Guard the move as well, so one file that cannot be moved does not
        # abort the processing of the remaining files.
        try:
            dbutils.fs.mv(pending_path, errors_path)
        except Exception as move_error:
            print(f"Failed to move file {pending_path}: {move_error}")

