In [0]:
%pip install openpyxl

In [0]:
import re

def get_valid_parameter(parameter_key: str):
    """
    Fetch a Databricks widget value and reject anything that looks like
    SQL-injection input.

    Validation happens in two passes:
      1. whitelist - the value must consist solely of ASCII letters, digits
         and underscores (so an empty value is rejected too);
      2. blacklist - the value must not match any of the forbidden patterns
         (case-insensitive). Because pass 1 already excludes punctuation,
         only the ``xp_`` pattern and the whole-word SQL keywords can still
         match at this point; the remaining patterns act as defence in depth.

    Args:
        parameter_key: name of the widget to read via ``dbutils.widgets.get``.

    Returns:
        The widget value, unchanged, once it has passed both checks.

    Raises:
        ValueError: if the value fails the whitelist or matches a forbidden
            pattern. The message names the widget, the value and (for the
            blacklist) the pattern that matched.
    """
    widget_value = dbutils.widgets.get(parameter_key)

    # Pass 1: whitelist of allowed characters
    if re.fullmatch(r'[a-zA-Z0-9_]+', widget_value) is None:
        raise ValueError(f"Invalid parameter value for {parameter_key}: {widget_value}")

    # Pass 2: blacklist of dangerous SQL keywords and patterns
    forbidden_patterns = (
        r'--', r';', r"'", r'"', r'/\*', r'\*/', r'xp_',
        r'char\(', r'nchar\(', r'varchar\(',
        r'\balter\b', r'\bdrop\b', r'\binsert\b', r'\bdelete\b',
        r'\bupdate\b', r'\bselect\b', r'\bcreate\b', r'\bexec\b',
        r'\bunion\b', r'\bor\b', r'\band\b',
    )
    # The first pattern (in list order) that matches is the one reported.
    offending_pattern = next(
        (
            candidate
            for candidate in forbidden_patterns
            if re.search(candidate, widget_value, re.IGNORECASE)
        ),
        None,
    )
    if offending_pattern is not None:
        raise ValueError(f"Potentially dangerous value for {parameter_key}: {widget_value} (pattern matched: {offending_pattern})")

    return widget_value

# Process daily analytics data

In [0]:
import pandas as pd
import re
from delta.tables import DeltaTable

# 1. define our constants
# Fallback values, keyed by widget name. The key order is also the order in
# which the widgets are read. If reading or validating ANY widget fails, the
# whole set of defaults is used instead (no mixing of widget and default values).
_DEFAULT_SETTINGS = {
  "LINKEDIN_PROFILE_NAME": r"[a-zA-Z0-9]+", # use your own profile name here

  "LANDING_CATALOG": "landing",
  "LANDING_SCHEMA": "linkedin",
  "LANDING_DAILY_VOLUME": "content_daily",

  "PENDING_FOLDER": "pending",
  "PROCESSED_FOLDER": "processed",
  "ERRORS_FOLDER": "errors",

  "BRONZE_CATALOG": "bronze",
  "BRONZE_SCHEMA": "linkedin",
  "BRONZE_DISCOVERY_TABLE": "discovery",
  "BRONZE_TOTALS_TABLE": "totals",
  "BRONZE_FOLLOWERS_TABLE": "followers",

  "BRONZE_IMPRESSIONS_TABLE": "impressions",
  "BRONZE_ENGAGEMENTS_TABLE": "engagements",
}

try:
  _settings = {
    widget_name: get_valid_parameter(widget_name)
    for widget_name in _DEFAULT_SETTINGS
  }
  print("Loaded all widget values")
except Exception as e:
  _settings = dict(_DEFAULT_SETTINGS)
  print(f"Failed to load widget values ({e}), using default values")

# Publish the resolved settings under the names the rest of the notebook uses.
LINKEDIN_PROFILE_NAME = _settings["LINKEDIN_PROFILE_NAME"]

LANDING_CATALOG = _settings["LANDING_CATALOG"]
LANDING_SCHEMA = _settings["LANDING_SCHEMA"]
LANDING_DAILY_VOLUME = _settings["LANDING_DAILY_VOLUME"]

PENDING_FOLDER = _settings["PENDING_FOLDER"]
PROCESSED_FOLDER = _settings["PROCESSED_FOLDER"]
ERRORS_FOLDER = _settings["ERRORS_FOLDER"]

BRONZE_CATALOG = _settings["BRONZE_CATALOG"]
BRONZE_SCHEMA = _settings["BRONZE_SCHEMA"]
BRONZE_DISCOVERY_TABLE = _settings["BRONZE_DISCOVERY_TABLE"]
BRONZE_TOTALS_TABLE = _settings["BRONZE_TOTALS_TABLE"]
BRONZE_FOLLOWERS_TABLE = _settings["BRONZE_FOLLOWERS_TABLE"]

BRONZE_IMPRESSIONS_TABLE = _settings["BRONZE_IMPRESSIONS_TABLE"]
BRONZE_ENGAGEMENTS_TABLE = _settings["BRONZE_ENGAGEMENTS_TABLE"]

# 2. set our input and output variables
# Landing side: one Unity Catalog volume with a sub-folder per file state.
# Every folder path ends with "/" so a filename can be appended directly.
source_volume = f"/Volumes/{LANDING_CATALOG}/{LANDING_SCHEMA}/{LANDING_DAILY_VOLUME}/"

landing_pending_folder = source_volume + f"{PENDING_FOLDER}/"
landing_processed_folder = source_volume + f"{PROCESSED_FOLDER}/"
landing_errors_folder = source_volume + f"{ERRORS_FOLDER}/"

# Bronze side: every table lives under the same <catalog>.<schema> namespace.
_bronze_namespace = f"{BRONZE_CATALOG}.{BRONZE_SCHEMA}"

bronze_totals_table = f"{_bronze_namespace}.{BRONZE_TOTALS_TABLE}"
bronze_discovery_table = f"{_bronze_namespace}.{BRONZE_DISCOVERY_TABLE}"
bronze_followers_table = f"{_bronze_namespace}.{BRONZE_FOLLOWERS_TABLE}"

# Prefixes only: the analytics date is appended per file to form the
# staging table names.
bronze_impressions_table_prefix = f"{_bronze_namespace}.{BRONZE_IMPRESSIONS_TABLE}"
bronze_engagements_table_prefix = f"{_bronze_namespace}.{BRONZE_ENGAGEMENTS_TABLE}"

# 3. execute the ingestion
# One timezone-aware UTC timestamp shared by every row written during this run.
# Timestamp.now(tz="UTC") replaces Timestamp.utcnow(), which is deprecated
# since pandas 2.2; both return a tz-aware UTC Timestamp.
ingestion_timestamp = pd.Timestamp.now(tz="UTC")

# extract the list of files from the pending folder as
# (path, modification time) pairs; modificationTime is read as epoch
# milliseconds and converted to a UTC timestamp
daily_files_info = [
    (f.path, pd.to_datetime(f.modificationTime, unit='ms', utc=True))
    for f in dbutils.fs.ls(landing_pending_folder)
]

# get current count of tables in bronze schema
table_count_in_bronze_schema = spark.sql(
    f"SHOW TABLES IN {BRONZE_CATALOG}.{BRONZE_SCHEMA}"
).count()

# analytics dates (as YYYY-MM-DD strings) successfully ingested in this run
dates_processed = []

# Upper bound on the number of tables in the bronze schema
# (specific to Databricks Free Edition).
MAX_TABLES_IN_BRONZE_SCHEMA = 100

# Expected export name: Content_<date>_<same date>_<profile>.xlsx (from and to
# date are the same). Group 1 captures the analytics date. Compiled once because
# it does not change between files. `search` (not `fullmatch`) is used so that
# filenames accepted before this change are still accepted.
daily_filename_pattern = re.compile(
  r'Content_(\d{4}-\d{2}-\d{2})_\1_' + LINKEDIN_PROFILE_NAME + r'\.xlsx'
)


def _with_lineage(frame, source_file, source_file_timestamp):
  """Return a copy of `frame` with the three lineage columns appended.

  Adds `ingestion_timestamp` (the run-level timestamp defined earlier in the
  notebook), `source_file` and `source_file_timestamp`, in that order. A new
  frame is returned, so columns are never assigned onto a slice.
  """
  return frame.assign(
    ingestion_timestamp=ingestion_timestamp,
    source_file=source_file,
    source_file_timestamp=source_file_timestamp,
  )


def _upsert_by_date(frame, table_name):
  """Upsert a pandas frame into the Delta table `table_name`, keyed on `date`.

  If the table exists, rows with a matching `date` are updated and all other
  rows are inserted; if it does not exist, it is created from the frame.
  Empty frames are skipped, so no empty table is ever created.
  """
  if frame.empty:
    return
  if spark.catalog.tableExists(table_name):
    (
      DeltaTable.forName(spark, table_name)
      .alias("t")
      .merge(spark.createDataFrame(frame).alias("s"), "t.date = s.date")
      .whenMatchedUpdateAll()
      .whenNotMatchedInsertAll()
      .execute()
    )
  else:
    spark.createDataFrame(frame).write.format("delta").saveAsTable(table_name)


def _move_to_errors(pending_path, errors_path):
  """Move a file to the errors folder; a failed move is reported, not raised,
  so one stuck file cannot abort the rest of the run."""
  try:
    dbutils.fs.mv(pending_path, errors_path)
  except Exception as move_error:
    print(f"Failed to move file {pending_path}: {move_error}")


for file_path, file_timestamp in daily_files_info:

  # extract filename from file path
  filename = file_path.split('/')[-1]
  if not filename:
    # A path ending in "/" yields an empty name (presumably a sub-directory
    # entry - TODO confirm). Without this guard the paths below would point
    # at the pending folder itself.
    print(f"Skipping non-file entry {file_path}")
    continue

  # define source and target paths for file
  pending_path = landing_pending_folder + filename
  processed_path = landing_processed_folder + filename
  errors_path = landing_errors_folder + filename

  # check if filename is of expected format (from and to date are the same)
  # and carries a real calendar date
  filename_match = daily_filename_pattern.search(filename)
  analytics_date = None
  if filename_match is not None:
    analytics_date_str = filename_match.group(1)
    try:
      analytics_date = pd.to_datetime(analytics_date_str).date()
    except ValueError:
      # digits matched the pattern but do not form a valid date;
      # handled as an invalid filename just below
      analytics_date = None

  if analytics_date is None:
    # move invalid filename to errors folder
    print(f"Invalid filename: Moving {pending_path} to {errors_path}")
    _move_to_errors(pending_path, errors_path)
    continue

  # append date of analytics to staging table names; the suffix is computed
  # first so the f-strings stay on one line (multi-line replacement fields in
  # single-quoted f-strings need Python 3.12+)
  staging_suffix = analytics_date_str.replace('-', '_')
  bronze_engagements_table = f"{bronze_engagements_table_prefix}_{staging_suffix}"
  bronze_impressions_table = f"{bronze_impressions_table_prefix}_{staging_suffix}"

  # count the staging tables this file would add to the bronze schema
  tables_to_create = sum(
    not spark.catalog.tableExists(staging_table)
    for staging_table in (bronze_engagements_table, bronze_impressions_table)
  )

  # stop if too many tables in bronze schema; the file stays in pending
  if table_count_in_bronze_schema + tables_to_create > MAX_TABLES_IN_BRONZE_SCHEMA:
    print("Too many tables in bronze schema. Please process and clear staging tables before rerunning.")
    break

  # process valid filename
  try:
    try:
      # read members reached from xlsx file
      print(f"Processing DISCOVERY sheet in {pending_path}")
      discovery_df = (
        pd.read_excel(pending_path, sheet_name="DISCOVERY")
        .set_index("Overall Performance")
        .transpose()
        .dropna()
        .reset_index(drop=True)
      )
      discovery_df.columns = discovery_df.columns.str.lower().str.replace(' ', '_')
      discovery_df = _with_lineage(
        discovery_df[['members_reached']].assign(date=analytics_date),
        filename,
        file_timestamp,
      )

      # Write members reached to Delta table with upsert logic
      print(f"Writing to {bronze_discovery_table}")
      _upsert_by_date(discovery_df, bronze_discovery_table)
    except Exception as e:
      print(f"Error writing to {bronze_discovery_table}: {e}")
      # Re-raise instead of calling exit(): the outer handler then moves the
      # file to the errors folder like any other failure, and the kernel is
      # left running.
      raise

    # read totals from xlsx file
    print(f"Processing ENGAGEMENT sheet in {pending_path}")
    totals_df = pd.read_excel(
      pending_path,
      sheet_name="ENGAGEMENT",
      parse_dates=["Date"]
    ).dropna()
    totals_df.columns = totals_df.columns.str.lower().str.replace(' ', '_')
    totals_df = _with_lineage(totals_df, filename, file_timestamp)

    # Write totals to Delta table with upsert logic
    print(f"Writing to {bronze_totals_table}")
    _upsert_by_date(totals_df, bronze_totals_table)

    # read followers from xlsx file
    print(f"Processing FOLLOWERS sheet in {pending_path}")
    followers_df = pd.read_excel(
      pending_path,
      sheet_name="FOLLOWERS",
      parse_dates=["Date"],
      skiprows=2
    ).dropna()
    followers_df.columns = followers_df.columns.str.lower().str.replace(' ', '_')
    followers_df = _with_lineage(followers_df, filename, file_timestamp)

    # Write followers to Delta table with upsert logic
    print(f"Writing to {bronze_followers_table}")
    _upsert_by_date(followers_df, bronze_followers_table)

    # read top posts from xlsx file; the first three columns are taken as
    # engagements and the columns from index 4 onwards as impressions
    print(f"Processing TOP POSTS sheet in {pending_path}")
    topposts_df = pd.read_excel(pending_path, sheet_name="TOP POSTS", skiprows=2)

    engagements_df = topposts_df.iloc[:, :3].dropna()
    if not engagements_df.empty:
      engagements_df.columns = engagements_df.columns.str.replace(' ', '_').str.lower()
      engagements_df = _with_lineage(
        engagements_df.assign(analytics_date=analytics_date),
        filename,
        file_timestamp,
      )

    impressions_df = topposts_df.iloc[:, 4:].dropna()
    if not impressions_df.empty:
      # strip the ".1" text from the column names before normalising them
      impressions_df.columns = impressions_df.columns.str.replace('.1', '', regex=False).str.replace(' ', '_').str.lower()
      impressions_df = _with_lineage(
        impressions_df.assign(analytics_date=analytics_date),
        filename,
        file_timestamp,
      )

    # Write engagements to Delta staging table, overwrite existing
    if not engagements_df.empty:
      print(f"Writing to {bronze_engagements_table}")
      spark.createDataFrame(engagements_df).write.mode("overwrite").saveAsTable(
        bronze_engagements_table
      )
    else:
      print(f"Skipped writing empty dataset to {bronze_engagements_table}")

    # Write impressions to Delta staging table, overwrite existing
    if not impressions_df.empty:
      print(f"Writing to {bronze_impressions_table}")
      spark.createDataFrame(impressions_df).write.mode("overwrite").saveAsTable(
        bronze_impressions_table
      )
    else:
      print(f"Skipped writing empty dataset to {bronze_impressions_table}")

    # update counters for dates processed and tables in bronze schema
    dates_processed.append(analytics_date_str)
    table_count_in_bronze_schema += tables_to_create

    print(f"Processed: Moving {pending_path} to {processed_path}")
    dbutils.fs.mv(pending_path, processed_path)

  except Exception as e:
    print(e)
    print(f"Errors encountered: Moving {pending_path} to {errors_path}")
    _move_to_errors(pending_path, errors_path)

# Report which analytics dates were ingested during this run.
if dates_processed:
    print(f"Dates processed: {dates_processed}")
else:
    print("No dates processed.")

