In [0]:
%pip install openpyxl

In [0]:
try:
    import sys
    import os

    # Make the shared ../utils folder importable relative to this file.
    # NOTE(review): `__file__` is typically undefined inside a notebook cell, in
    # which case this raises NameError and the fallback below is used - confirm.
    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../utils')))

    from pipeline_utils import get_valid_parameter
except Exception:
    # Any ordinary failure above (missing module, undefined __file__, ...) means
    # the shared helper is unavailable, so define an equivalent locally.
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt
    # and SystemExit propagate instead of being swallowed.
    import re

    def get_valid_parameter(parameter_key: str) -> str:
        """
        Get a valid parameter value from the Databricks widgets.
        Hardened to thwart SQL injection attacks.

        Args:
            parameter_key: Name of the widget to read via ``dbutils.widgets.get``.

        Returns:
            The widget value, unchanged, once it has passed both checks below.

        Raises:
            ValueError: If the value is empty, contains anything other than
                ASCII letters, digits and underscores, or matches one of the
                forbidden SQL patterns (case-insensitive).
        """
        parameter_value = dbutils.widgets.get(parameter_key)

        # Parameter_value must be a string with only alphanumeric characters and underscores
        if not re.fullmatch(r'[a-zA-Z0-9_]+', parameter_value):
            raise ValueError(f"Invalid parameter value for {parameter_key}: {parameter_value}")

        # Disallow dangerous SQL keywords and patterns.
        # The punctuation patterns can no longer match once the allow-list above
        # has passed; they are kept as defence in depth. The keyword patterns
        # still reject values such as "drop" or anything containing "xp_".
        forbidden_patterns = [
            r'--', r';', r"'", r'"', r'/\*', r'\*/', r'xp_', r'char\(', r'nchar\(', r'varchar\(', r'\balter\b', r'\bdrop\b', r'\binsert\b', r'\bdelete\b', r'\bupdate\b', r'\bselect\b', r'\bcreate\b', r'\bexec\b', r'\bunion\b', r'\bor\b', r'\band\b'
        ]
        for pattern in forbidden_patterns:
            if re.search(pattern, parameter_value, re.IGNORECASE):
                raise ValueError(f"Potentially dangerous value for {parameter_key}: {parameter_value} (pattern matched: {pattern})")
        return parameter_value


# Process historical analytics data


In [0]:
import pandas as pd
import re
from delta.tables import DeltaTable

# 1. define our constants
# Every setting is read from a Databricks widget through get_valid_parameter.
# If any single lookup fails, ALL settings fall back to the hard-coded defaults
# below, so the run never mixes widget values with defaults.
try:
  LINKEDIN_PROFILE_NAME = get_valid_parameter("LINKEDIN_PROFILE_NAME")

  LANDING_CATALOG = get_valid_parameter("LANDING_CATALOG")
  LANDING_SCHEMA = get_valid_parameter("LANDING_SCHEMA")
  LANDING_DAILY_VOLUME = get_valid_parameter("LANDING_DAILY_VOLUME")

  PENDING_FOLDER = get_valid_parameter("PENDING_FOLDER")
  PROCESSED_FOLDER = get_valid_parameter("PROCESSED_FOLDER")
  ERRORS_FOLDER = get_valid_parameter("ERRORS_FOLDER")

  BRONZE_CATALOG = get_valid_parameter("BRONZE_CATALOG")
  BRONZE_SCHEMA = get_valid_parameter("BRONZE_SCHEMA")
  BRONZE_TOTALS_TABLE = get_valid_parameter("BRONZE_TOTALS_TABLE")
  BRONZE_FOLLOWERS_TABLE = get_valid_parameter("BRONZE_FOLLOWERS_TABLE")

  print("Loaded all widget values")
except Exception as widget_error:
  # `except Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
  # through. The cause is printed because this branch is also taken when a
  # widget value is rejected by validation, which should not pass unnoticed.
  print(f"Widget lookup failed: {widget_error!r}")

  # This default is a regular expression, not a literal name: it is spliced
  # into the filename pattern used when scanning the pending folder.
  LINKEDIN_PROFILE_NAME = r"[a-zA-Z0-9]+" # use your own profile name here

  LANDING_CATALOG = "landing"
  LANDING_SCHEMA = "linkedin"
  LANDING_DAILY_VOLUME = "content_historical"

  PENDING_FOLDER = "pending"
  PROCESSED_FOLDER = "processed"
  ERRORS_FOLDER = "errors"

  BRONZE_CATALOG = "bronze"
  BRONZE_SCHEMA = "linkedin"
  BRONZE_TOTALS_TABLE = "totals"
  BRONZE_FOLLOWERS_TABLE = "followers"

  print("Failed to load widget values, using default values")

# 2. set our input and output variables
# Root of the landing volume: /Volumes/<catalog>/<schema>/<volume>/
source_volume = "/Volumes/" + "/".join(
  (LANDING_CATALOG, LANDING_SCHEMA, LANDING_DAILY_VOLUME)
) + "/"

# Sub-folders of the volume; each path keeps a trailing slash so a filename
# can be appended directly.
landing_pending_folder = source_volume + PENDING_FOLDER + "/"
landing_processed_folder = source_volume + PROCESSED_FOLDER + "/"
landing_errors_folder = source_volume + ERRORS_FOLDER + "/"

# Three-part table names: <catalog>.<schema>.<table>
bronze_totals_table = ".".join(
  (BRONZE_CATALOG, BRONZE_SCHEMA, BRONZE_TOTALS_TABLE)
)

bronze_followers_table = ".".join(
  (BRONZE_CATALOG, BRONZE_SCHEMA, BRONZE_FOLLOWERS_TABLE)
)


# 3. execute the ingestion
# A single timezone-aware UTC timestamp shared by every row written in this run.
# Timestamp.now(tz="UTC") is the replacement for Timestamp.utcnow(), which is
# deprecated in recent pandas releases.
ingestion_timestamp = pd.Timestamp.now(tz="UTC")

# extract the list of files from the pending folder
# Each entry is (path, modification time); modificationTime is interpreted as
# epoch milliseconds and converted to a timezone-aware UTC timestamp.
historical_files_info = [
    (f.path, pd.to_datetime(f.modificationTime, unit='ms', utc=True))
    for f in dbutils.fs.ls(landing_pending_folder)
]

def _load_sheet(xlsx_path, sheet_name, source_file, source_file_timestamp,
                run_timestamp, skiprows=None):
  """Read one worksheet of an export into a pandas DataFrame ready for writing.

  Rows containing any missing value are dropped, then the last remaining row
  is dropped as well (presumably a trailing summary/partial row - TODO confirm).
  Column names are lower-cased with spaces replaced by underscores, and three
  lineage columns are added: ingestion_timestamp, source_file and
  source_file_timestamp.

  Args:
    xlsx_path: Path of the workbook to read.
    sheet_name: Name of the worksheet inside the workbook.
    source_file: Filename recorded in the `source_file` column.
    source_file_timestamp: Value recorded in `source_file_timestamp`.
    run_timestamp: Value recorded in `ingestion_timestamp`.
    skiprows: Forwarded to `pd.read_excel` (None skips nothing).

  Returns:
    The cleaned pandas DataFrame (possibly empty).
  """
  sheet_df = pd.read_excel(
    xlsx_path,
    sheet_name=sheet_name,
    parse_dates=["Date"],
    skiprows=skiprows
  ).dropna().iloc[:-1].copy()  # copy: the columns added below go onto an independent frame
  sheet_df.columns = sheet_df.columns.str.lower().str.replace(' ', '_')
  sheet_df['ingestion_timestamp'] = run_timestamp
  sheet_df['source_file'] = source_file
  sheet_df['source_file_timestamp'] = source_file_timestamp
  return sheet_df


def _upsert_by_date(pandas_df, table_name):
  """Upsert a pandas DataFrame into a Delta table, matching rows on `date`.

  Does nothing for an empty frame. If the table already exists, matching dates
  are updated and new dates inserted; otherwise the table is created from the
  frame.
  """
  if pandas_df.empty:
    return
  if spark.catalog.tableExists(table_name):
    DeltaTable.forName(spark, table_name).alias("t").merge(
      spark.createDataFrame(pandas_df).alias("s"),
      "t.date = s.date"
    ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
  else:
    spark.createDataFrame(pandas_df).write.format("delta").saveAsTable(
      table_name
    )


def _move_to_errors(src_path, dst_path):
  """Move a file to the errors folder; report a failed move instead of raising.

  A failed move must not abort the batch, otherwise every remaining pending
  file would be left unprocessed.
  """
  try:
    dbutils.fs.mv(src_path, dst_path)
  except Exception as move_error:
    print(f"Failed to move file {src_path}: {move_error}")


# Expected name: Content_<YYYY-MM-DD>_<YYYY-MM-DD>_<profile>.xlsx
# LINKEDIN_PROFILE_NAME is spliced in as a regular expression (not escaped).
# Compiled once because it does not change between files.
filename_pattern = re.compile(
  r'Content_\d{4}-\d{2}-\d{2}_\d{4}-\d{2}-\d{2}_'
  + LINKEDIN_PROFILE_NAME + r'\.xlsx'
)

for file_path, file_timestamp in historical_files_info:

  # extract filename from file path
  filename = file_path.split('/')[-1]

  # A path ending in '/' (a sub-directory entry) yields an empty filename.
  # Skip it: otherwise pending_path would be the pending folder itself and the
  # "invalid filename" branch would try to move that whole folder.
  if not filename:
    print(f"Skipping directory entry: {file_path}")
    continue

  # define source and target paths for file
  pending_path = landing_pending_folder + filename
  processed_path = landing_processed_folder + filename
  errors_path = landing_errors_folder + filename

  # check if filename is of expected format
  # fullmatch: the whole name must match, so names with extra leading or
  # trailing text (e.g. "...xlsx.tmp") are treated as invalid.
  if filename_pattern.fullmatch(filename):
    # process valid filename
    try:
      # read totals from xlsx file
      print(f"Processing ENGAGEMENT sheet in {pending_path}")
      totals_df = _load_sheet(
        pending_path, "ENGAGEMENT", filename, file_timestamp,
        ingestion_timestamp
      )

      # Write totals to Delta table with upsert logic
      print(f"Writing to {bronze_totals_table}")
      _upsert_by_date(totals_df, bronze_totals_table)

      # read followers from xlsx file
      print(f"Processing FOLLOWERS sheet in {pending_path}")
      followers_df = _load_sheet(
        pending_path, "FOLLOWERS", filename, file_timestamp,
        ingestion_timestamp, skiprows=2
      )

      # Write followers to Delta table with upsert logic
      print(f"Writing to {bronze_followers_table}")
      _upsert_by_date(followers_df, bronze_followers_table)

      print(f"Processed: Moving {pending_path} to {processed_path}")
      dbutils.fs.mv(pending_path, processed_path)
    except Exception as e:
      # NOTE: if the failure happened after the totals upsert, those rows stay
      # written; the upsert is keyed on date, so re-ingesting the file later
      # updates them rather than duplicating them.
      print(e)
      print(f"Errors encountered: Moving {pending_path} to {errors_path}")
      _move_to_errors(pending_path, errors_path)
  else:
    # move invalid filename to errors folder
    print(f"Invalid filename: Moving {pending_path} to {errors_path}")
    _move_to_errors(pending_path, errors_path)
