In [0]:
%pip install openpyxl bs4

In [0]:
import re

def get_valid_parameter(parameter_key: str):
    """
    Read a Databricks widget value and reject anything unsafe to splice into SQL.

    The value is fetched with ``dbutils.widgets.get`` and has to pass two checks:
    it must consist solely of ASCII letters, digits and underscores, and it must
    not match any entry of a deny-list of SQL fragments and keywords (matched
    case-insensitively).

    Raises:
        ValueError: if either check fails. The message names the widget, the
            rejected value and, for the deny-list, the pattern that matched.
    """
    parameter_value = dbutils.widgets.get(parameter_key)

    # Allow-list: the whole value must be made of letters, digits and underscores.
    if re.fullmatch(r'[a-zA-Z0-9_]+', parameter_value) is None:
        raise ValueError(f"Invalid parameter value for {parameter_key}: {parameter_value}")

    # Deny-list: scanned in declaration order so the first matching entry is the
    # one reported in the error message.
    forbidden_patterns = (
        r'--', r';', r"'", r'"', r'/\*', r'\*/',
        r'xp_', r'char\(', r'nchar\(', r'varchar\(',
        r'\balter\b', r'\bdrop\b', r'\binsert\b', r'\bdelete\b', r'\bupdate\b',
        r'\bselect\b', r'\bcreate\b', r'\bexec\b', r'\bunion\b',
        r'\bor\b', r'\band\b',
    )
    pattern = next(
        (
            candidate
            for candidate in forbidden_patterns
            if re.search(candidate, parameter_value, re.IGNORECASE)
        ),
        None,
    )
    if pattern is not None:
        raise ValueError(f"Potentially dangerous value for {parameter_key}: {parameter_value} (pattern matched: {pattern})")

    return parameter_value

# 0. Set up constants

In [0]:
# Load every configuration value from the notebook widgets. If anything goes wrong
# (widgets not defined, a value rejected by get_valid_parameter, ...), fall back to
# the hard-coded defaults below so the notebook can still be run.
try:
    LINKEDIN_PROFILE_NAME = get_valid_parameter("LINKEDIN_PROFILE_NAME")

    LANDING_CATALOG = get_valid_parameter("LANDING_CATALOG")
    LANDING_SCHEMA = get_valid_parameter("LANDING_SCHEMA")
    LANDING_POSTS_VOLUME = get_valid_parameter("LANDING_POSTS_VOLUME")

    PENDING_FOLDER = get_valid_parameter("PENDING_FOLDER")
    PROCESSED_FOLDER = get_valid_parameter("PROCESSED_FOLDER")
    ERRORS_FOLDER = get_valid_parameter("ERRORS_FOLDER")

    BRONZE_CATALOG = get_valid_parameter("BRONZE_CATALOG")
    BRONZE_SCHEMA = get_valid_parameter("BRONZE_SCHEMA")
    BRONZE_IMPRESSIONS_TABLE = get_valid_parameter("BRONZE_IMPRESSIONS_TABLE")
    BRONZE_POSTS_TABLE = get_valid_parameter("BRONZE_POSTS_TABLE")
    BRONZE_POST_DETAILS_TABLE = get_valid_parameter("BRONZE_POST_DETAILS_TABLE")

    print("Loaded all widget values")
except Exception as widget_error:
    # `Exception` (not a bare `except`) so KeyboardInterrupt / SystemExit still
    # propagate. The cause is printed so that a rejected widget value is not
    # silently replaced by a default.
    print(f"Could not load widget values: {type(widget_error).__name__}: {widget_error}")

    # Used verbatim inside the filename regex of the analytics ingestion cell, so the
    # default is a regex fragment matching any alphanumeric profile name.
    LINKEDIN_PROFILE_NAME = r"[a-zA-Z0-9]+" # use your own profile name here

    LANDING_CATALOG = "landing"
    LANDING_SCHEMA = "linkedin"
    LANDING_POSTS_VOLUME = "posts"

    PENDING_FOLDER = "pending"
    PROCESSED_FOLDER = "processed"
    ERRORS_FOLDER = "errors"

    BRONZE_CATALOG = "bronze"
    BRONZE_SCHEMA = "linkedin"
    BRONZE_IMPRESSIONS_TABLE = "impressions"
    BRONZE_POSTS_TABLE = "posts"
    BRONZE_POST_DETAILS_TABLE = "post_details"

    print("Failed to load widget values, using default values")


# 1. Helper functions for fetching and parsing LinkedIn post HTML content

In [0]:
# Helper functions for fetching and parsing LinkedIn post HTML content
import requests
from bs4 import BeautifulSoup
import time

def get_html_content(url: str, max_retries: int = 5, base_delay: float = 5.0, timeout: float = 30.0):
    """
    Fetches HTML content from a LinkedIn post URL with exponential backoff.

    Args:
        url: Address of the page to download.
        max_retries: Total number of attempts (must be at least 1).
        base_delay: Seconds to wait after the first failed attempt; the wait
            doubles after every further failure.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so that a
            stalled connection fails (and is retried) instead of blocking forever.

    Returns:
        The raw response body (``response.content``) of the first successful attempt.

    Raises:
        ValueError: if ``max_retries`` is smaller than 1.
        Exception: whatever the last attempt raised (connection error, timeout,
            HTTP error status from ``raise_for_status``, ...).
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and the function would
        # silently return None.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    for attempt in range(max_retries):
        try:
            print(f"Fetching HTML content from URL: {url} (Attempt {attempt + 1})")
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            print(f"Successfully fetched HTML content from {url}")
            return response.content
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            if attempt < max_retries - 1:
                # Exponential backoff: base_delay, 2*base_delay, 4*base_delay, ...
                delay = base_delay * (2 ** attempt)
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"Failed to fetch HTML content from {url} after {max_retries} attempts.")
                # Bare raise keeps the original traceback of the final failure.
                raise

def get_content(html_content):
    """
    Return the text of the post body contained in a post page.

    The markup is parsed with BeautifulSoup's "html.parser" and searched for a
    ``<p>`` element carrying the ``attributed-text-segment-list__content`` class.
    The element's text is returned, or ``None`` when no such element is found.
    """
    parsed_page = BeautifulSoup(html_content, "html.parser")
    post_body = parsed_page.find('p', class_="attributed-text-segment-list__content")
    return post_body.get_text() if post_body else None

# 2. Create patch table for LinkedIn posts if it doesn't exist

In [0]:
%sql
-- Create the post "patch" table if it does not exist yet.
-- The three-part table name is assembled from the :BRONZE_CATALOG, :BRONZE_SCHEMA and
-- :BRONZE_POST_PATCH_TABLE parameter markers and resolved through IDENTIFIER().
-- NOTE(review): BRONZE_POST_PATCH_TABLE is not among the parameters loaded in the
-- constants cell of this notebook; confirm that a widget with this name is defined.
CREATE TABLE IF NOT EXISTS IDENTIFIER (:BRONZE_CATALOG || '.' || :BRONZE_SCHEMA || '.' || :BRONZE_POST_PATCH_TABLE) (
  post_url STRING,
  true_url STRING,
  title STRING,
  content STRING
);

# 3. Ingest post metadata

In [0]:
# Extract posts data using post_url from all impressions tables and save to bronze posts table
import pandas as pd
from pyspark.sql.functions import col, row_number, desc
from pyspark.sql.window import Window
from delta.tables import DeltaTable


# Fully qualified names of the bronze tables used by this cell
bronze_impressions_table = f'{BRONZE_CATALOG}.{BRONZE_SCHEMA}.{BRONZE_IMPRESSIONS_TABLE}'
bronze_posts_table = f'{BRONZE_CATALOG}.{BRONZE_SCHEMA}.{BRONZE_POSTS_TABLE}'


def _extract_link_and_title(html_content):
    """
    Parse a post page once and return ``(link, title)``.

    ``link`` is the ``href`` of the first ``<link>`` element; ``title`` is the text
    of ``<head>`` up to the first ``|``, stripped. Either value is ``None`` when
    the corresponding element is missing, so one unexpected page cannot abort the
    whole batch with an AttributeError.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    link_tag = soup.find('link')
    head_tag = soup.find("head")
    link = link_tag.get('href') if link_tag is not None else None
    title = (
        head_tag.get_text(strip=True).split('|')[0].strip()
        if head_tag is not None
        else None
    )
    return link, title


# List all impressions staging tables in the bronze.linkedin schema
daily_impressions_tables = [
    table.name
    for table in spark.catalog.listTables(f'{BRONZE_CATALOG}.{BRONZE_SCHEMA}')
    if table.name.startswith(BRONZE_IMPRESSIONS_TABLE)
]

print("Loading daily impressions tables to extract posts data...")
# Read each table and select the relevant columns
columns_to_select = [
    "post_url", "post_publish_date",
    "ingestion_timestamp", "source_file", "source_file_timestamp"
]
dfs = [
    spark.read.table(f"{BRONZE_CATALOG}.{BRONZE_SCHEMA}.{table}").select(*columns_to_select)
    for table in daily_impressions_tables
]

# Union all dataframes and keep, per (post_url, post_publish_date), only the row
# with the most recent ingestion_timestamp
if dfs:
    impressions_df = dfs[0]
    for df in dfs[1:]:
        impressions_df = impressions_df.unionByName(df)
    impressions_df = impressions_df.withColumn(
        "row_num",
        row_number().over(
            Window.partitionBy("post_url", "post_publish_date").orderBy(desc("ingestion_timestamp"))
        )
    ).filter(col("row_num") == 1).drop("row_num")
    impressions_pd = impressions_df.toPandas()[columns_to_select]
    impressions_pd['post_publish_date'] = pd.to_datetime(impressions_pd['post_publish_date'])
else:
    impressions_pd = pd.DataFrame(columns=columns_to_select)

if not impressions_pd.empty:
    # Fetch HTML content for each post URL
    impressions_pd['html_content'] = impressions_pd['post_url'].apply(get_html_content)

    # Extract link and title with a single parse per page, then the post content
    link_title_pairs = [
        _extract_link_and_title(html) for html in impressions_pd['html_content']
    ]
    impressions_pd['link'] = [link for link, _ in link_title_pairs]
    impressions_pd['title'] = [title for _, title in link_title_pairs]
    impressions_pd['content'] = impressions_pd['html_content'].apply(get_content)

    impressions_spark_df = spark.createDataFrame(impressions_pd)
    if not spark.catalog.tableExists(bronze_posts_table):
        print(f"Creating {bronze_posts_table} table...")
        impressions_spark_df.write.format("delta").saveAsTable(bronze_posts_table)
    else:
        print(f"Appending new posts to {bronze_posts_table} table...")
        impressions_spark_df.write.format("delta").mode("append").saveAsTable(bronze_posts_table)
else:
    print("No new posts found based on impressions tables.")

# Display sample of posts data for verification. The table only exists once at
# least one batch has been written, so guard the read to keep the cell runnable
# on a fresh workspace.
if spark.catalog.tableExists(bronze_posts_table):
    posts_sample_df = spark.read.table(bronze_posts_table).limit(10)
    display(posts_sample_df)
else:
    print(f"{bronze_posts_table} does not exist yet; nothing to display.")

# 4. Ingest post analytics data

In [0]:
import pandas as pd
import re
import datetime
from pyspark.sql import Row

# 1. set our input and output variables
source_volume = f"/Volumes/{LANDING_CATALOG}/{LANDING_SCHEMA}/{LANDING_POSTS_VOLUME}/"

landing_pending_folder = f"{source_volume}{PENDING_FOLDER}/"
landing_processed_folder = f"{source_volume}{PROCESSED_FOLDER}/"
landing_errors_folder = f"{source_volume}{ERRORS_FOLDER}/"

bronze_post_details_table = f"{BRONZE_CATALOG}.{BRONZE_SCHEMA}.{BRONZE_POST_DETAILS_TABLE}"

# Explicit schema for the single-row DataFrame written per file. Field order matches
# the Row built below. Without it Spark has to infer types from one row and cannot
# do so for a field whose value is None (analytics_date may be None).
POST_DETAILS_SCHEMA = (
    "post_url STRING, link STRING, post_timestamp TIMESTAMP, analytics_date DATE, "
    "performance_metrics STRING, demographics_metrics STRING, "
    "ingestion_timestamp TIMESTAMP, source_file STRING, source_file_timestamp TIMESTAMP"
)

# Expected export name: PostAnalytics_<profile>_<digits>[ (<n>)].xlsx
# LINKEDIN_PROFILE_NAME is inserted verbatim, so it may itself be a regex fragment.
post_analytics_filename_pattern = re.compile(
    r'PostAnalytics_' + LINKEDIN_PROFILE_NAME + r'_\d+(?: \(\d+\))?\.xlsx'
)


def _move_to_errors(pending_path, errors_path):
    """Move a file to the errors folder; a failing move is logged, not raised,
    so that the remaining files are still processed."""
    try:
        dbutils.fs.mv(pending_path, errors_path)
    except Exception as move_error:
        print(f"Failed to move file {pending_path}: {move_error}")


# 2. execute the ingestion
ingestion_timestamp = datetime.datetime.utcnow()

# extract the list of files from the pending folder, together with their
# modification time as a timezone-aware (UTC) datetime
post_files_info = [
    (f.path, pd.to_datetime(f.modificationTime, unit='ms', utc=True).to_pydatetime())
    for f in dbutils.fs.ls(landing_pending_folder)
]

for file_path, file_timestamp in post_files_info:

    # extract filename from file path
    filename = file_path.split('/')[-1]

    # define source and target paths for file
    pending_path = landing_pending_folder + filename
    processed_path = landing_processed_folder + filename
    errors_path = landing_errors_folder + filename

    # check if filename is of expected format; anything else goes to the errors folder
    if not post_analytics_filename_pattern.search(filename):
        print(f"Invalid filename: Moving {pending_path} to {errors_path}")
        _move_to_errors(pending_path, errors_path)
        continue

    # process valid filename
    try:
        # read key/value pairs from the PERFORMANCE sheet of the xlsx file
        print(f"Extracting post URL and timestamp from PERFORMANCE sheet in {pending_path}")
        post_details_raw_df = pd.read_excel(
            pending_path,
            sheet_name="PERFORMANCE",
            header=None,
            names=["key", "value"],
        ).dropna(how='all')

        # Extract timestamp, post URL and date of analytics
        transposed_post_details_df = post_details_raw_df.dropna().set_index('key').transpose()
        post_timedelta_str = transposed_post_details_df["Post Publish Time"].iloc[0]

        # Extract date range of analytics from the "<...> Highlights <start> to <end>" row
        df_with_date_range = post_details_raw_df[
            post_details_raw_df['key'].str.contains(" Highlights ", na=False)
        ]['key']
        if df_with_date_range.empty:
            print("Cannot detect analytics date")
            date_start_str = transposed_post_details_df["Post Date"].iloc[0]
            date_end_str = None
        else:
            date_range_str = df_with_date_range.str.split(" Highlights ").iloc[0][1]
            [date_start_str, date_end_str] = date_range_str.split(" to ")

        # Combine start date and publish time (the value read from "Post Publish Time"
        # above). The f-string also copes with cells pandas returned as non-strings.
        post_timestamp = pd.to_datetime(
            f"{date_start_str} {post_timedelta_str} UTC"
        ).to_pydatetime()

        post_url = transposed_post_details_df["Post URL"].iloc[0]
        if date_end_str:
            analytics_date = pd.to_datetime(date_end_str).to_pydatetime().date()
        else:
            analytics_date = None

        post_top_demographics_df = pd.read_excel(
            pending_path,
            sheet_name="TOP DEMOGRAPHICS",
        ).dropna()
        post_top_demographics_df.columns = (
            post_top_demographics_df.columns.str.lower().str.replace('%', 'percent')
        )

        # Fetch HTML content for the post URL
        print(f"Fetching HTML content and link for {post_url}")
        html_content = get_html_content(post_url)
        # Extract link from the HTML content; fail with a clear message if absent
        link_tag = BeautifulSoup(html_content, "html.parser").find('link')
        if link_tag is None:
            raise ValueError(f"No <link> element found in HTML content of {post_url}")
        link = link_tag.get('href')

        print(f"Preparing post details record for {link}")
        post_details_record = Row(
            post_url=post_url,
            link=link,
            post_timestamp=post_timestamp,
            analytics_date=analytics_date,
            performance_metrics=str(transposed_post_details_df.to_dict()),
            demographics_metrics=str(post_top_demographics_df.to_dict()),
            ingestion_timestamp=ingestion_timestamp,
            source_file=file_path,
            source_file_timestamp=file_timestamp,
        )

        # Write post_details_record to Delta table, upserting on (post_url, analytics_date)
        print(f"Writing to {bronze_post_details_table}: {post_details_record}")
        post_details_spark_df = spark.createDataFrame(
            [post_details_record], schema=POST_DETAILS_SCHEMA
        )
        if spark.catalog.tableExists(bronze_post_details_table):
            delta_table = DeltaTable.forName(spark, bronze_post_details_table)
            delta_table.alias("t").merge(
                post_details_spark_df.alias("s"),
                # `<=>` is null-safe equality: rows whose analytics_date is NULL match
                # each other, so re-ingesting such a file updates instead of duplicating.
                "t.post_url = s.post_url and t.analytics_date <=> s.analytics_date"
            ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
        else:
            post_details_spark_df.write.format("delta").saveAsTable(
                bronze_post_details_table
            )

        print(f"Processed: Moving {pending_path} to {processed_path}")
        dbutils.fs.mv(pending_path, processed_path)

    except Exception as e:
        print(e)
        print(f"Errors encountered: Moving {pending_path} to {errors_path}")
        _move_to_errors(pending_path, errors_path)



In [0]:
%sql
-- Create the post details table if it does not exist yet.
-- The three-part table name is assembled from the :BRONZE_CATALOG, :BRONZE_SCHEMA and
-- :BRONZE_POST_DETAILS_TABLE parameter markers and resolved through IDENTIFIER().
-- The column list mirrors the record written by the analytics ingestion cell,
-- including the source_file / source_file_timestamp lineage columns, so that a
-- table created here accepts every field of the ingested records.
CREATE TABLE IF NOT EXISTS IDENTIFIER (:BRONZE_CATALOG || '.' || :BRONZE_SCHEMA || '.' || :BRONZE_POST_DETAILS_TABLE) (
  post_url STRING,
  link STRING,
  post_timestamp TIMESTAMP,
  analytics_date DATE,
  performance_metrics STRING,
  demographics_metrics STRING,
  ingestion_timestamp TIMESTAMP,
  source_file STRING,
  source_file_timestamp TIMESTAMP
);