# Silver Layer: Posts ETL
This section joins the bronze posts table with the post details and patch tables, keeps one row per post URL (patch values take precedence over the raw post values), and upserts the result into the silver posts table. Ingestion events are logged for traceability.

In [0]:
try:
    import sys
    import os

    # Make the sibling ``utils`` directory importable so the shared helper can be used.
    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../utils')))

    from pipeline_utils import get_valid_parameter_value as get_valid_parameter
except Exception:
    # The shared helper could not be loaded (for example ``__file__`` is not
    # defined in this execution context, or ``pipeline_utils`` is not on the
    # path), so define an equivalent local fallback.
    # ``except Exception`` is used instead of a bare ``except`` so that
    # KeyboardInterrupt and SystemExit are not swallowed here.
    import re

    def get_valid_parameter(parameter_key: str) -> str:
        """
        Get a valid parameter value from the Databricks widgets.
        Hardened to thwart SQL injection attacks.

        Args:
            parameter_key: Name of the widget to read via ``dbutils.widgets.get``.

        Returns:
            The widget value, unchanged, once it has passed validation.

        Raises:
            ValueError: If the value is empty, contains anything other than
                ASCII letters, digits and underscores, or matches one of the
                forbidden SQL patterns.
        """
        parameter_value = dbutils.widgets.get(parameter_key)

        # Allow-list: the value is interpolated into SQL identifiers, so only
        # alphanumeric characters and underscores are accepted (never empty).
        if not re.fullmatch(r'[a-zA-Z0-9_]+', parameter_value):
            raise ValueError(f"Invalid parameter value for {parameter_key}: {parameter_value}")

        # Deny-list of dangerous SQL keywords and patterns. After the allow-list
        # check only ``xp_`` and the whole-word keywords can still match; the
        # punctuation-based patterns are kept as defense in depth.
        forbidden_patterns = [
            r'--', r';', r"'", r'"', r'/\*', r'\*/', r'xp_',
            r'char\(', r'nchar\(', r'varchar\(',
            r'\balter\b', r'\bdrop\b', r'\binsert\b', r'\bdelete\b',
            r'\bupdate\b', r'\bselect\b', r'\bcreate\b', r'\bexec\b',
            r'\bunion\b', r'\bor\b', r'\band\b',
        ]
        for pattern in forbidden_patterns:
            if re.search(pattern, parameter_value, re.IGNORECASE):
                raise ValueError(f"Potentially dangerous value for {parameter_key}: {parameter_value} (pattern matched: {pattern})")
        return parameter_value


In [0]:
# 1. Resolve the catalog/schema/table names used by this notebook.
# Every name is read from its widget through get_valid_parameter; if any
# lookup raises, the whole set falls back to the hard-coded defaults below.
try:
    (
        BRONZE_CATALOG,
        BRONZE_SCHEMA,
        BRONZE_POSTS_TABLE,
        BRONZE_POST_DETAILS_TABLE,
        BRONZE_POST_PATCH_TABLE,
        SILVER_CATALOG,
        SILVER_SCHEMA,
        SILVER_POSTS_TABLE,
    ) = [
        get_valid_parameter(widget_name)
        for widget_name in (
            "BRONZE_CATALOG",
            "BRONZE_SCHEMA",
            "BRONZE_POSTS_TABLE",
            "BRONZE_POST_DETAILS_TABLE",
            "BRONZE_POST_PATCH_TABLE",
            "SILVER_CATALOG",
            "SILVER_SCHEMA",
            "SILVER_POSTS_TABLE",
        )
    ]
except Exception as err:
    # Bronze-side defaults.
    BRONZE_CATALOG, BRONZE_SCHEMA = "bronze", "linkedin"
    BRONZE_POSTS_TABLE, BRONZE_POST_DETAILS_TABLE = "posts", "post_details"
    BRONZE_POST_PATCH_TABLE = "linkedin_patch"

    # Silver-side defaults.
    SILVER_CATALOG, SILVER_SCHEMA, SILVER_POSTS_TABLE = "silver", "linkedin", "posts"

    print(f"Failed to load widget values ({err}), using default values")
else:
    print("Loaded all widget values")


In [0]:
%python
from pyspark.sql import functions as F, Window

# Build the merge source: one row per post_url, joining each bronze post with
# its optional patch row and its optional post details row. Patch values take
# precedence over the raw post values (COALESCE), and content falls back to
# the post title when neither patch nor post content is present.
#
# The ROW_NUMBER ordering first prefers rows carrying patch values (the
# original ordering keys; DESC places NULLs last in Spark SQL) and then breaks
# remaining ties on every other selected column. Without those tie-breakers the
# surviving row is arbitrary whenever the posts or post details tables hold
# several rows for the same post_url, and a non-deterministic MERGE source can
# lead to incorrect merge results.
merged_post_dedup = spark.sql(f"""
SELECT
  post_url,
  post_publish_date,
  post_publish_timestamp,
  link,
  title,
  content
FROM (
  SELECT
    posts.post_url AS post_url,
    CAST(posts.post_publish_date AS DATE) AS post_publish_date,
    post_details.post_timestamp AS post_publish_timestamp,
    COALESCE(patch.true_url, posts.link) AS link,
    COALESCE(patch.title, posts.title) AS title,
    COALESCE(patch.content, posts.content, posts.title) AS content,
    ROW_NUMBER() OVER (
      PARTITION BY posts.post_url
      ORDER BY
        patch.true_url DESC,
        patch.title DESC,
        patch.content DESC,
        post_details.post_timestamp DESC,
        posts.post_publish_date DESC,
        posts.link DESC,
        posts.title DESC,
        posts.content DESC
    ) AS rn
  FROM
    {BRONZE_CATALOG}.{BRONZE_SCHEMA}.{BRONZE_POSTS_TABLE} posts
  LEFT JOIN
    {BRONZE_CATALOG}.{BRONZE_SCHEMA}.{BRONZE_POST_PATCH_TABLE} patch
  ON
    posts.post_url = patch.post_url
  LEFT JOIN
    {BRONZE_CATALOG}.{BRONZE_SCHEMA}.{BRONZE_POST_DETAILS_TABLE} post_details
  ON
    posts.post_url = post_details.post_url
)
WHERE rn = 1
""")

# Create silver posts table if not exists
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {SILVER_CATALOG}.{SILVER_SCHEMA}.{SILVER_POSTS_TABLE} (
  post_url STRING,
  post_publish_date DATE,
  post_publish_timestamp TIMESTAMP,
  link STRING,
  title STRING,
  content STRING
) USING DELTA
""")

from delta.tables import DeltaTable

silver_posts = DeltaTable.forName(
    spark,
    f"{SILVER_CATALOG}.{SILVER_SCHEMA}.{SILVER_POSTS_TABLE}"
)

# Target column -> source expression, shared by the update and insert clauses
# so the two can never drift apart.
SILVER_POST_COLUMNS = (
    "post_url",
    "post_publish_date",
    "post_publish_timestamp",
    "link",
    "title",
    "content",
)
silver_post_assignments = {column: f"s.{column}" for column in SILVER_POST_COLUMNS}

# Upsert keyed on post_url: overwrite every column of existing rows and insert
# rows whose post_url is not yet present in the silver table.
silver_posts.alias("t").merge(
    merged_post_dedup.alias("s"),
    "t.post_url = s.post_url"
).whenMatchedUpdate(
    set=silver_post_assignments
).whenNotMatchedInsert(
    values=silver_post_assignments
).execute()