# 1. Silver Layer: Impressions ETL

This section processes daily bronze impressions tables, standardizes date formats, merges data into the silver impressions table, and logs ingestion events. Source tables are dropped after processing to maintain workspace hygiene.

In [0]:
# Prefer the shared implementation from ../utils. Fall back to the local
# definition below when it cannot be imported: ImportError covers a missing
# module, NameError covers an undefined __file__.
try:
    import os
    import sys

    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../utils')))

    from pipeline_utils import get_valid_parameter
except (ImportError, NameError):
    import re

    def get_valid_parameter(parameter_key: str) -> str:
        """
        Get a valid parameter value from the Databricks widgets.
        Hardened to thwart SQL injection attacks.

        Args:
            parameter_key: Name of the widget to read via ``dbutils.widgets.get``.

        Returns:
            The widget value, unchanged, once it has passed validation.

        Raises:
            ValueError: If the value contains anything other than ASCII
                letters, digits and underscores, or if it matches one of the
                forbidden SQL patterns.
        """
        parameter_value = dbutils.widgets.get(parameter_key)

        # Parameter_value must be a string with only alphanumeric characters and underscores
        if not re.fullmatch(r'[a-zA-Z0-9_]+', parameter_value):
            raise ValueError(f"Invalid parameter value for {parameter_key}: {parameter_value}")

        # Disallow dangerous SQL keywords and patterns.
        # After the whitelist check above, the punctuation-based patterns can
        # no longer match; they are kept as defence in depth. Because "_" is a
        # word character, the \b-delimited keywords only match when the whole
        # value is exactly that keyword (e.g. "drop", not "drop_zone").
        forbidden_patterns = [
            r'--', r';', r"'", r'"', r'/\*', r'\*/', r'xp_', r'char\(', r'nchar\(', r'varchar\(', r'\balter\b', r'\bdrop\b', r'\binsert\b', r'\bdelete\b', r'\bupdate\b', r'\bselect\b', r'\bcreate\b', r'\bexec\b', r'\bunion\b', r'\bor\b', r'\band\b'
        ]
        for pattern in forbidden_patterns:
            if re.search(pattern, parameter_value, re.IGNORECASE):
                raise ValueError(f"Potentially dangerous value for {parameter_key}: {parameter_value} (pattern matched: {pattern})")
        return parameter_value

In [0]:
import pandas as pd
import re
import datetime
from pyspark.sql import Row
from pyspark.sql.functions import sum
from delta.tables import DeltaTable

# 1. define our constants
# Read every table coordinate from the notebook widgets. If any single lookup
# or validation fails, ALL constants fall back to the defaults below so the
# configuration is never a mix of widget values and defaults.
try:
    BRONZE_CATALOG = get_valid_parameter("BRONZE_CATALOG")
    BRONZE_SCHEMA = get_valid_parameter("BRONZE_SCHEMA")
    BRONZE_IMPRESSIONS_TABLE_PREFIX = get_valid_parameter("BRONZE_IMPRESSIONS_TABLE_PREFIX")
    BRONZE_TOTALS_TABLE = get_valid_parameter("BRONZE_TOTALS_TABLE")

    SILVER_CATALOG = get_valid_parameter("SILVER_CATALOG")
    SILVER_SCHEMA = get_valid_parameter("SILVER_SCHEMA")
    SILVER_IMPRESSIONS_TABLE = get_valid_parameter("SILVER_IMPRESSIONS_TABLE")

    print("Loaded all widget values")
except Exception as exc:
    # `except Exception` (not a bare `except`) so KeyboardInterrupt/SystemExit
    # still propagate; the fallback itself stays best-effort.
    BRONZE_CATALOG = "bronze"
    BRONZE_SCHEMA = "linkedin"
    BRONZE_IMPRESSIONS_TABLE_PREFIX = "impressions"
    BRONZE_TOTALS_TABLE = "totals"

    SILVER_CATALOG = "silver"
    SILVER_SCHEMA = "linkedin"
    SILVER_IMPRESSIONS_TABLE = "impressions"

    print("Failed to load widget values, using default values")
    # Surface the cause so a rejected (potentially malicious) widget value is
    # not silently replaced by a default.
    print(f"Reason: {exc!r}")

# 2. set our input and output variables

# Daily bronze staging tables: every table in the bronze schema whose name is
# the configured prefix followed by a _YYYY_MM_DD suffix.
bronze_impressions_tables = [
    candidate_name
    for candidate_name in (
        entry.name
        for entry in spark.catalog.listTables(f"{BRONZE_CATALOG}.{BRONZE_SCHEMA}")
    )
    if re.match(rf"{BRONZE_IMPRESSIONS_TABLE_PREFIX}_\d{{4}}_\d{{2}}_\d{{2}}$", candidate_name)
]

# Fully qualified (catalog.schema.table) name of the silver target table.
silver_impressions_table = ".".join(
    (SILVER_CATALOG, SILVER_SCHEMA, SILVER_IMPRESSIONS_TABLE)
)

# 3. execute the ingestion
# NOTE(review): ingestion_timestamp is not referenced anywhere in the visible
# code; kept because later cells may rely on it.
ingestion_timestamp = datetime.datetime.utcnow()

# Process daily bronze staging impressions tables and merge into silver layer.
# For each daily table:
#   1. standardize the per-post rows (dates parsed, analytics_date attached),
#   2. add an "others" row holding the impressions present in the daily total
#      but not attributed to any listed post,
#   3. upsert into the silver table on (post_url, analytics_date),
#   4. drop the bronze staging table.

print("Processing daily impressions tables...")

for bronze_impression_table in bronze_impressions_tables:
    # Table names are "<prefix>_YYYY_MM_DD" (guaranteed by the regex used to
    # build the list), so slice off only the leading "<prefix>_" rather than
    # replacing every occurrence of it.
    date_str = bronze_impression_table[len(BRONZE_IMPRESSIONS_TABLE_PREFIX) + 1:]
    analytics_date = pd.to_datetime(date_str, format='%Y_%m_%d').date()

    table_name = f"{BRONZE_CATALOG}.{BRONZE_SCHEMA}.{bronze_impression_table}"

    print(f"Processing {table_name}...")

    silver_impressions_df = spark.sql(f"""
    SELECT 
        post_url,
        to_date(post_publish_date, 'M/d/yyyy') AS post_publish_date,
        impressions,
        to_date('{date_str}', 'yyyy_MM_dd') AS analytics_date 
    FROM {table_name}
    """)

    # Lineage columns (source_file, source_file_timestamp, ingestion_timestamp)
    # are selected here but are not propagated to the silver table yet.
    silver_impressions_totals_df = spark.sql(f"""
    SELECT 
        impressions,
        source_file,
        source_file_timestamp,
        ingestion_timestamp
    FROM {BRONZE_CATALOG}.{BRONZE_SCHEMA}.{BRONZE_TOTALS_TABLE}
    WHERE to_date('{date_str}', 'yyyy_MM_dd') = date
    """)

    # Fail with an actionable message (instead of a bare IndexError) when the
    # totals table has no usable row for this day. The bronze table is left in
    # place because the DROP below is never reached.
    totals_rows = silver_impressions_totals_df.select("impressions").collect()
    if not totals_rows or totals_rows[0].impressions is None:
        raise ValueError(
            f"No total impressions found in "
            f"{BRONZE_CATALOG}.{BRONZE_SCHEMA}.{BRONZE_TOTALS_TABLE} for {date_str}; "
            f"cannot compute the 'others' row for {table_name}"
        )
    total_impressions = totals_rows[0].impressions

    # `sum` is pyspark.sql.functions.sum (imported at the top of the cell).
    # SUM over zero rows (or only NULLs) yields NULL, i.e. None in Python.
    per_post_impressions = silver_impressions_df.agg(sum("impressions")).collect()[0][0]
    if per_post_impressions is None:
        per_post_impressions = 0

    impressions_others = total_impressions - per_post_impressions
    if impressions_others > 0:
        # unionByName matches columns by name, so the result does not depend
        # on the field order of the keyword-built Row.
        silver_impressions_df = silver_impressions_df.unionByName(
            spark.createDataFrame([
                Row(
                    post_url="others",
                    post_publish_date=analytics_date,
                    impressions=impressions_others,
                    analytics_date=analytics_date,
                )
            ])
        )

    if spark.catalog.tableExists(silver_impressions_table):
        print(f"Table {silver_impressions_table} exists, merging data...")
        delta_table = DeltaTable.forName(spark, silver_impressions_table)
        # Upsert keyed on (post_url, analytics_date): re-processing a day
        # overwrites its rows instead of duplicating them.
        (
            delta_table.alias("t")
            .merge(
                silver_impressions_df.alias("s"),
                "t.post_url = s.post_url AND t.analytics_date = s.analytics_date"
            )
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
        )
    else:
        print(f"Table {silver_impressions_table} does not exist, creating...")
        silver_impressions_df.write.format("delta").saveAsTable(silver_impressions_table)

    # Drop the source table after processing
    spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    print(f"Dropped table {table_name}")

# 2. Silver Layer: Fill missing in-between dates in the impressions table with 0 values


In [0]:
from pyspark.sql.functions import col, lit, sequence, explode, to_date, min as spark_min, max as spark_max

import pandas as pd
import re
import datetime
from pyspark.sql import Row
from pyspark.sql.functions import sum
from delta.tables import DeltaTable

# 1. define our constants
# Read the silver table coordinates from the notebook widgets. If any lookup
# or validation fails, all three fall back to the defaults below.
try:
    SILVER_CATALOG = get_valid_parameter("SILVER_CATALOG")
    SILVER_SCHEMA = get_valid_parameter("SILVER_SCHEMA")
    SILVER_IMPRESSIONS_TABLE = get_valid_parameter("SILVER_IMPRESSIONS_TABLE")

    print("Loaded all widget values")
except Exception as exc:
    # `except Exception` (not a bare `except`) so KeyboardInterrupt/SystemExit
    # still propagate; the fallback itself stays best-effort.
    SILVER_CATALOG = "silver"
    SILVER_SCHEMA = "linkedin"
    SILVER_IMPRESSIONS_TABLE = "impressions"

    print("Failed to load widget values, using default values")
    # Surface the cause so a rejected widget value is not silently replaced.
    print(f"Reason: {exc!r}")

# Fully qualified (catalog.schema.table) name of the silver impressions table.
silver_impressions_table = \
  f"{SILVER_CATALOG}.{SILVER_SCHEMA}.{SILVER_IMPRESSIONS_TABLE}"


# Prefer the shared implementation from ../utils. Fall back to the local
# definition below when it cannot be imported: ImportError covers a missing
# module, NameError covers an undefined __file__.
try:
    import os
    import sys

    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../utils')))

    from pipeline_utils import fill_missing_dates
except (ImportError, NameError):
    def fill_missing_dates(df, date_col, group_cols, value_col):
        """
        Densify ``df`` so every key combination has a row for every date.

        Builds the grid of (distinct ``group_cols`` combinations) x (every
        date between the minimum and maximum of ``date_col``, inclusive),
        left-joins the original rows onto that grid and replaces missing
        ``value_col`` values with 0.

        Args:
            df: Input Spark DataFrame.
            date_col: Name of the date column that defines the range.
            group_cols: Names of the columns identifying one series
                (any sequence of column names).
            value_col: Name of the column to fill with 0 where no
                original row exists.

        Returns:
            A Spark DataFrame with the join columns (``group_cols`` then
            ``date_col``) followed by the remaining columns of ``df``. Only
            ``value_col`` is filled; other non-key columns stay null on
            generated rows. An empty ``df`` yields an empty result.
        """
        group_cols = list(group_cols)

        # Build the full date sequence inside Spark from the aggregated
        # min/max. Unlike collecting the bounds and rebuilding a DataFrame
        # from Python values, this also works for an empty input (null bounds
        # produce a null sequence, which explode() turns into zero rows).
        date_seq_df = (
            df.agg(
                spark_min(date_col).alias("start"),
                spark_max(date_col).alias("end"),
            )
            .select(explode(sequence(col("start"), col("end"))).alias(date_col))
        )

        # Cross join the distinct key combinations with every date
        unique_keys_df = df.select(*group_cols).distinct()
        full_grid_df = unique_keys_df.crossJoin(date_seq_df)

        # Left join the original rows onto the grid; grid cells without a
        # matching row get value_col = 0
        filled_df = full_grid_df.join(
            df,
            on=group_cols + [date_col],
            how="left"
        ).fillna({value_col: 0})

        return filled_df

# Fill missing dates for silver impressions: one series per
# (post_url, post_publish_date), ranged over analytics_date, with the
# impressions column as the value to fill.
silver_impressions_filled_df = fill_missing_dates(
    spark.table(silver_impressions_table), "analytics_date", ["post_url", "post_publish_date"], "impressions"
)
