In [198]:
# Imports for the Funnel load (read the SFTP landing area and copy to raw)
from pyspark.sql.functions import *
from pyspark.sql.types import DecimalType, StringType, StructField, StructType
from delta.tables import *
import requests
from pyspark.sql import functions as F
from datetime import datetime, timedelta, date

In [200]:
%run /utils/common_functions

In [203]:
# Move files from the SFTP landing container to the raw data lake (Funnel folder).
# When nothing has landed: prod / prod_backup send an e-mail alert and exit,
# dev / test just exit. `env_var`, `env_dict`, `sftp_adls_path` and `raw_adls_path`
# are not defined in this notebook — presumably they come from the
# `%run /utils/common_functions` cell; confirm.
files = mssparkutils.fs.ls(f'abfss://funnel@{sftp_adls_path}/data')
print('Found', len(files), 'files.')

# Check if there are any files to move
if len(files) == 0 and (env_var == env_dict['prod'] or env_var == env_dict['prod_backup']):
    # Warn there might be an issue:
    print("No files found to move. Ending the job and sending notification.")
    # NOTE(review): this URL embeds a Logic App SAS signature (`sig=`), i.e. a
    # credential stored in source. It should be moved to a secret store and rotated.
    response = requests.post(
        'https://prod-85.eastus.logic.azure.com:443/workflows/7367758ef3da4d76b4e64670220d6135/triggers/manual/paths/invoke?api-version=2016-10-01&sp=%2Ftriggers%2Fmanual%2Frun&sv=1.0&sig=3qXzAxaCqincRFQ358oeDwq_SGn5vgzaNgW26QxUDMs',
        data='{"email_to": "ededl@wwgroups.net", "email_subject": "No Funnel Data Available to Process", "email_body": "Did not find any file to process for daily Funnel data.", "email_from": "AzureSynapse@wwwinc.com"}',
        headers={"Content-Type": "application/json"},
        timeout=30,  # without a timeout a stalled endpoint would hang the job indefinitely
    )
    if not response.ok:
        # Surface a failed alert in the run output instead of ignoring the response.
        print(f"Notification request failed with HTTP status {response.status_code}.")

    # Exit the notebook when no files are available
    notebookutils.mssparkutils.notebook.exit(0)

elif len(files) == 0 and (env_var == env_dict['dev'] or env_var == env_dict['test']):
    # For Dev and Test environments
    print("No files found in Dev/Test environment. Skipping file processing.")

    # Exit the notebook when no files are available
    notebookutils.mssparkutils.notebook.exit(0)

else:
    # Proceed if there are files.
    # Resolve the dated target folder once so that every file of this batch lands
    # in the same YYYY/MM/DD folder even if the loop runs across midnight.
    raw_date_folder = datetime.now().strftime("%Y/%m/%d")
    for file in files:
        # Move the file to the raw data lake
        mssparkutils.fs.mv(
            src=file.path,
            dest=f'{raw_adls_path}Funnel/{raw_date_folder}/{file.name}',
            create_path=True,
            overwrite=True,
        )

    print('All files moved successfully.')

In [217]:
# Set up path prefixes.
# To reprocess a previous day's raw folder, swap the commented lines below in for
# the datetime.now() based prefix:
#today = date.today()
#history = today - timedelta(days=1) # YYYY-MM-DD Modify based on current date and data availability in raw folder (history)
#fileprefix = history.strftime("%Y/%m/%d/")
fileprefix = datetime.now().strftime("%Y/%m/%d/")
raw_path = f"{raw_adls_path}Funnel/{fileprefix}"
gold_path = f"{gold_adls_path}Funnel"

# Get max DqLoadDatetime from the gold Delta table; it is the watermark used to
# decide which raw files still need processing.
# The gold write cell supports creating the table on first run, so the table may
# not exist yet here; reading it unconditionally would fail before that path is
# ever reached.
if DeltaTable.isDeltaTable(spark, gold_path):
    gold_df = spark.read.format("delta").load(gold_path)
    # F.max is used explicitly rather than relying on the star import shadowing
    # Python's builtin max.
    max_loaddatetime_row = gold_df.select(F.max("DqLoadDatetime").alias("max_loaddatetime")).collect()
    max_loaddatetime = max_loaddatetime_row[0]["max_loaddatetime"]
else:
    max_loaddatetime = None

if max_loaddatetime is None:
    # Missing or empty gold table: fall back to the earliest representable
    # datetime so the later `modifyTime > max_loaddatetime` comparison treats
    # every raw file as new instead of raising TypeError on None.
    print("No existing watermark found in gold. All files in the raw folder will be processed.")
    max_loaddatetime = datetime.min

# NOTE(review): the file filter compares this value against UTC-based file
# modification times — confirm the Spark session / driver timezone is UTC.
    
# List everything in today's raw Funnel folder
raw_file_paths = mssparkutils.fs.ls(raw_path)

# Keep only plain files whose modification time is later than the watermark.
# modifyTime is divided by 1000 before conversion — presumably it is reported in
# epoch milliseconds; confirm against the mssparkutils documentation.
new_raw_files_to_process = []
for entry in raw_file_paths:
    if entry.isFile and datetime.utcfromtimestamp(entry.modifyTime / 1000) > max_loaddatetime:
        new_raw_files_to_process.append(entry)

# Report the files that will be processed together with their modification time
for entry in new_raw_files_to_process:
    modified_at = datetime.utcfromtimestamp(entry.modifyTime / 1000)
    print(f"{entry.name} - {modified_at}")

# Nothing newer than the watermark: stop the notebook here
if not new_raw_files_to_process:
    print("All Funnel files already processed. Skipping ingestion.")
    notebookutils.mssparkutils.notebook.exit(0)

# Read all new raw Funnel files (gzip-compressed CSV with a header row) into one DataFrame
files_to_load = [raw_file.path for raw_file in new_raw_files_to_process]

funnel_csv_reader = spark.read.format("csv").options(compression="gzip", header=True)
funnel_df = funnel_csv_reader.load(files_to_load)

print(f"Loaded {len(files_to_load)} file(s) into DataFrame.")

# Check if the DataFrame is empty.
# Probe for a single row once instead of calling count() in each branch: count()
# scans every file, and in the normal (non-empty) case the original evaluated it twice.
funnel_is_empty = len(funnel_df.take(1)) == 0

if funnel_is_empty and (env_var == env_dict['prod'] or env_var == env_dict['prod_backup']):
    print("Funnel Data File is empty. Exiting the notebook.")
    # NOTE(review): this URL embeds a Logic App SAS signature (`sig=`), i.e. a
    # credential stored in source. It should be moved to a secret store and rotated.
    response = requests.post(
        'https://prod-85.eastus.logic.azure.com:443/workflows/7367758ef3da4d76b4e64670220d6135/triggers/manual/paths/invoke?api-version=2016-10-01&sp=%2Ftriggers%2Fmanual%2Frun&sv=1.0&sig=3qXzAxaCqincRFQ358oeDwq_SGn5vgzaNgW26QxUDMs',
        data='{"email_to": "ededl@wwgroups.net", "email_subject": "Funnel Data File is Empty", "email_body": "Did not find any records in the file to process for daily Funnel data.", "email_from": "AzureSynapse@wwwinc.com"}',
        headers={"Content-Type": "application/json"},
        timeout=30,  # without a timeout a stalled endpoint would hang the job indefinitely
    )
    if not response.ok:
        # Surface a failed alert in the run output instead of ignoring the response.
        print(f"Notification request failed with HTTP status {response.status_code}.")
    # Exit the notebook
    notebookutils.mssparkutils.notebook.exit(0)

elif funnel_is_empty and (env_var == env_dict['dev'] or env_var == env_dict['test']):
    # For Dev and Test environments
    print("Funnel data file in Dev/Test environment is empty. Skipping file processing.")

    # Exit the notebook when no data are available
    notebookutils.mssparkutils.notebook.exit(0)
else:
    # Continue with the processing if there is data
    print("File contains data. Proceeding with further processing.")


In [213]:
# Rename the raw columns to PascalCase, stamp the load time and drop duplicate rows.
# Each pair is (source column name, PascalCase output name); the order of the list
# is the column order of the resulting DataFrame.
pascal_case_aliases = [
    ("custom_market", "CustomMarket"),
    ("custom_tactic", "CustomTactic"),
    ("campaign", "Campaign"),
    ("FB_Campaign_Type_PRORETConsideration", "FBCampaignTypePRORETConsideration"),
    ("G_Ads_Campaign_Type_searchPmaxYTDisplay", "GAdsCampaignTypeSearchPmaxYTDisplay"),
    ("media_type", "MediaType"),
    ("date", "Date"),
    ("month", "Month"),
    ("month_number", "MonthNumber"),
    ("week", "Week"),
    ("year", "Year"),
    ("data_source_type", "DataSourceType"),
    ("data_source_id", "DataSourceId"),
    ("currency", "Currency"),
    ("clicks", "Clicks"),
    ("cost", "Cost"),
    ("impressions", "Impressions"),
    ("ad_platform_transactions", "AdPlatformTransactions"),
    ("dq_load_datetime", "DqLoadDatetime"),
]

funnel_df = (
    funnel_df
    # Load timestamp column, added before the projection so it can be aliased with the rest
    .withColumn("dq_load_datetime", current_timestamp())
    .select(*[col(source_name).alias(pascal_name) for source_name, pascal_name in pascal_case_aliases])
    .distinct()
)

In [214]:
# Write dataframe to bronze.
# If the bronze Delta table exists, rows for every Date present in the incoming
# data are deleted and the incoming rows appended; otherwise the table is created.

bronze_path = f"{bronze_adls_path}Funnel"

if DeltaTable.isDeltaTable(spark, bronze_path):
    delta_table = DeltaTable.forPath(spark, bronze_path)

    # Collect distinct, non-null Date values of the incoming data
    date_values = [row["Date"] for row in funnel_df.select("Date").distinct().dropna().collect()]

    if date_values:
        # Delete those dates from the existing Delta table. A Column predicate is
        # used instead of pasting the values into a SQL string, so a value that
        # contains a quote cannot break or alter the delete condition.
        delta_table.delete(F.col("Date").isin(date_values))

    # Insert the latest rows
    # NOTE(review): delete and append are two separate commits; if the append
    # fails, the deleted dates stay missing until the next successful run.
    funnel_df.write.format("delta").mode("append").partitionBy("Date").save(bronze_path)

    print("Deleted old data by Date and inserted updated rows.")

else:
    # Initial table creation if it doesn't exist
    funnel_df.write.option("overwriteSchema", "true") \
        .mode("overwrite") \
        .format("delta") \
        .partitionBy("Date") \
        .save(bronze_path)

    print("Created delta table.")

In [215]:
# Do transformations for gold: cast the numeric measures to decimals, derive
# CampaignType from the Google Ads / Facebook campaign-type columns, and project
# the gold column set with a fresh load timestamp.
# Columns are referenced by the PascalCase names produced by the rename cell, so
# the cell does not depend on case-insensitive column resolution.
gads_campaign_type = F.col("GAdsCampaignTypeSearchPmaxYTDisplay")
fb_campaign_type = F.col("FBCampaignTypePRORETConsideration")

# "Present" means neither NULL nor an empty string.
has_gads_campaign_type = gads_campaign_type.isNotNull() & (gads_campaign_type != "")
has_fb_campaign_type = fb_campaign_type.isNotNull() & (fb_campaign_type != "")

campaign_type = (
    # Both present -> "<GAds> - <FB>". concat() is required here: `+` between
    # string columns is numeric addition in Spark and yields NULL for text values.
    F.when(
        has_gads_campaign_type & has_fb_campaign_type,
        F.concat(gads_campaign_type, F.lit(" - "), fb_campaign_type),
    )
    # Only Google Ads present -> Google Ads value.
    .when(has_gads_campaign_type, gads_campaign_type)
    # Otherwise -> Facebook value (which may itself be NULL or empty).
    .otherwise(fb_campaign_type)
)

funnel_gold_df = (
    funnel_df
    .withColumn("Cost", F.col("Cost").cast(DecimalType(38, 6)))
    .withColumn("Impressions", F.col("Impressions").cast(DecimalType(38, 0)))
    .withColumn("Clicks", F.col("Clicks").cast(DecimalType(38, 0)))
    .withColumn("AdPlatformTransactions", F.col("AdPlatformTransactions").cast(DecimalType(38, 3)))
    .select(
        F.col("Date"),
        F.col("DataSourceId"),
        F.col("Year"),
        F.col("Month"),
        F.col("MonthNumber"),
        F.col("Week"),
        F.col("DataSourceType"),
        F.col("CustomMarket"),
        F.col("Campaign"),
        F.col("CustomTactic"),
        # Always-NULL string column, kept as part of the gold column set.
        F.lit(None).cast("string").alias("CampaignTypeConsiderationVsConsideration"),
        F.col("MediaType"),
        campaign_type.alias("CampaignType"),
        F.col("Currency"),
        F.col("Cost"),
        F.col("Impressions"),
        F.col("Clicks"),
        F.col("AdPlatformTransactions"),
        # Gold gets its own load timestamp rather than reusing the bronze one.
        F.current_timestamp().alias("DqLoadDatetime"),
    )
)

In [216]:
# Write dataframe to gold.
# If the gold Delta table exists, rows for every Date present in the incoming
# data are deleted and the incoming rows appended; otherwise the table is created.

gold_path = f"{gold_adls_path}Funnel"

if DeltaTable.isDeltaTable(spark, gold_path):
    delta_table = DeltaTable.forPath(spark, gold_path)

    # Collect distinct, non-null Date values from the frame that is actually
    # written below, so the deleted dates always match the inserted ones.
    date_values = [row["Date"] for row in funnel_gold_df.select("Date").distinct().dropna().collect()]

    if date_values:
        # Delete those dates from the existing Delta table. A Column predicate is
        # used instead of pasting the values into a SQL string, so a value that
        # contains a quote cannot break or alter the delete condition.
        delta_table.delete(F.col("Date").isin(date_values))

    # Insert the latest rows
    # NOTE(review): delete and append are two separate commits; if the append
    # fails, the deleted dates stay missing until the next successful run.
    funnel_gold_df.write.format("delta").mode("append").partitionBy("Date").save(gold_path)

    print("Deleted old data by Date and inserted updated rows.")

else:
    # Initial table creation if it doesn't exist
    funnel_gold_df.write.option("overwriteSchema", "true") \
        .mode("overwrite") \
        .format("delta") \
        .partitionBy("Date") \
        .save(gold_path)

    print("Created delta table.")


In [None]:
# End the notebook run here with exit value 0. The history-load cell that follows
# is therefore not reached by a top-to-bottom run — presumably it is meant to be
# executed by hand for backfills; confirm before removing this cell.
notebookutils.mssparkutils.notebook.exit(0)

In [201]:
# Load history file(s): read every gzip CSV under the raw Funnel root (all dated
# sub-folders) with an explicit all-string schema.
# StructType / StructField / StringType are imported explicitly in the notebook's
# import cell; this cell does not rely on a star import or the %run helper
# happening to provide them.

# Raw column names in file order; every column is read as a nullable string.
history_columns = [
    "custom_market",
    "custom_tactic",
    "campaign",
    "FB_Campaign_Type_PRORETConsideration",
    "G_Ads_Campaign_Type_searchPmaxYTDisplay",
    "media_type",
    "date",
    "month",
    "month_number",
    "week",
    "year",
    "data_source_type",
    "data_source_id",
    "currency",
    "clicks",
    "cost",
    "impressions",
    "ad_platform_transactions",
]

schema = StructType([StructField(column_name, StringType(), True) for column_name in history_columns])

# Overwrites raw_path / funnel_df from the daily flow on purpose: the rename,
# bronze and gold cells operate on whatever funnel_df currently holds.
raw_path = f"{raw_adls_path}Funnel"
funnel_df = (
    spark.read.format("csv")
    .option("compression", "gzip")
    .option("header", True)
    .schema(schema)
    .option("recursiveFileLookup", "true")  # descend into the dated sub-folders
    .load(raw_path)
)

# Rerun steps above for bronze and gold saves to Delta table