# Pull and Process files from BazaarVoice SFTP Site

## Initialize Config

### Imports

In [27]:
# We will need an SFTP package along with many of the usuals
from delta import *
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType, LongType
from datetime import datetime
import json
from azure.storage.blob import BlobServiceClient
from notebookutils import mssparkutils
# SFTP
import paramiko
import gzip
import io
from pyspark.sql.functions import col, when, array, explode, expr
import re
from delta.tables import DeltaTable
from pyspark.sql.utils import AnalysisException

### Include Common Functions

In [28]:
%run /utils/common_functions

## Ingest from SFTP to raw

### Retrieve the SFTP secret from AKV

In [29]:

# Fetch the SFTP connection details (stored as a JSON document) from Azure Key Vault
# through the 'ls_kv_adap' linked service.
connection_info_string = mssparkutils.credentials.getSecret(kv_name, 'bazaar-voice-sftp-connection-info', 'ls_kv_adap')
connection_info = json.loads(connection_info_string)
host = connection_info['host']
user = connection_info['user']
password = connection_info['password']
sftp_port = 22

print(f"SFTP host: {host}")
print(f"SFTP user: {user}")
# Only confirm that a password came back. Never log the secret or anything derived from it
# (such as its length): notebook output may be saved along with the notebook.
print(f"SFTP password retrieved: {bool(password)}")


### Get the Files from the SFTP Server and Save to raw

In [30]:
# Get the last watermark datetime for this process
process_name = 'BazaarVoiceSFTP'
# For now, put the watermark delta table under the raw/BazaarVoice folder.
# raw_adls_path looks like: abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/
delta_table_path = f"{raw_adls_path}BazaarVoice/sftp_watermark"
try:
    watermark_value = get_datetime_watermark_value_for_process(process_name=process_name, delta_table_path=delta_table_path)
    print(f"watermark_value: {watermark_value}")
except Exception as e:
    print("error=",str(e))
    # Chain the original exception and repeat its text in the message, so the root cause is
    # still visible when only the final exception message is surfaced.
    raise Exception(f"Could not retrieve watermark value for {process_name} at {delta_table_path}: {e}") from e

In [31]:
# Just test the watermark functions
# watermark_value = datetime.now()
# set_datetime_watermark_value_for_process(process_name, watermark_value, delta_table_path)
# watermark_value = get_datetime_watermark_value_for_process(process_name, delta_table_path)
# print(f"read watermark_value: {watermark_value}")

# Reset the watermark to 1/1/1900 to get all files in feeds folder that we need.
# process_name = 'BazaarVoiceSFTP'
# watermark_value = datetime(1900, 1, 1)
# set_datetime_watermark_value_for_process(process_name, watermark_value, delta_table_path)

In [32]:
# Connect to the SFTP server and copy every feed file modified since the last watermark into raw
transport = None
sftp = None
remote_folder = 'feeds'

# Build the relative path for the gzipped XML files, using a simple date partition folder (date=yyyyMMdd)
today_str = datetime.today().strftime('%Y%m%d')
todays_blob_folder = f"/BazaarVoice/date={today_str}"
print(f"todays blob_folder: {todays_blob_folder}")

# Capture the max file mod time to use for our new watermark.
# Start from the current watermark (not from a date long ago): if the remote listing is empty,
# or every file is filtered out, the watermark must stay where it is. Resetting it to 1900
# would make the next run download every file again.
max_file_mod_time = watermark_value

# The raw container client does not depend on the file being copied, so it is created once
# (lazily, on the first download) and reused for every upload.
container_client = None

try:
    # Connect to the SFTP server
    transport = paramiko.Transport((host, sftp_port))
    transport.connect(username=user, password=password)
    if not transport.is_active():
        print("Transport connection is inactive. SFTP client will not be created.")
        # The finally block below closes the transport.
        raise Exception("Inactive paramiko transport")
    sftp = paramiko.SFTPClient.from_transport(transport)
    print(f"Connected to {host}...")

    # Get the list of files
    for file_attr in sftp.listdir_attr(remote_folder):
        file_name = file_attr.filename
        remote_file_path = f"{remote_folder}/{file_name}"

        # Filter off unwanted files.  We only want the two *_standard_client_feed_* files.
        if 'tryzen' in file_name:
            # Skip the bv_sweatybetty_tryzendevelopment* files.
            print(f"Skippping file {file_name}")
            continue

        # Get the file's last modified time and see if we need to set the max file mod time for our new watermark
        file_mod_time = datetime.fromtimestamp(file_attr.st_mtime)
        if file_mod_time > max_file_mod_time:
            max_file_mod_time = file_mod_time

        print('-----------------------------------------------------')
        print(F"Comparing remote file {file_name} last modified {file_mod_time} to watermark_value {watermark_value}")
        if file_mod_time > watermark_value:
            print(f"Downloading {remote_file_path}...")
            # Download the gz file to raw. We can decompress the gz file on read when processing raw into bronze.
            with sftp.open(remote_file_path, "rb") as remote_file:
                file_data = remote_file.read()

            local_file_path = f"{todays_blob_folder}/{file_name}"
            print(f"Saving file {file_name} to {local_file_path}...")

            # Use the same pattern for BLOB uploading to raw as the StoreTech notebook
            if container_client is None:
                account_key = mssparkutils.credentials.getSecret(kv_name, 'storage-key', 'ls_kv_adap')
                blob_service_client = BlobServiceClient(account_url=blob_adls_path, credential=account_key)
                container_client = blob_service_client.get_container_client("raw")
            blob_client = container_client.get_blob_client(local_file_path) # (blob_name)
            blob_client.upload_blob(file_data, overwrite=True)

        else:
            # Not newer than the watermark, so it was already downloaded by an earlier run.
            print(f"Skipping {file_name} - file mod time is not after our watermark.")

    # Set the new watermark (only reached when every download above succeeded)
    print(f"SFTP download complete. Setting new watermark: {max_file_mod_time}")
    set_datetime_watermark_value_for_process(process_name, max_file_mod_time, delta_table_path)
except Exception as e:
    print("error=",str(e))
    # Keep the underlying error attached: the failure may come from the download, the blob
    # upload or the watermark write, not only from the connection.
    raise Exception(f"Could not connect to SFTP server and get the files: {e}") from e
finally:
    # Ensure SFTP session is closed
    if sftp:
        sftp.close()
    # Ensure transport is closed
    if transport:
        transport.close()

## Bronze - conform to Delta

### Helper functions to work with the BazaarVoice folders

In [33]:
# Function to tell us if a folder exists
def folder_exists(abfss_path):
    """
    Report whether the given path can be listed.

    :param abfss_path: The path to probe with mssparkutils.fs.ls.
    :return: True when the listing succeeds, False when it raises an error.
    """
    from notebookutils import mssparkutils
    try:
        mssparkutils.fs.ls(abfss_path)
        return True
    except Exception:
        # Any listing error is treated as "does not exist". Catch Exception rather than using
        # a bare except, so KeyboardInterrupt / SystemExit still stop the notebook.
        return False

In [34]:
# Look up a Delta table by path without loading its data
def get_delta_table(delta_table_path):
    """
    Return the DeltaTable found at delta_table_path.

    :param delta_table_path: Location of the Delta table.
    :return: The DeltaTable object, or None when DeltaTable.forPath raises an AnalysisException.
    """
    from delta.tables import DeltaTable
    from pyspark.sql.utils import AnalysisException

    try:
        return DeltaTable.forPath(spark, delta_table_path)
    except AnalysisException:
        # forPath could not resolve a table at this path
        return None

In [35]:
# Function to get the raw BazaarVoice folders to process - after or equal to the input date string (yyyyMMdd)
def get_filtered_date_folders(abfss_path, min_date):
    """
    Lists all items in the given ABFSS path and keeps those whose name is exactly
    "date=yyyyMMdd" with a date greater than or equal to min_date.

    The whole name has to match: entries such as "date=20250301_backup" or
    "date=202503011" are ignored instead of being accepted on their first 8 digits.

    :param abfss_path: The ABFSS root path containing date-based folders.
    :param min_date: The minimum date in "yyyyMMdd" format as a string.
    :return: A sorted list of matching folder names (empty if an AnalysisException is raised).
    """
    from notebookutils import mssparkutils
    import re
    from pyspark.sql.utils import AnalysisException

    # Exactly "date=" followed by 8 digits; group 1 is the yyyyMMdd part
    date_pattern = re.compile(r"date=(\d{8})")

    try:
        valid_folders = []
        for item in mssparkutils.fs.ls(abfss_path):
            # Names are compared without any leading/trailing slash
            folder_name = item.name.strip('/')
            match = date_pattern.fullmatch(folder_name)
            # yyyyMMdd strings sort chronologically, so a plain string comparison is enough
            if match and match.group(1) >= min_date:
                valid_folders.append(folder_name)

        # Return sorted list of valid folders
        return sorted(valid_folders)

    except AnalysisException as e:
        print(f"Error accessing {abfss_path}: {e}")
        return []


### See what we have already loaded into bronze from raw

In [36]:
# Do we have data in bronze already for BazaarVoice?
# Local imports: the notebook star-imports pyspark.sql.functions, which shadows the builtin
# min/max. Use explicit namespaces for both the Spark aggregate and the Python builtin.
import builtins
from pyspark.sql import functions as F

bv_subfolder = 'BazaarVoice'
bv_bronze_folder_path = f"{bronze_adls_path}{bv_subfolder}"   # abfss://bronze@azwwwnonproddevadapadls.dfs.core.windows.net/
print(bv_bronze_folder_path)

# Does this main BazaarVoice folder path exist? e.g. abfss://bronze@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice
if folder_exists(bv_bronze_folder_path):
    print(f"The {bv_subfolder} subfolder exists in {bronze_adls_path}")
    first_time_loading_to_bronze = False
else:
    print(f"Nope - The {bv_subfolder} subfolder does NOT exist in {bronze_adls_path}")
    first_time_loading_to_bronze = True

if first_time_loading_to_bronze:
    # Create the BazaarVoice subfolder in bronze
    mssparkutils.fs.mkdirs(bv_bronze_folder_path)

# We want to use the max date loaded into the bronze delta tables (product_reviews and product_ratings)
bronze_reviews_delta_path = f"{bv_bronze_folder_path}/product_reviews"
bronze_ratings_delta_path = f"{bv_bronze_folder_path}/product_ratings"

bronze_reviews_delta_table = get_delta_table(bronze_reviews_delta_path)
bronze_ratings_delta_table = get_delta_table(bronze_ratings_delta_path)

# Fallback used when a table is missing or empty: a date well before any feed, so that
# ALL raw folders get picked up.
default_ingested_at = datetime.strptime('19000101', "%Y%m%d")


def _get_max_ingested_at(delta_table, fallback):
    """Return the latest ingested_at in delta_table, or fallback when the table is missing or empty."""
    if delta_table is None:
        return fallback
    latest = (
        delta_table
        .toDF()  # Convert DeltaTable to a Spark DataFrame
        .select(F.max(F.col("ingested_at")).alias("max_ingested_at"))
        .collect()[0]["max_ingested_at"]  # Extract the value from Row object
    )
    # The aggregate over an empty table is NULL (None); fall back instead of failing on .strftime
    return fallback if latest is None else latest


# Get the latest load date per table (ingested_at is a datetime, but we can get the date from it)
max_reviews_ingested_at = _get_max_ingested_at(bronze_reviews_delta_table, default_ingested_at)
max_reviews_ingested_at_str = max_reviews_ingested_at.strftime("%Y%m%d")
max_ratings_ingested_at = _get_max_ingested_at(bronze_ratings_delta_table, default_ingested_at)
max_ratings_ingested_at_str = max_ratings_ingested_at.strftime("%Y%m%d")

print(f"max_reviews_ingested_at: {max_reviews_ingested_at}")
print(f"max_reviews_ingested_at_str: {max_reviews_ingested_at_str}")
print(f"max_ratings_ingested_at: {max_ratings_ingested_at}")
print(f"max_ratings_ingested_at_str: {max_ratings_ingested_at_str}")

print(f"type(max_reviews_ingested_at): {type(max_reviews_ingested_at)}")
print(f"type(max_ratings_ingested_at): {type(max_ratings_ingested_at)}")

# Get the minimum of these, so the table that is further behind decides where we restart.
# TODO: I don't love this strategy.
# builtins.min is the Python builtin; a bare min() here would resolve to the Spark column function.
min_ingested_at = builtins.min(max_reviews_ingested_at, max_ratings_ingested_at)
min_ingested_at_str = min_ingested_at.strftime("%Y%m%d")

print(f"min_ingested_at: {min_ingested_at}")
print(f"min_ingested_at_str: {min_ingested_at_str}")

### Now get the list of raw folders we need to process based on what we found in bronze

In [37]:
# Work out which raw date folders still have to be loaded, based on what we found in bronze.
# TODO: Do we need to do this for both ratings and reviews? Or just use the combined minimum? using the minimum for now.
bv_raw_folder_path = f"{raw_adls_path}{bv_subfolder}"
raw_folders_to_process = get_filtered_date_folders(bv_raw_folder_path, min_ingested_at_str)
print(f"raw_folders_to_process: {raw_folders_to_process}")

### And finally, get the list of raw files we need to process into bronze

In [38]:
# Collect the FileInfo of every file found in the raw date folders selected above
raw_files_to_process = []
for date_folder in raw_folders_to_process:
    # Full abfss path of this date folder
    date_folder_path = f"{bv_raw_folder_path}/{date_folder}"
    print('-' * 100)
    print(f"Looking for files in: {date_folder_path}")

    # Keep the whole FileInfo for each file, and log its path
    for file_info in list(mssparkutils.fs.ls(date_folder_path)):
        raw_files_to_process.append(file_info)
        print(file_info.path)

### Functions for xforms

#### product_reviews

In [39]:
# df_final_reviews = transform_reviews_dataframe(
#     df_reviews,
#     feed_type="full",
#     generated_on="2024-04-13",
#     raw_file_modify_time=raw_file_mod_time,
#     raw_file_path=raw_file_path
# )

from pyspark.sql import DataFrame
from pyspark.sql.functions import col, to_json, lit
from pyspark.sql.types import TimestampType

def transform_reviews_dataframe(
    df_reviews: DataFrame,
    feed_type: str,
    generated_on: str,
    raw_file_modify_time,
    raw_file_path: str
) -> DataFrame:
    """
    Project the exploded reviews DataFrame onto the bronze product_reviews columns.

    Parameters:
    - df_reviews: Spark DataFrame with the product attributes (_id, _disabled, _removed)
      and one exploded review per row in the "product_reviews" column
    - feed_type: Indicates full or incremental feed
    - generated_on: Date parsed from the file name (cast to a date column)
    - raw_file_modify_time: Raw file's last modified time (cast to a timestamp, stored as ingested_at)
    - raw_file_path: ABFSS path of the raw input file (stored as ingested_from)

    Returns:
    - A new Spark DataFrame with the renamed review columns followed by the four metadata columns
    """

    # (source column, output column, serialise with to_json?) - in output column order
    column_plan = [
        ("_id", "product_id", False),
        ("product_reviews.UserProfileReference._id", "user_id", False),
        ("_disabled", "disabled", False),
        ("_removed", "removed", False),

        ("product_reviews._id", "review_id", False),
        ("product_reviews.AuthenticationType", "authentication_type", False),
        ("product_reviews.CampaignId", "campaign_id", False),
        ("product_reviews.ContentCodes", "content_codes", False),
        ("product_reviews.ContextDataValues", "context_data_values", True),
        ("product_reviews.DisplayLocale", "display_locale", False),
        ("product_reviews.Featured", "featured", False),
        ("product_reviews.FirstPublishTime", "first_publish_time", False),
        ("product_reviews.Guid", "guid", False),
        ("product_reviews.LastModificationTime", "last_modification_time", False),
        ("product_reviews.LastPublishTime", "last_publish_time", False),
        ("product_reviews.ModerationStatus", "moderation_status", False),
        ("product_reviews.NetPromoterScore", "net_promoter_score", False),
        ("product_reviews.NetPromoterComment", "net_promoter_comment", False),
        ("product_reviews.NumComments", "num_comments", False),
        ("product_reviews.NumFeedbacks", "num_feedbacks", False),
        ("product_reviews.NumNegativeFeedbacks", "num_negative_feedbacks", False),
        ("product_reviews.NumPositiveFeedbacks", "num_positive_feedbacks", False),
        ("product_reviews.OriginatingDisplayCode", "originating_display_code", False),
        ("product_reviews.ProductReviewsDeepLinkedUrl", "product_reviews_deep_linked_url", False),
        ("product_reviews.Rating", "rating", False),
        ("product_reviews.RatingRange", "rating_range", False),
        ("product_reviews.RatingValues.RatingValue", "rating_values", True),
        ("product_reviews.RatingsOnly", "ratings_only", False),
        ("product_reviews.Recommended", "recommended", False),
        ("product_reviews.ReviewText", "review_text", False),
        ("product_reviews.ReviewerLocation", "reviewer_location", False),
        ("product_reviews.ReviewerNickname", "reviewer_nickname", False),
        ("product_reviews.SendEmailAlertWhenCommented", "send_email_alert_when_commented", False),
        ("product_reviews.SendEmailAlertWhenPublished", "send_email_alert_when_published", False),
        ("product_reviews.SubmissionTime", "submission_time", False),
        ("product_reviews.Title", "title", False),
        ("product_reviews.UserProfileReference", "user_profile_reference", True),
        ("product_reviews.Videos", "videos", False),
    ]

    review_columns = []
    for source_name, output_name, as_json in column_plan:
        source_column = col(source_name)
        if as_json:
            source_column = to_json(source_column)
        review_columns.append(source_column.alias(output_name))

    # Literal metadata columns describing the file this batch came from
    metadata_columns = [
        lit(feed_type).alias("feed_type"),
        lit(generated_on).cast("date").alias("generated_on"),
        lit(raw_file_modify_time).cast(TimestampType()).alias("ingested_at"),
        lit(raw_file_path).alias("ingested_from"),
    ]

    return df_reviews.select(*review_columns, *metadata_columns)


#### product_ratings

In [40]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, to_json, lit
from pyspark.sql.types import TimestampType

def transform_ratings_dataframe(
    df_raw_product: DataFrame,
    feed_type: str,
    generated_on: str,
    raw_file_modify_time,
    raw_file_path: str
) -> DataFrame:
    """
    Project the raw Product DataFrame onto the bronze product_ratings columns.

    Parameters:
    - df_raw_product: Spark DataFrame containing product data
    - feed_type: Indicates full or incremental feed
    - generated_on: Date parsed from the file name (cast to a date column)
    - raw_file_modify_time: Raw file's last modified time (cast to a timestamp, stored as ingested_at)
    - raw_file_path: ABFSS path of the raw input file (stored as ingested_from)

    Returns:
    - A new Spark DataFrame with the renamed product columns followed by the four metadata columns
    """

    # Source columns that are serialised with to_json instead of being copied as-is
    json_sources = {
        "Attributes", "Brand", "CategoryItems", "Descriptions", "EANs", "ImageUrls",
        "Names", "NativeReviewStatistics", "ProductPageUrls", "ReviewStatistics",
    }

    # (source column, output column) - in output column order
    rename_plan = [
        ("_disabled", "disabled"),
        ("_id", "product_id"),
        ("_removed", "removed"),
        ("Attributes", "attributes"),
        ("Brand", "brand"),
        ("CategoryItems", "category_items"),
        ("Description", "description"),
        ("Descriptions", "descriptions"),
        ("EANs", "eans"),
        ("ExternalId", "external_id"),
        ("ImageUrl", "image_url"),
        ("ImageUrls", "image_urls"),
        ("Name", "name"),
        ("Names", "names"),
        ("NativeReviewStatistics", "native_review_statistics"),
        ("NumAnswers", "num_answers"),
        ("NumNativeAnswers", "num_native_answers"),
        ("NumNativeQuestions", "num_native_questions"),
        ("NumQuestions", "num_questions"),
        ("NumReviews", "num_reviews"),
        ("NumStories", "num_stories"),
        ("ProductPageUrl", "product_page_url"),
        ("ProductPageUrls", "product_page_urls"),
        ("ReviewStatistics", "review_statistics"),
        ("Source", "source"),
    ]

    product_columns = [
        (to_json(col(source_name)) if source_name in json_sources else col(source_name)).alias(output_name)
        for source_name, output_name in rename_plan
    ]

    # Literal metadata columns describing the file this batch came from
    metadata_columns = [
        lit(feed_type).alias("feed_type"),
        lit(generated_on).cast("date").alias("generated_on"),
        lit(raw_file_modify_time).cast(TimestampType()).alias("ingested_at"),
        lit(raw_file_path).alias("ingested_from"),
    ]

    return df_raw_product.select(*product_columns, *metadata_columns)


### Process the raw files into bronze delta tables

In [41]:
# Process each of the raw files into the two bronze delta tables (product_reviews and product_ratings).
# Only incremental standard client feed files are loaded here; everything else is skipped.
for raw_file_info in raw_files_to_process:
    # Feed is the root XML tag. Then, we have Product, Category, and UserProfile elements
    # Get file path and other elements from the FileInfo
    #print(dir(raw_file_info)) # 'modifyTime', 'name', 'path', 'size'
    raw_file_path = raw_file_info.path
    print(f"Processing {raw_file_path}")
    # modifyTime is divided by 1000 before conversion - presumably epoch milliseconds (TODO confirm).
    # This value ends up in the ingested_at column of both bronze tables.
    raw_file_modify_time = datetime.utcfromtimestamp(raw_file_info.modifyTime / 1000)
    raw_file_name = raw_file_info.name

    # Categorize the file - standard client feed? full or incremental?
    # We really only need to process a single FULL (manually below), then the incremental files only for ongoing processing.
    if 'standard_client_feed' in raw_file_name:
        if 'incremental' in raw_file_name:
            feed_type = 'incremental'
            # Now we can look for the date in the incremental file
            # Parse the "generated on" date (first run of 8 digits) from the file name
            match = re.search(r'(\d{8})', raw_file_name)
            if match:
                date_str = match.group(1)
                generated_on = datetime.strptime(date_str, "%Y%m%d").date()
            else:
                # We should skip this file since it doesn't have a date string in it.
                # This should never happen, but we're leaving it for safety
                print(f"No date string found in file name. Skipping: {raw_file_name}")
                continue
        else:
            # We don't process full files in the regular daily flow
            print(f"Skipping FULL file: {raw_file_name}")
            continue
    else:
        # Anything that is not a standard client feed file is not processed currently
        print(f"NOT PROCESSING: {raw_file_name}")
        continue

    print(f"raw_file_name: {raw_file_name}")
    print(f"feed_type: {feed_type}")
    print(f"raw_file_modify_time: {raw_file_modify_time}")
    print(f"generated_on: {generated_on}")

    # Get the Product elements from the XML file (one row per <Product> tag)
    print(f"Reading XML from file {raw_file_path}...")
    df_raw_product = spark.read.format("xml") \
        .option("rowTag", "Product") \
        .load(raw_file_path)

    # See if our delta tables exist. Looked up again for every file, because the first file
    # processed may have just created them in the else-branches below.
    bronze_reviews_delta_table = get_delta_table(bronze_reviews_delta_path)
    bronze_ratings_delta_table = get_delta_table(bronze_ratings_delta_path)

    # Reviews **********************************************************************
    # Explode the reviews (one row per review) and keep only the Product attributes we need
    df_reviews = df_raw_product.select("_id", "_disabled", "_removed", explode(col("Reviews.Review")).alias("product_reviews"))

    # See what we have so we can dial in the SELECT to match the dbt
    #df_reviews.printSchema()

    # Get the columns we need from the Review.  Do we want to use the dbt aliases or keep the orig where we can? We want to be pascal case in gold anyway.  hmm.
    df_final_reviews = transform_reviews_dataframe(
        df_reviews,
        feed_type=feed_type,
        generated_on=generated_on,
        raw_file_modify_time=raw_file_modify_time,
        raw_file_path=raw_file_path
    )

    # Save/merge to bronze table. Keys used: product_id, review_id and the source file (ingested_from).
    if bronze_reviews_delta_table:
        print(f"Reviews bronze table exists. Merging data from {raw_file_path}")
        # Merge it: rows already loaded from this same file are updated, everything else is inserted
        merge_condition = "target.product_id = source.product_id AND target.review_id = source.review_id and target.ingested_from = source.ingested_from"
        (
            bronze_reviews_delta_table.alias("target")
            .merge(df_final_reviews.alias("source"), merge_condition)
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
        )
    else:
        print(f"Overwriting bronze reviews delta table with: {raw_file_path} dataframe")
        # Table doesn't exist, just write/save the reviews dataframe to the delta location
        (
            df_final_reviews
            .write
            .format("delta")
            .mode("overwrite")
            .save(bronze_reviews_delta_path)
        )

    # Product Ratings **********************************************************************************************
    df_final_ratings = transform_ratings_dataframe(
        df_raw_product,
        feed_type=feed_type,
        generated_on=generated_on,
        raw_file_modify_time=raw_file_modify_time,
        raw_file_path=raw_file_path
    )

    # Save/merge ratings to bronze table.
    # Trying the product_id and the source file for now for PKs.
    if bronze_ratings_delta_table:
        print(f"Ratings bronze table exists. Merging data from {raw_file_path}")
        # Merge it: rows already loaded from this same file are updated, everything else is inserted
        merge_condition = "target.product_id = source.product_id and target.ingested_from = source.ingested_from"
        (
            bronze_ratings_delta_table.alias("target")
            .merge(df_final_ratings.alias("source"), merge_condition)
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
        )
    else:
        print(f"Overwriting bronze ratings delta table with: {raw_file_path} dataframe")
        # Table doesn't exist, just write the ratings dataframe to the delta location
        (
            df_final_ratings
            .write
            .format("delta")
            .mode("overwrite")
            .save(bronze_ratings_delta_path)
        )

print("DONE")

## Silver Processing TODO

## Gold Processing and final Load TODO

## Exit the notebook to keep our miscellaneous code from running automatically

In [None]:
# Stop the notebook run here with exit value "0", so the manual helper cells below never run automatically.
mssparkutils.notebook.exit("0")

# Helper Cells that don't run normally

## Load the initial full file
### This section only runs once, to create the delta tables using the first FULL file. 
### Only incremental files after this load.
### !! Don't forget to also run the two xform function cells above to create the functions: def transform_ratings_dataframe AND def transform_reviews_dataframe

In [None]:
%run /utils/common_functions

In [None]:
def find_first_full_file_path(bv_raw_root_path, full_file_name='bv_sweatybetty_standard_client_feed.xml.gz'):
    """
    Find the first copy of the FULL feed file in the subfolders of the raw BazaarVoice folder.

    Only the direct subfolders of bv_raw_root_path are searched, in the order returned by
    mssparkutils.fs.ls.

    :param bv_raw_root_path: Root path whose subfolders are searched.
    :param full_file_name: Exact file name to look for. Defaults to the full standard client feed.
    :return: The path of the first matching file, or None when no subfolder contains it.
    """
    for item in mssparkutils.fs.ls(bv_raw_root_path):
        # Files sitting directly in the root are ignored; only subfolders are inspected
        if not item.isDir:
            continue
        print(f"Looking in subfolder {item.name}")
        for file in mssparkutils.fs.ls(item.path):
            if file.isFile and file.name == full_file_name:
                # Found it!
                return file.path
    return None  # Not found

In [None]:
# Imports
from notebookutils import mssparkutils

# Find our first full file in the raw zone.
# raw_adls_path already ends with "/" (the main flow joins it without a separator), so strip
# it before adding our own; otherwise the path contains "//".
bv_raw_root_path = f"{raw_adls_path.rstrip('/')}/BazaarVoice"
full_file_name = 'bv_sweatybetty_standard_client_feed.xml.gz'
print(f"Looking in {bv_raw_root_path} for {full_file_name}")

first_full_file_path = find_first_full_file_path(bv_raw_root_path)
if first_full_file_path is None:
    # Fail here with a clear message instead of letting the next cells try to read from None
    raise FileNotFoundError(f"No {full_file_name} found in the subfolders of {bv_raw_root_path}")
print(f"Processing FULL file: {first_full_file_path}")


In [None]:
# Load every Product element of the FULL file into our dataframe
raw_file_path = first_full_file_path
print(f"Reading FULL XML from file {raw_file_path}...")
df_raw_product = (
    spark
    .read
    .format("xml")
    .option("rowTag", "Product")
    .load(raw_file_path)
)

print(f"finished reading FULL XML file {raw_file_path}")

### Don't forget to run the xform function creation cells before running this!

In [None]:

# One-off load of the FULL file: (re)creates both bronze delta tables from it.
# Everything this cell uses from pyspark is imported here (col included), so the cell does not
# depend on the imports of other cells.
from datetime import datetime
from pyspark.sql.functions import col, explode

# Metadata columns: feed_type, generated_on, ingested_at (file modify time) and ingested_from
feed_type = "full"

raw_file_info = mssparkutils.fs.ls(raw_file_path)[0]
raw_file_modify_time = datetime.utcfromtimestamp(raw_file_info.modifyTime / 1000) # using this for ingested_at
# The full file name carries no date, so the file's modify date is used as generated_on
generated_on = raw_file_modify_time.date()
ingested_from = raw_file_path

print(f"feed_type: {feed_type}")
print(f"generated_on: {generated_on}")
print(f"ingested_at: {raw_file_modify_time}")
print(f"ingested_from: {ingested_from}")

# Explode the reviews and filter to only get the Product attributes we need
print(f"Exploding the Reviews...")
df_reviews = df_raw_product.select("_id", "_disabled", "_removed", explode(col("Reviews.Review")).alias("product_reviews"))

# Get the columns we need from the Review.  Do we want to use the dbt aliases or keep the orig where we can? We want to be pascal case in gold anyway.  hmm.
print(f"Finished exploding Reviews. Creating df_final_reviews...")
df_final_reviews = transform_reviews_dataframe(
    df_reviews,
    feed_type=feed_type,
    generated_on=generated_on,
    raw_file_modify_time=raw_file_modify_time,
    raw_file_path=raw_file_path
)

# bronze_adls_path already ends with "/" (the main flow joins it without a separator), so strip
# it before adding our own; this keeps the table path identical to the one the daily flow uses.
bv_bronze_folder_path = f"{bronze_adls_path.rstrip('/')}/BazaarVoice"

bronze_reviews_delta_path = f"{bv_bronze_folder_path}/product_reviews"
print(f"Finished creating the df_final_review. Writing reviews to delta table in bronze {bronze_reviews_delta_path}...")
# Save to bronze table, replacing both the data and the schema of any existing table.
(
    df_final_reviews
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", True)
    .save(bronze_reviews_delta_path)
)

# Product Ratings **********************************************************************************************
print("Finished writing product_reviews delta table in bronze. Building the final ratings dataframe...")
df_final_ratings = transform_ratings_dataframe(
    df_raw_product,
    feed_type=feed_type,
    generated_on=generated_on,
    raw_file_modify_time=raw_file_modify_time,
    raw_file_path=raw_file_path
)

# Save ratings to bronze table.
# Trying the product_id and the source file for now for PKs.
bronze_ratings_delta_path = f"{bv_bronze_folder_path}/product_ratings"
print(f"Finished bulding the final ratings dataframe. Writing bronze ratings delta tablein bronze {bronze_ratings_delta_path}...")
# Write the ratings dataframe to the delta location, replacing data and schema
(
    df_final_ratings
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", True)
    .save(bronze_ratings_delta_path)
)

print("DONE!")

### Check our work

In [None]:
# Show the schema of the two final dataframes (reviews, then ratings), separated by a divider
schema_reports = [
    ("df_final_reviews schema:", df_final_reviews),
    ("df_final_ratings schema:", df_final_ratings),
]
for report_index, (report_title, report_frame) in enumerate(schema_reports):
    if report_index > 0:
        print("-" * 50)
    print(report_title)
    report_frame.printSchema()

In [None]:
# Row count of the exploded reviews, then of the final reviews dataframe
for counted_frame in (df_reviews, df_final_reviews):
    print(counted_frame.count())

## Schema Checks

### Quick check of all existing bronze delta tables

In [None]:
# dev_ratings_delta_path = "abfss://bronze@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/product_ratings"
# dev_reviews_delta_path = "abfss://bronze@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/product_reviews"
# tst_ratings_delta_path = "abfss://bronze@azwwwnonprodtestadapadls.dfs.core.windows.net/BazaarVoice/product_ratings"
# tst_reviews_delta_path = "abfss://bronze@azwwwnonprodtestadapadls.dfs.core.windows.net/BazaarVoice/product_reviews"
# prd_ratings_delta_path = "abfss://bronze@azwwwprodprdadapadls.dfs.core.windows.net/BazaarVoice/product_ratings"
# prd_reviews_delta_path = "abfss://bronze@azwwwprodprdadapadls.dfs.core.windows.net/BazaarVoice/product_reviews"

from delta import DeltaTable

# Bronze delta tables to inspect: one ratings and one reviews table per environment
delta_paths = [
    dict(descriptor="dev_ratings", abfss_path="abfss://bronze@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/product_ratings"),
    dict(descriptor="dev_reviews", abfss_path="abfss://bronze@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/product_reviews"),
    dict(descriptor="tst_ratings", abfss_path="abfss://bronze@azwwwnonprodtestadapadls.dfs.core.windows.net/BazaarVoice/product_ratings"),
    dict(descriptor="tst_reviews", abfss_path="abfss://bronze@azwwwnonprodtestadapadls.dfs.core.windows.net/BazaarVoice/product_reviews"),
    dict(descriptor="prd_ratings", abfss_path="abfss://bronze@azwwwprodprdadapadls.dfs.core.windows.net/BazaarVoice/product_ratings"),
    dict(descriptor="prd_reviews", abfss_path="abfss://bronze@azwwwprodprdadapadls.dfs.core.windows.net/BazaarVoice/product_reviews"),
]

# Print the schema of each table in turn, followed by a divider line
for table_entry in delta_paths:
    table_label = table_entry["descriptor"]
    print(f"Schema for {table_label}:")
    DeltaTable.forPath(spark, table_entry["abfss_path"]).toDF().printSchema()
    print("-" * 60)


### Look at inferred schema again and potential struct/array refinement

In [None]:
# Sample incremental feed file in the dev raw zone.
# Container root: abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/
# Blob URL: https://azwwwnonproddevadapadls.blob.core.windows.net/raw/BazaarVoice/date=20250219/bv_sweatybetty_incremental_standard_client_feed_20250120.xml.gz

from pyspark.sql.functions import explode, col, to_json

raw_abfss_path = 'abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/date=20250219/bv_sweatybetty_incremental_standard_client_feed_20250120.xml.gz'

# Read the feed with the XML reader, using "Product" as the row tag.
df_raw_product = spark.read.format("xml").option("rowTag", "Product").load(raw_abfss_path)

print(f"df_raw_product loaded from: {raw_abfss_path}")

# Keep the product-level attributes and explode Reviews.Review so that every
# review lands on its own row in the "product_reviews" column.
df_reviews = df_raw_product.select(
    "_id",
    "_disabled",
    "_removed",
    explode(col("Reviews.Review")).alias("product_reviews"),
)
print("exploded reviews")

# Show the inferred schema of the exploded reviews.
df_reviews.printSchema()

In [None]:
# Write to a test delta table.
# The save mode is set to "overwrite" so this cell can be re-run: without an
# explicit mode the writer uses error-if-exists and fails once the path already
# holds a table. "overwriteSchema" lets the test table follow schema changes
# made while experimenting with the select.
# NOTE(review): df_final_reviews is not defined in this section; it must already
# exist from an earlier cell.
test_bronze_delta_path = "abfss://bronze@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/test_product_reviews"
(
    df_final_reviews
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(test_bronze_delta_path)
)

## How important are the full files vs. the incremental files?
### At first it seemed like they contain the same data, so do a check. Result: NO. The full file has hundreds of thousands of reviews, while the incrementals total only tens of thousands.

### We might only need one full file, though?

### Load a full file and save it as a table. Then check the date range within it.

In [None]:
# Read a full (non-incremental) feed file.
# Blob URL: https://azwwwnonproddevadapadls.blob.core.windows.net/raw/BazaarVoice/date=20250219/bv_sweatybetty_standard_client_feed.xml.gz
raw_full_file_path = 'abfss://raw@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/date=20250219/bv_sweatybetty_standard_client_feed.xml.gz'

# XML reader with "Product" as the row tag.
df_full_file_raw_product = spark.read.format("xml").option("rowTag", "Product").load(raw_full_file_path)
print(f"Done reading {raw_full_file_path}")
# Timing notes from earlier runs: 00:01:07 to read without the Product rowTag;
# with the rowTag it is much slower - past 10 minutes, 00:14:10 in total.

In [None]:
# Explode Reviews.Review so every review of a product becomes its own row,
# carrying the product-level _id / _disabled / _removed attributes along.
df_full_file_reviews = df_full_file_raw_product.select(
    "_id",
    "_disabled",
    "_removed",
    explode(col("Reviews.Review")).alias("product_reviews"),
)
print("Done.")

In [None]:
# Flatten the exploded full-file reviews into the bronze review column layout
# and attach the ingestion metadata columns.
from pyspark.sql.functions import lit

feed_type = 'full'

raw_file_name = raw_full_file_path
raw_file_path = raw_full_file_path
# Fake these: for this one-off check there is no real file modify time or
# file-name date, so the current UTC time stands in for both.
raw_file_modify_time = datetime.utcnow()
generated_on = datetime.utcnow()

df_full_file_final_reviews = df_full_file_reviews.select(
    # Product-level attributes
    col("_id").alias("product_id"),
    col("product_reviews.UserProfileReference._id").alias("user_id"),
    col("_disabled").alias("disabled"),
    col("_removed").alias("removed"),

    # Review-level attributes
    col("product_reviews._id").alias("review_id"),
    col("product_reviews.AuthenticationType").alias("authentication_type"),
    col("product_reviews.CampaignId").alias("campaign_id"),
    col("product_reviews.ContentCodes").alias("content_codes"),
    to_json(col("product_reviews.ContextDataValues")).alias("context_data_values"),  # newly JSON
    col("product_reviews.DisplayLocale").alias("display_locale"),
    col("product_reviews.Featured").alias("featured"),
    col("product_reviews.FirstPublishTime").alias("first_publish_time"),
    col("product_reviews.Guid").alias("guid"),
    col("product_reviews.LastModificationTime").alias("last_modification_time"),
    col("product_reviews.LastPublishTime").alias("last_publish_time"),
    col("product_reviews.ModerationStatus").alias("moderation_status"),
    col("product_reviews.NetPromoterScore").alias("net_promoter_score"),
    col("product_reviews.NetPromoterComment").alias("net_promoter_comment"),
    col("product_reviews.NumComments").alias("num_comments"),
    col("product_reviews.NumFeedbacks").alias("num_feedbacks"),
    col("product_reviews.NumNegativeFeedbacks").alias("num_negative_feedbacks"),
    col("product_reviews.NumPositiveFeedbacks").alias("num_positive_feedbacks"),
    col("product_reviews.OriginatingDisplayCode").alias("originating_display_code"),
    col("product_reviews.ProductReviewsDeepLinkedUrl").alias("product_reviews_deep_linked_url"),
    col("product_reviews.Rating").alias("rating"),
    col("product_reviews.RatingRange").alias("rating_range"),
    # We need to handle when this is an array rather than just a single struct. *********************************************
    to_json(col("product_reviews.RatingValues.RatingValue")).alias("rating_values"),
    # ***********************************************************************************************************************
    col("product_reviews.RatingsOnly").alias("ratings_only"),
    col("product_reviews.Recommended").alias("recommended"),
    col("product_reviews.ReviewText").alias("review_text"),
    col("product_reviews.ReviewerLocation").alias("reviewer_location"),
    col("product_reviews.ReviewerNickname").alias("reviewer_nickname"),
    col("product_reviews.SendEmailAlertWhenCommented").alias("send_email_alert_when_commented"),
    col("product_reviews.SendEmailAlertWhenPublished").alias("send_email_alert_when_published"),
    col("product_reviews.SubmissionTime").alias("submission_time"),
    col("product_reviews.Title").alias("title"),
    to_json(col("product_reviews.UserProfileReference")).alias("user_profile_reference"),
    col("product_reviews.Videos").alias("videos"),

    # Selecting some metadata columns, too: feed_type, generated_on, ingested_at, ingested_from
    # feed_type - matches the incremental vs non-incremental file names. full is for the non-incremental files.
    lit(feed_type).alias("feed_type"),
    # generated_on - normally parsed from the file name; faked above for this check.
    lit(generated_on).cast("date").alias("generated_on"),
    # ingested_at - the create time (in our raw zone) of the raw file; faked above.
    # Cast by type name (same form as the "date" cast) so the cell does not depend
    # on TimestampType having been imported.
    lit(raw_file_modify_time).cast("timestamp").alias("ingested_at"),
    # ingested_from - the file name
    lit(raw_file_path).alias("ingested_from")  # Might want to just get the file name extracted from the abfss path.
)

print("done")

In [None]:
# Save as a test table.
# The save mode is set to "overwrite" so this cell can be re-run: without an
# explicit mode the writer uses error-if-exists and fails once the path already
# holds a table. "overwriteSchema" lets the test table follow changes to the
# select that builds df_full_file_final_reviews.
(
    df_full_file_final_reviews
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("abfss://bronze@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/full_file_test")
)

In [None]:
# Print the schema of the flattened full-file reviews DataFrame.
df_full_file_final_reviews.printSchema()

In [None]:
# Min and max of the modification and publish timestamps in the full file.
# NOTE(review): min/max are called with column names and aliased, so they must
# resolve to the Spark SQL functions here rather than the Python builtins.
timestamp_range_exprs = [
    min("last_modification_time").alias("min_last_modification_time"),
    max("last_modification_time").alias("max_last_modification_time"),
    min("last_publish_time").alias("min_last_publish_time"),
    max("last_publish_time").alias("max_last_publish_time"),
]
df_full_file_final_reviews.agg(*timestamp_range_exprs).show(truncate=False)

### Some full vs. incremental count checks

In [None]:
# Check whether (product_id, review_id) is a unique key in the full file.
# Merge condition considered elsewhere:
# merge_condition = "target.product_id = source.product_id AND target.review_id = source.review_id and target.ingested_from = source.ingested_from"
# Candidate key: ["product_id", "review_id"] - I think this is a sane key

# Row count before collapsing to unique keys
row_count = df_full_file_reviews.count()
print(f"unfiltered row count: {row_count}") # 235,018

# Project just the key columns...
df_review_keys = df_full_file_reviews.select(
    col("_id").alias("product_id"),
    col("product_reviews._id").alias("review_id"),
)
# ...and group on them: one row per key, plus a "count" column of occurrences.
df_unique_reviews = df_review_keys.groupBy("product_id", "review_id").count()

# row_count now holds the number of distinct keys (it is reused, not a new name).
row_count = df_unique_reviews.count()
print(f"distinct key row count: {row_count}") # Also 235,018 - cool.

In [None]:
# Do we already have these reviews in our dev bronze table?
dev_reviews_delta_path = "abfss://bronze@azwwwnonproddevadapadls.dfs.core.windows.net/BazaarVoice/product_reviews"
df_bronze = spark.read.load(dev_reviews_delta_path, format="delta")

# These timestamps are available in our raw data:
# col("product_reviews.LastModificationTime").alias("last_modification_time"),
# col("product_reviews.LastPublishTime").alias("last_publish_time"),

In [None]:
# Distinct (product_id, review_id) keys present in bronze.
df_bronze_keys = df_bronze.select("product_id", "review_id").distinct()

# left_anti keeps only the full-file keys that have no matching key in bronze.
df_not_matched = df_unique_reviews.join(
    df_bronze_keys,
    ["product_id", "review_id"],
    "left_anti",
)

In [None]:
# Count the full-file keys that are missing from bronze. This takes a while...
not_matched_count = df_not_matched.count()
print(not_matched_count)
# 223,414 - yikes

In [None]:
# Total number of rows currently in the bronze reviews table.
bronze_row_count = df_bronze.count()
print(bronze_row_count)


In [None]:
# Sanity arithmetic using numbers copied by hand from earlier cell outputs.
# 223414 matches the not-matched count noted after df_not_matched.count().
# NOTE(review): 50854 is presumably the df_bronze.count() output - confirm; both
# literals go stale if the underlying tables change.
total_rows = 50854 + 223414
print(total_rows)
# row_count comes from an earlier cell (last assigned there as the distinct key
# count of the full file), so this result depends on that cell having been run.
diff = total_rows - row_count 
print(f"diff: {diff}")