In [0]:
from pyspark.sql import functions as F, types as T

# ADLS Gen2 locations: bronze input table, silver output table, and the
# checkpoint directory used by the silver append stream.
BRONZE_PATH = "abfss://retail-input-data@streaminputsa01.dfs.core.windows.net/delta/bronze_product_reviews"
SILVER_PATH = "abfss://retail-input-data@streaminputsa01.dfs.core.windows.net/delta/silver_product_reviews"
CHK_SILVER  = "abfss://retail-input-data@streaminputsa01.dfs.core.windows.net/_chk/eh_consumer_simple/silver"

# Layout of the JSON review payload handed to from_json.
# The helpfulness counters are declared as strings even though the sample
# payloads shown in this notebook carry them as JSON numbers, and "Time"
# arrives as an integer (presumably epoch seconds -- TODO confirm).
schema = (
    T.StructType()
      .add("Id", T.IntegerType())
      .add("ProductId", T.StringType())
      .add("UserId", T.StringType())
      .add("ProfileName", T.StringType())
      .add("HelpfulnessNumerator", T.StringType())
      .add("HelpfulnessDenominator", T.StringType())
      .add("Score", T.IntegerType())
      .add("Time", T.TimestampType())
      .add("Summary", T.StringType())
      .add("Text", T.StringType())
)

In [0]:
# Authenticate to the storage account with its access key.
# The key is looked up in a Databricks secret scope at run time so that it is
# never written into the notebook source or its saved output.
# TODO: point these two names at the scope/key that hold the account key.
STORAGE_KEY_SCOPE = "retail-secrets"
STORAGE_KEY_NAME = "streaminputsa01-account-key"

spark.conf.set(
    "fs.azure.account.key.streaminputsa01.dfs.core.windows.net",
    dbutils.secrets.get(scope=STORAGE_KEY_SCOPE, key=STORAGE_KEY_NAME),
)


In [0]:
# Incremental (streaming) read of the bronze Delta table.
bronze_stream = (
    spark.readStream
      .format("delta")
      .load(BRONZE_PATH)
)

In [0]:
# Render the bronze stream in the notebook to inspect the incoming records.
display(bronze_stream)

In [0]:
# Parse the bronze payload into typed review columns and keep one row per Id.
#
# The watermark is taken on `enq_ts` (ingestion time), not on the review's own
# `Time`: reviews are historical and arrive in no particular date order, so a
# watermark on `Time` jumps to the newest review date seen and every
# older-dated review that arrives afterwards is discarded as "late".
# `enq_ts` is only carried along for the watermark and is dropped again, so the
# columns written downstream are unchanged.
silver_stream = (
    bronze_stream
      .withColumn("json_str", F.col("payload").cast("string"))
      .withColumn("parsed",   F.from_json("json_str", schema))
      .where(F.col("parsed").isNotNull())
      # Malformed JSON can come back as a struct whose fields are all NULL
      # rather than a NULL struct, so also require the business key.
      .where(F.col("parsed.Id").isNotNull())
      .select("enq_ts", "parsed.*")               # parsed columns + ingestion time
      .withWatermark("enq_ts", "10 minutes")      # late data guard on ingestion time
      # Stateful de-duplication across micro-batches. Because the key does not
      # include the event-time column, the state kept for seen Ids is not
      # evicted by the watermark.
      .dropDuplicates(["Id"])                     # idempotency
      .drop("enq_ts")
)


In [0]:
# Start the streaming query that appends the parsed reviews to the silver
# Delta path; progress is tracked in the CHK_SILVER checkpoint directory.
silver_writer = (
    silver_stream.writeStream
      .format("delta")
      .outputMode("append")
)
q_silver = (
    silver_writer
      .option("checkpointLocation", CHK_SILVER)
      .start(SILVER_PATH)
)

In [0]:
# Show the most recent progress report of the silver streaming query.
q_silver.lastProgress

In [0]:
# Batch view of the silver Delta path, newest review time first.
silver_table = (
    spark.read
      .format("delta")
      .load(SILVER_PATH)
      .orderBy(F.col("Time").desc())
)

In [0]:
# List the distinct HelpfulnessNumerator values that are >= 0.
# The column is declared as a string in the parsing schema, so this
# comparison with an integer relies on Spark's implicit cast.
(
    silver_table
      .where(silver_table["HelpfulnessNumerator"] >= 0)
      .select("HelpfulnessNumerator")
      .distinct()
      .show()
)

In [0]:
from pyspark.sql import functions as F

BRONZE_PATH = "abfss://retail-input-data@streaminputsa01.dfs.core.windows.net/delta/bronze_product_reviews"

# The ten most recently enqueued bronze records, with the payload decoded to
# text so the raw JSON can be read directly.
raw_sample = (
    spark.read
      .format("delta")
      .load(BRONZE_PATH)
      .select(
          "enq_ts",
          F.col("payload").cast("string").alias("json"),
      )
      .orderBy(F.col("enq_ts").desc())
      .limit(10)
)

display(raw_sample)


enq_ts,json
2025-08-10T13:57:41.010+0000,"{""Id"":94401,""ProductId"":""B000EQYW0E"",""UserId"":""AS2OWHLJE8ROD"",""ProfileName"":""Trusted Tech"",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":2,""Time"":1252454400,""Summary"":""Good, as long as they are still fresh"",""Text"":""I bought these chips in May 2009, and they were best eaten by 9\/12\/2009. When I first got them, they were very tasty and exactly what I had expected. I would have rated them 4 or 5 stars then. However starting in early August, there was a definite odd taste that was starting to creep into the chip. And here in early September they are totally inedible. My guess is that the oils used have gone rancid in the chip, since they are using natural oils. Normally this is a very good thing, but the longevity of the product is sacrificed due to it, which is a problem when you are buying chips in 12 bag packages! So if you are purchasing these chips please make sure you are able to eat them quickly, they will not be any good about 1 month before the best eaten by date. I had to throw away about 5 bags....""}"
2025-08-10T13:57:41.010+0000,"{""Id"":94410,""ProductId"":""B000EQYW0E"",""UserId"":""A2DM9OFZL6RR7S"",""ProfileName"":""Dr. Dawn"",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":5,""Time"":1231545600,""Summary"":""Great snack!"",""Text"":""These chips are more expensive in the store, when you can even find them. I have ordered them from Amazon a few times and have received them in good condition every time. Amazon has a great return policy if you ever have a problem with the contents on receipt.""}"
2025-08-10T13:57:41.010+0000,"{""Id"":94402,""ProductId"":""B000EQYW0E"",""UserId"":""APP8XWYYV4PAA"",""ProfileName"":""Debra Chong \""ECommerceMaven\"""",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":5,""Time"":1248480000,""Summary"":""Great healthier chip snack"",""Text"":""Great for a change in tortilla chips. Tasty and not bland like some blue chips. Not too salty. A bit healthier than most similar snacks.""}"
2025-08-10T13:57:41.010+0000,"{""Id"":94403,""ProductId"":""B000EQYW0E"",""UserId"":""A2J4IRVIMRWPSZ"",""ProfileName"":""A. Crank"",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":4,""Time"":1243555200,""Summary"":""chips were stale, but my money was refunded"",""Text"":""I purchased these chips since they are so much cheaper than what they cost on the grocery. they did not expire for about 2 months, but the first 2 or 3 bags we ate were stale, plus many were broken up. I wrote amazon and they did give me a prompt refund, so their customer service is excellent. we have since opened some more bags and they do not seem as stale, but I still do not believe they taste as fresh as the ones from grocery. I may try reordering these since they are such a good deal and hopefully can give a better review the next time.""}"
2025-08-10T13:57:41.010+0000,"{""Id"":94404,""ProductId"":""B000EQYW0E"",""UserId"":""AF7DZ97VNSEWN"",""ProfileName"":""Michael L. Love \""free is a verb\"""",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":5,""Time"":1236643200,""Summary"":""I agree with Richard, an excellent find"",""Text"":""Do without the salt. These chips are great, and you can taste the excellent unique blue corn. This company is very savvy, so that these chips are also very healthful for other reasons. I eat some about every day. They are really irresistable, so I like the healthful aspect. Be careful not to eat too many! Regards, proclus [...]""}"
2025-08-10T13:57:41.010+0000,"{""Id"":94405,""ProductId"":""B000EQYW0E"",""UserId"":""A1YKVTUDRNJZ7"",""ProfileName"":""Lee"",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":5,""Time"":1236124800,""Summary"":""Blue Corn Chips Taste Really Good"",""Text"":""These corn chips are crispy and taste great. And very few broken chips in the bag (I like my chips whole). I just wish they weren't blue. Why would they go to the trouble of dying them blue?""}"
2025-08-10T13:57:41.010+0000,"{""Id"":94406,""ProductId"":""B000EQYW0E"",""UserId"":""AGHF93XN5A411"",""ProfileName"":""Dan Brehmer"",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":5,""Time"":1234137600,""Summary"":""I love the sesame in these chips, you will too!"",""Text"":""I love sesame in general. It makes foods more satisfying. There are two brands of blue corn sesame chips I know of; this one and one by Kettle. I think I might prefer the Kettle brand a little over this one since they use sprouted corn. Somehow that improves the flavor a little. These are still a 5 star product though. Sadly, I just searched for the Kettle chips and found they have been discontinued, so not only are these a great choice for sesame blue goodness they are your only choice! Of course chips like these are pretty high in fat, so it is not a good idea to eat them too often. They are quite addictive as well. For a little addition to a lunchtime meal you will not be disappointed. I hope you like them as much as I do. -Dan""}"
2025-08-10T13:57:41.010+0000,"{""Id"":94407,""ProductId"":""B000EQYW0E"",""UserId"":""A3S8FG6DVBZLPU"",""ProfileName"":""Chip Champ"",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":3,""Time"":1232928000,""Summary"":""NOT a Black Bean chip!"",""Text"":""Although labeled \""Black Bean Chips,\"" unlike the (now unavailable) Trader Joe's version these are CORN chips with some black bean thrown in. That's misleading. They should be labeled Corn & Black Bean chips. They don't taste bad, as far as chips go. But they are definitely nowhere near as good as the old Trader Joe's variety.""}"
2025-08-10T13:57:41.010+0000,"{""Id"":94408,""ProductId"":""B000EQYW0E"",""UserId"":""A28GM4WTGV8ZHE"",""ProfileName"":""willmay \""willmay\"""",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":5,""Time"":1232755200,""Summary"":""Good quality with a quaint, natural taste"",""Text"":""I've been eating these chips with my lunch every day for over two years now. I love the taste first of all. Unlike two other black bean corn chip products I have tried (and more and more chip makers seem to be making black bean corn chips now), these are what they say they are: corn chips with black beans in them - - no lime or herb or any other kind of \""flavor enhancer\"". I like the price of the chips here on Amazon. A 7.5 ounce bag sells for around $3 a bag, sometimes less when on sale. Here, the case price works out to about $2.25 a bag, - and delivery is free.""}"
2025-08-10T13:57:41.010+0000,"{""Id"":94409,""ProductId"":""B000EQYW0E"",""UserId"":""A2OAYIM13WIXCH"",""ProfileName"":""Mary D. Haper \""Gramma to 3 boys\"""",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":5,""Time"":1232496000,""Summary"":""Multi Grain Chips good!"",""Text"":""These chips are very good in taste and with more than the usual grams of fiber, a good choice. They came very fresh and in good shape--no crushed bags.""}"


In [0]:
# Check why HelpfulnessNumerator and HelpfulnessDenominator come through as 0: inspect the raw bronze payloads.

from pyspark.sql import functions as F

BRONZE_PATH = "abfss://retail-input-data@streaminputsa01.dfs.core.windows.net/delta/bronze_product_reviews"

# Raw JSON text of the five most recently enqueued bronze records.
# The sort key `enq_ts` is not part of the projected output.
payload_as_text = F.col("payload").cast("string").alias("json")
raw = (
    spark.read
      .format("delta")
      .load(BRONZE_PATH)
      .select(payload_as_text)
      .orderBy(F.col("enq_ts").desc())
      .limit(5)
)
display(raw)

json
"{""Id"":283889,""ProductId"":""B001CWSK98"",""UserId"":""A7ZRAEBRCC3VH"",""ProfileName"":""R. Blease"",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":4,""Time"":1284336000,""Summary"":""Pretty good! Better than most GF crackers."",""Text"":""Try these if you can't have gluten. Crackers are one of the things I miss most and these are a nice platform for a good cheddar, creamed herring, or smoked salmon. I like the fennel overtones. As anyone who has ever searched for a great (or even good) gluten free cracker it's difficult. With that said, they often are broken and I hate the little foil pouches they come in - you can't reseal them or fold them over. I am looking forward to trying the vegetable Glutinos. I also wish they could be browned a little more - might give them a little more resistance to the teeth. At any rate, give 'em a try.""}"
"{""Id"":283891,""ProductId"":""B001CWSK98"",""UserId"":""A3TKK66EY5Z4SZ"",""ProfileName"":""S. Mitchell \""karmalaw\"""",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":5,""Time"":1267660800,""Summary"":""Great Texture and Zingy Flavor.... Yum!"",""Text"":""Okay, some people don't like much flavor -- and if you're one of the vanilla middle-of-the-road types who'd never touch a piece of rye bread or add a dash of fennel to a dish, these crackers *may* not be for you.. BUT, that being said, they have a GREAT texture (think Ritz like flaky crunch) with an almost rye bread type of taste (yes -- it's from the fennel -- but the flavor is very close to what you might remember having in a piece of rye bread). The flavor is not overwhelming, but it's there as a lovely addition to a great cracker. I LOVE these crackers. Try them with some salmon salad on top for a treat(use canned pink salmon in place of tuna to make your salad)... whatever you do, realize you're going to want to gobble them down so make sure you have enough of them! If these aren't for you -- try the original or cheese version. Just keep them away from your non-GF friends\/family -- they're so good they'll eat them up before you have a chance to do so.""}"
"{""Id"":283888,""ProductId"":""B001CWSK98"",""UserId"":""A2V71AX1O3FISP"",""ProfileName"":""L. Croker \""Happy Hiker\"""",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":5,""Time"":1285027200,""Summary"":""Very tasty"",""Text"":""I have served these crackers to family and friends with cheese, dip, etc. Not one of them could tell it was a special cracker! I would definitely recommend these -- very tasty!""}"
"{""Id"":283887,""ProductId"":""B001CWSK98"",""UserId"":""A1IGRXIFN9UTOI"",""ProfileName"":""glutenfreegal"",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":5,""Time"":1304121600,""Summary"":""Best GF Crackers I've Tried"",""Text"":""I've been on a GF diet for almost a year now. Before being diagnosed with celiac disease, I loved eating crackers with soup, peanut butter, or other toppings. I was incredibly disappointed knowing I could never enjoy tasty crackers again. Well, after searching the market and trying (and wasting $ on) various GF crackers, I was alarmed. All the GF crackers I tried were terrible. They tasted more like a tortilla chip than a cracker. On a whim, I bought the Glutino crackers in Multigrain. Amazing! Not a ton of flavor, but will satiate most any palate. Great with other items. These are one of my new favorite GF items. I'm excited to try them with soup, or in casseroles. Enjoy!""}"
"{""Id"":283890,""ProductId"":""B001CWSK98"",""UserId"":""A2NCQBZ51UCV1S"",""ProfileName"":""Molly's & Kate's Momma"",""HelpfulnessNumerator"":0,""HelpfulnessDenominator"":0,""Score"":4,""Time"":1279670400,""Summary"":""Good Cracker - Strong, but not overwhelming fennel flavor"",""Text"":""Good solid cracker - I'd say it's like a water cracker with a strong fennel flavor. The fennel flavor was unexpected at first, but I've grown to like these. They taste great with some plain brie. I'll definitely buy these again.""}"


In [0]:
%sql

-- Target schema for cleaned (silver) data.
CREATE SCHEMA IF NOT EXISTS retail.clean;

-- Silver reviews table: the parsed review fields plus the columns derived by
-- the streaming job (rating, summary_length, word_count, event_time,
-- event_date). Id and ProductId are the only mandatory columns.
CREATE TABLE IF NOT EXISTS retail.clean.silver_reviews (
  Id INT NOT NULL,
  ProductId STRING NOT NULL,
  UserId STRING,
  ProfileName STRING,
  -- Helpfulness counters are stored as strings, matching the parsing schema
  -- used by the streaming cells.
  HelpfulnessNumerator STRING,
  HelpfulnessDenominator STRING,
  Score INT,
  Time TIMESTAMP,
  Summary STRING,
  Text STRING,
  enq_ts TIMESTAMP,
  rating DOUBLE,
  summary_length INT,
  word_count INT,
  event_time TIMESTAMP,
  event_date DATE      -- for partitioning
)
USING DELTA
PARTITIONED BY (event_date)
-- Change data feed and name-based column mapping are switched on, together
-- with the reader/writer protocol versions set explicitly below.
TBLPROPERTIES (
  delta.enableChangeDataFeed = true,
  delta.columnMapping.mode = 'name',
  delta.minReaderVersion = 2,
  delta.minWriterVersion = 5
);

-- Guardrails
-- Limit data-skipping statistics to the first 3 columns of the table schema
-- (here Id, ProductId, UserId). This statement runs on every execution, also
-- when the table already existed.
ALTER TABLE retail.clean.silver_reviews
  SET TBLPROPERTIES (
    'delta.dataSkippingNumIndexedCols'='3'
  );



In [0]:
# Streaming upsert into the silver table, so that a changed version of an existing record updates the stored row instead of being appended again.

from pyspark.sql import functions as F, types as T

# Storage locations and target table for the upsert pipeline.
BRONZE_PATH = "abfss://retail-input-data@streaminputsa01.dfs.core.windows.net/delta/bronze_product_reviews"
SILVER_PATH = "abfss://retail-input-data@streaminputsa01.dfs.core.windows.net/delta/silver_product_reviews"
CHK_SILVER  = "abfss://retail-input-data@streaminputsa01.dfs.core.windows.net/_chk/eh_consumer_simple/silver"

SILVER_TABLE = "retail.clean.silver_reviews"

# Layout of the JSON review payload handed to from_json.
schema = (
    T.StructType()
      .add("Id", T.IntegerType())
      .add("ProductId", T.StringType())
      .add("UserId", T.StringType())
      .add("ProfileName", T.StringType())
      .add("HelpfulnessNumerator", T.StringType())
      .add("HelpfulnessDenominator", T.StringType())
      .add("Score", T.IntegerType())
      .add("Time", T.TimestampType())
      .add("Summary", T.StringType())
      .add("Text", T.StringType())
)

def clean(col):
    """Return a column expression with HTML markup stripped from ``col``.

    ``<br>`` / ``<br/>`` tags are replaced by a single space, every other
    ``<...>`` tag is removed, and the result is trimmed.
    """
    breaks_as_spaces = F.regexp_replace(col, r"<br\s*/?>", " ")
    tags_removed = F.regexp_replace(breaks_as_spaces, r"<[^>]+>", "")
    return F.trim(tags_removed)

# Incremental read of the bronze Delta table.
bronze_stream = spark.readStream.format("delta").load(BRONZE_PATH)

# Parse, clean and enrich the bronze payload into the silver row layout.
silver_rows = (
    bronze_stream
      .withColumn("json_str", F.col("payload").cast("string"))
      .withColumn("p", F.from_json("json_str", schema))
      .where(F.col("p").isNotNull())
      .select(
          F.col("p.Id").alias("Id"),
          F.col("p.ProductId").alias("ProductId"),
          F.col("p.UserId").alias("UserId"),
          F.col("p.ProfileName").alias("ProfileName"),
          F.col("p.HelpfulnessNumerator").alias("HelpfulnessNumerator"),
          F.col("p.HelpfulnessDenominator").alias("HelpfulnessDenominator"),
          F.col("p.Score").alias("Score"),
          F.col("p.Time").alias("Time"),
          clean(F.col("p.Summary")).alias("Summary"),
          clean(F.col("p.Text")).alias("Text"),
          F.col("enq_ts").alias("enq_ts")
      )
      # rating: the score as a double when it lies in 1..5, otherwise NULL.
      .withColumn("rating",
          F.when((F.col("Score") >= 1) & (F.col("Score") <= 5), F.col("Score").cast("double"))
           .otherwise(F.lit(None).cast("double"))
      )
      .withColumn("summary_length", F.length("Summary"))
      # word_count: whitespace-separated tokens of the cleaned text.
      # Splitting an empty string yields one empty token and size() of NULL is
      # -1 or NULL depending on configuration, so both cases are handled
      # explicitly: NULL text -> NULL, empty text -> 0.
      .withColumn("word_count",
          F.when(F.col("Text").isNull(), F.lit(None).cast("int"))
           .when(F.col("Text") == "", F.lit(0))
           .otherwise(F.size(F.split(F.col("Text"), r"\s+")))
      )
      .withColumn("event_time", F.coalesce(F.col("Time"), F.col("enq_ts")))
      .withColumn("event_date", F.to_date(F.col("event_time")))
      .where(F.col("Id").isNotNull() & F.col("ProductId").isNotNull())
      # Watermark on ingestion time, not on event_time: event_time is the
      # review's own (historical) timestamp and arrives in no particular order,
      # so a watermark on it advances to the newest review date seen and every
      # older-dated review arriving later is dropped as late data.
      # NOTE(review): when replaying a large backlog from the Delta source,
      # check that files are consumed roughly in enq_ts order.
      .withWatermark("enq_ts", "30 minutes")
      # Stateful de-duplication across micro-batches (not just within one):
      # only the first row seen per Id is emitted, and because the key does not
      # include the event-time column this state is not evicted.
      # NOTE(review): a later revision of an already-seen Id therefore never
      # reaches the sink; if updates must flow through, de-duplicate per batch
      # inside the foreachBatch handler instead.
      .dropDuplicates(["Id"])
)

def upsert_to_silver(micro_df, batch_id: int):
    """MERGE one micro-batch into the silver table, keyed on ``Id``.

    Intended as a ``foreachBatch`` handler. Rows whose ``Id`` already exists
    in the target overwrite every column of the stored row; all other rows
    are inserted.

    Args:
        micro_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Micro-batch identifier supplied by Structured Streaming
            (not used here).
    """
    from pyspark.sql import Window  # local import keeps this handler self-contained

    # MERGE fails when several source rows match the same target row, so
    # reduce the batch to the most recently enqueued row per Id first.
    newest_first = Window.partitionBy("Id").orderBy(F.col("enq_ts").desc_nulls_last())
    latest_per_id = (
        micro_df
          .withColumn("_rn", F.row_number().over(newest_first))
          .where(F.col("_rn") == 1)
          .drop("_rn")
    )

    # Temp views are scoped to a Spark session, and the micro-batch may run in
    # a session different from the notebook-level `spark`. Register the view
    # and run the MERGE through the batch's own session so the view resolves.
    session = micro_df.sparkSession
    latest_per_id.createOrReplaceTempView("updates")
    session.sql(f"""
      MERGE INTO {SILVER_TABLE} t
      USING updates u
      ON t.Id = u.Id
      WHEN MATCHED THEN UPDATE SET
        ProductId = u.ProductId,
        UserId = u.UserId,
        ProfileName = u.ProfileName,
        HelpfulnessNumerator = u.HelpfulnessNumerator,
        HelpfulnessDenominator = u.HelpfulnessDenominator,
        Score = u.Score,
        Time = u.Time,
        Summary = u.Summary,
        Text = u.Text,
        enq_ts = u.enq_ts,
        rating = u.rating,
        summary_length = u.summary_length,
        word_count = u.word_count,
        event_time = u.event_time,
        event_date = u.event_date
      WHEN NOT MATCHED THEN INSERT *
    """)

# Every streaming query needs a checkpoint directory of its own. CHK_SILVER is
# already the checkpoint of the append query that writes SILVER_PATH earlier in
# this notebook; sharing it would mix the offsets and de-duplication state of
# two different queries. Use a dedicated sibling directory for the upsert.
CHK_SILVER_UPSERT = f"{CHK_SILVER}_upsert"

# Start the upsert stream: each micro-batch is handed to upsert_to_silver.
q = (silver_rows.writeStream
      .trigger(processingTime="30 seconds")   # tune as needed
      .option("checkpointLocation", CHK_SILVER_UPSERT)
      .foreachBatch(upsert_to_silver)
      .start())


In [0]:
from pyspark.sql import functions as F, types as T

# Bronze input, the append stream's own checkpoint directory, and the target table.
BRONZE_PATH = "abfss://retail-input-data@streaminputsa01.dfs.core.windows.net/delta/bronze_product_reviews"
CHK_SILVER_APPEND = "abfss://retail-input-data@streaminputsa01.dfs.core.windows.net/_chk/eh_consumer_simple/silver_append"
SILVER_TABLE = "retail.clean.silver_reviews"

# Layout of the JSON review payload handed to from_json.
schema = (
    T.StructType()
      .add("Id", T.IntegerType())
      .add("ProductId", T.StringType())
      .add("UserId", T.StringType())
      .add("ProfileName", T.StringType())
      .add("HelpfulnessNumerator", T.StringType())
      .add("HelpfulnessDenominator", T.StringType())
      .add("Score", T.IntegerType())
      .add("Time", T.TimestampType())
      .add("Summary", T.StringType())
      .add("Text", T.StringType())
)

def clean(col):
    """Return a column expression with HTML markup stripped from ``col``.

    ``<br>`` / ``<br/>`` tags are replaced by a single space, every other
    ``<...>`` tag is removed, and the result is trimmed.
    """
    breaks_as_spaces = F.regexp_replace(col, r"<br\s*/?>", " ")
    tags_removed = F.regexp_replace(breaks_as_spaces, r"<[^>]+>", "")
    return F.trim(tags_removed)

# Incremental read of the bronze Delta table.
bronze_stream = spark.readStream.format("delta").load(BRONZE_PATH)

# Parse, clean and enrich the bronze payload, then add score-based sentiment.
silver_rows = (
    bronze_stream
      .withColumn("json_str", F.col("payload").cast("string"))
      .withColumn("p", F.from_json("json_str", schema))
      .where(F.col("p").isNotNull())
      .select(
          F.col("p.Id").alias("Id"),
          F.col("p.ProductId").alias("ProductId"),
          F.col("p.UserId").alias("UserId"),
          F.col("p.ProfileName").alias("ProfileName"),
          F.col("p.HelpfulnessNumerator").alias("HelpfulnessNumerator"),
          F.col("p.HelpfulnessDenominator").alias("HelpfulnessDenominator"),
          F.col("p.Score").alias("Score"),
          F.col("p.Time").alias("Time"),
          clean(F.col("p.Summary")).alias("Summary"),
          clean(F.col("p.Text")).alias("Text"),
          F.col("enq_ts").alias("enq_ts")
      )
      # rating: the score as a double when it lies in 1..5, otherwise NULL.
      .withColumn("rating",
          F.when((F.col("Score") >= 1) & (F.col("Score") <= 5), F.col("Score").cast("double"))
           .otherwise(F.lit(None).cast("double"))
      )
      .withColumn("summary_length", F.length("Summary"))
      # word_count: whitespace-separated tokens of the cleaned text.
      # Splitting an empty string yields one empty token and size() of NULL is
      # -1 or NULL depending on configuration, so both cases are handled
      # explicitly: NULL text -> NULL, empty text -> 0.
      .withColumn("word_count",
          F.when(F.col("Text").isNull(), F.lit(None).cast("int"))
           .when(F.col("Text") == "", F.lit(0))
           .otherwise(F.size(F.split(F.col("Text"), r"\s+")))
      )
      .withColumn("event_time", F.coalesce(F.col("Time"), F.col("enq_ts")))
      .withColumn("event_date", F.to_date(F.col("event_time")))
      .where(F.col("Id").isNotNull() & F.col("ProductId").isNotNull())
      # Watermark on ingestion time, not on event_time: event_time is the
      # review's own (historical) timestamp and arrives in no particular order,
      # so a watermark on it advances to the newest review date seen and every
      # older-dated review arriving later is dropped as late data.
      # NOTE(review): when replaying a large backlog from the Delta source,
      # check that files are consumed roughly in enq_ts order.
      .withWatermark("enq_ts", "30 minutes")
      # Sentiment derived from the star score alone: >= 4 positive,
      # <= 2 negative, 3 or missing neutral.
      .withColumn("sentiment_label",
          F.when(F.col("Score") >= 4, "positive")
           .when(F.col("Score") == 3, "neutral")
           .when(F.col("Score") <= 2, "negative")
           .otherwise("neutral")
      )
      .withColumn("sentiment_score",
          F.when(F.col("Score") >= 4, F.lit(0.9))
           .when(F.col("Score") == 3, F.lit(0.0))
           .when(F.col("Score") <= 2, F.lit(-0.9))
           .otherwise(F.lit(0.0))
      )
      # Stateful de-duplication across micro-batches: only the first row seen
      # per Id is emitted. Because the key does not include the event-time
      # column, this is NOT limited to the watermark window and the state kept
      # for seen Ids is not evicted.
      .dropDuplicates(["Id"])
)

# Append the enriched rows to the Unity Catalog table.
# The stream carries sentiment_label / sentiment_score, which the table DDL in
# this notebook does not define; without schema evolution Delta rejects the
# write with a schema-mismatch error. mergeSchema lets the append add the
# missing columns (it has no effect once they exist).
q = (
  silver_rows.writeStream
    .trigger(processingTime="30 seconds")
    .option("checkpointLocation", CHK_SILVER_APPEND)
    .option("mergeSchema", "true")
    .toTable(SILVER_TABLE)   # <-- simple append to UC table
)
