In [0]:


dbutils.widgets.text("categories", '["Books", "Electronics"]', "Categories to Process")

categories_param = dbutils.widgets.get("categories")
print(f" Received parameter from ADF: {categories_param}")

import json
categories = json.loads(categories_param)
print(f" Categories to process: {categories}")


from pyspark.sql import functions as F  
from pyspark.sql.types import *         
from datetime import datetime


storage_account_name = "your account name"
storage_account_key = "your storage account key"

# Set Spark configuration to access Azure Data Lake
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net",
    storage_account_key
)

bronze_base_path = f"abfss://bronze@{storage_account_name}.dfs.core.windows.net"
silver_base_path = f"abfss://silver@{storage_account_name}.dfs.core.windows.net"

# Define categories we're processing
#categories = ["Electronics", "Books"]

print(" Configuration complete!")
print(f" Bronze path: {bronze_base_path}")
print(f" Silver path: {silver_base_path}")




sample_path = f"{bronze_base_path}/reviews/*"

# Read JSON with schema inference (Spark figures out the structure)
df_sample = spark.read.json(sample_path, multiLine=False)

print(" Raw Data Schema:")
df_sample.printSchema()

print("\n Sample Records (first 5 rows):")
df_sample.show(5, truncate=False) 

print(f"\n Total Records in : {df_sample.count():,}")

# Check for null values in key fields
print("\n Null Value Counts:")
df_sample.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c) 
    for c in df_sample.columns
]).show()


# 3: Define Explicit Schema (Best Practice)



# Define schema for reviews based on your JSON structure
reviews_schema = StructType([
    StructField("rating", FloatType(), True),           # Star rating (1-5)
    StructField("title", StringType(), True),            # Review title
    StructField("text", StringType(), True),             # Review content
    StructField("images", ArrayType(                     # Array of image objects
        StructType([
            StructField("small_image_url", StringType(), True),
            StructField("medium_image_url", StringType(), True),
            StructField("large_image_url", StringType(), True),
            StructField("attachment_type", StringType(), True)
        ])
    ), True),
    StructField("asin", StringType(), True),             # Product ASIN (not used for joins)
    StructField("parent_asin", StringType(), True),      # Parent ASIN (JOIN KEY!)
    StructField("user_id", StringType(), True),          # User identifier
    StructField("timestamp", LongType(), True),          # Unix timestamp (milliseconds)
    StructField("helpful_vote", IntegerType(), True),    # Number of helpful votes
    StructField("verified_purchase", BooleanType(), True) # Verified purchase flag
])

print(" Schema defined!")


# 4: Read & Clean Reviews Data


# Function to process reviews for a single category
def clean_reviews(category_name):
   
    
    print(f"\n{'='*60}")
    print(f" Processing category: {category_name}")
    print(f"{'='*60}")
    
    # 1. READ: Load raw data from Bronze
    input_path = f"{bronze_base_path}/reviews/{category_name}"
    
    print(f" Reading from: {input_path}")
    
    df_raw = spark.read.schema(reviews_schema).json(input_path, multiLine=False)
    
    raw_count = df_raw.count()
    print(f" Raw records: {raw_count:,}")
    
    # 2. CLEAN: Data quality transformations
    df_cleaned = df_raw \
        .dropDuplicates(["user_id", "parent_asin", "timestamp"]) \
        .filter(F.col("parent_asin").isNotNull()) \
        .filter(F.col("rating").isNotNull()) \
        .filter(F.col("rating").between(1, 5)) \
        .withColumn(
            "review_date",
            F.to_timestamp(F.col("timestamp") / 1000)  # Convert Unix ms to timestamp
        ) \
        .withColumn(
            "review_text_length",
            F.length(F.col("text"))  # Calculate review length
        ) \
        .withColumn(
            "has_images",
            F.when(F.size(F.col("images")) > 0, True).otherwise(False)  # Boolean flag
        ) \
        .withColumn(
            "image_count",
            F.size(F.col("images"))  # Count of images
        ) \
        .withColumn(
            "category",
            F.lit(category_name)  
        ) \
        .withColumn(
            "processing_timestamp",
            F.current_timestamp()  # Audit column: when was this processed?
        )
    
    cleaned_count = df_cleaned.count()
    print(f" Cleaned records: {cleaned_count:,}")
    print(f"  Removed records: {raw_count - cleaned_count:,} ({((raw_count - cleaned_count) / raw_count * 100):.2f}%)")
    
    # 3. SELECT: Final columns for Silver layer (drop raw nested fields for now)
    df_final = df_cleaned.select(
        "parent_asin",            # JOIN KEY
        "user_id",                # User identifier
        "rating",                 # Star rating
        "title",                  # Review title
        "text",                   # Review content
        "review_text_length",     # Derived: length of review
        "timestamp",              # Original Unix timestamp
        "review_date",            # Converted timestamp
        "helpful_vote",           # Helpfulness votes
        "verified_purchase",      # Verified flag
        "has_images",             # Derived: boolean
        "image_count",            # Derived: count
        "category",               # Category name
        "processing_timestamp"    # Audit column
    )
    
    return df_final


df_all_reviews = None

for category in categories:
    df_category = clean_reviews(category)
    
    if df_all_reviews is None:
        df_all_reviews = df_category
    else:
        df_all_reviews = df_all_reviews.union(df_category)  # Combine datasets

print(f"\n{'='*60}")
print(f" TOTAL CLEANED REVIEWS: {df_all_reviews.count():,}")
print(f"{'='*60}")

# Display sample of cleaned data
print("\n Sample Cleaned Data:")
df_all_reviews.show(10, truncate=True)

# Show data distribution by category
print("\n Records by Category:")
df_all_reviews.groupBy("category").count().orderBy(F.desc("count")).show()




# Output path for Silver reviews
output_path = f"{silver_base_path}/reviews_cleaned"

print(f"\n Writing to Silver layer: {output_path}")


df_all_reviews.write \
    .mode("overwrite") \
    .partitionBy("category") \
    .parquet(output_path)

print(" Silver layer write complete!")




df_silver_verify = spark.read.parquet(output_path)

print("\n Verification:")
print(f" Total records in Silver: {df_silver_verify.count():,}")
print(f" Schema:")
df_silver_verify.printSchema()


print("\n Data Quality Metrics:")
df_silver_verify.select(
    F.count("*").alias("total_records"),
    F.avg("rating").alias("avg_rating"),
    F.avg("helpful_vote").alias("avg_helpful_votes"),
    F.sum(F.when(F.col("verified_purchase") == True, 1).otherwise(0)).alias("verified_purchases"),
    F.sum(F.when(F.col("has_images") == True, 1).otherwise(0)).alias("reviews_with_images")
).show()

print("\n Notebook 1 Complete! Reviews are now in Silver layer.")

[0;31m---------------------------------------------------------------------------[0m
[0;31mJSONDecodeError[0m                           Traceback (most recent call last)
File [0;32m<command-4699401764271053>, line 7[0m
[1;32m      4[0m [38;5;28mprint[39m([38;5;124mf[39m[38;5;124m"[39m[38;5;124m Received parameter from ADF: [39m[38;5;132;01m{[39;00mcategories_param[38;5;132;01m}[39;00m[38;5;124m"[39m)
[1;32m      6[0m [38;5;28;01mimport[39;00m [38;5;21;01mjson[39;00m
[0;32m----> 7[0m categories [38;5;241m=[39m json[38;5;241m.[39mloads(categories_param)
[1;32m      8[0m [38;5;28mprint[39m([38;5;124mf[39m[38;5;124m"[39m[38;5;124m Categories to process: [39m[38;5;132;01m{[39;00mcategories[38;5;132;01m}[39;00m[38;5;124m"[39m)
[1;32m     11[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01msql[39;00m [38;5;28;01mimport[39;00m functions [38;5;28;01mas[39;00m F  

File [0;32m/usr/lib/python3.12/