## Connect to ADLS Gen2 (Storage Account)

In [0]:
# Authenticate Spark against the ADLS Gen2 storage account.
#
# The account key is a credential and must never live in the notebook source:
# it is read at run time from the ADLS_ACCOUNT_KEY environment variable.
# NOTE(review): on Databricks a secret scope (dbutils.secrets.get) is the usual
# alternative - confirm which mechanism this workspace uses.
# NOTE(review): the key that used to be hardcoded in this cell must be treated
# as leaked and rotated on the storage account.
import os

storage_account_key = os.environ.get("ADLS_ACCOUNT_KEY")
if not storage_account_key:
    # Fail early with a clear message instead of a cryptic ABFS auth error later.
    raise RuntimeError(
        "ADLS_ACCOUNT_KEY is not set; provide the storage account key via the "
        "environment (or a secret scope) before running this notebook."
    )

spark.conf.set(
    "fs.azure.account.key.goodreadsreviews60300294.dfs.core.windows.net",
    storage_account_key,
)


## Load Silver Parquet (Books & Authors)

In [0]:
# Silver-layer inputs: books and authors, both stored as Parquet.
books = spark.read.parquet("abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/processed/books/")
authors = spark.read.parquet("abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/processed/authors/")

# Preview five rows of each dataset to confirm the load worked...
for _df in (books, authors):
    _df.show(5)

# ...then print each schema (column names and data types).
for _df in (books, authors):
    _df.printSchema()

## Read reviews (Parquet) and profile

In [0]:
from pyspark.sql.functions import col, length, trim, count, when
# Read raw (uncleaned) reviews from the silver layer
reviews = spark.read.parquet(
    "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/processed/reviews/"
)
# Peek at rows and schema
reviews.show(5, truncate=False)
reviews.printSchema()

# Basic profiling: counts and potential issues.
# All counters are computed in ONE aggregation so the dataset is scanned once,
# instead of once per metric. count(when(cond, 1)) counts only rows where
# `cond` is true, because when() yields NULL otherwise and count() skips NULLs.
review_text_blank = col("review_text").isNull() | (trim(col("review_text")) == "")
profile = reviews.agg(
    count("*").alias("total_rows"),
    count(when(col("review_id").isNull(), 1)).alias("null_review_id"),
    count(when(col("book_id").isNull(), 1)).alias("null_book_id"),
    count(when(col("user_id").isNull(), 1)).alias("null_user_id"),
    count(when(col("rating").isNull(), 1)).alias("null_rating"),
    count(when(review_text_blank, 1)).alias("empty_text"),
).first()  # a global aggregation always returns exactly one row

total_rows = profile["total_rows"]
null_review_id = profile["null_review_id"]
null_book_id = profile["null_book_id"]
null_user_id = profile["null_user_id"]
null_rating = profile["null_rating"]
empty_text = profile["empty_text"]

print(f"Total rows: {total_rows}")
print(f"NULL review_id: {null_review_id}, NULL book_id: {null_book_id}, NULL user_id:{null_user_id}, NULL rating: {null_rating}")
print(f"Empty/NULL review_text: {empty_text}")


## Clean reviews by removing problematic rows

In [0]:
from pyspark.sql.functions import col, length, trim

# Cleaning pipeline over the Parquet-loaded `reviews` DataFrame
# (requires the cell that defines `reviews` to have run first).
df = (
    reviews
    # 1) Keep only rows that carry all three critical keys.
    .where(
        col("review_id").isNotNull()
        & col("book_id").isNotNull()
        & col("user_id").isNotNull()
    )
    # 2) Cast the rating to int and keep values in the closed range 1..5.
    .withColumn("rating_int", col("rating").cast("int"))
    .where(col("rating_int").isNotNull() & col("rating_int").between(1, 5))
    # 3) Trim the review text; drop NULL texts and texts shorter than 10 chars.
    .withColumn("review_text", trim(col("review_text")))
    .where(col("review_text").isNotNull() & (length(col("review_text")) >= 10))
    # 4) One row per review_id (which duplicate survives is arbitrary).
    .dropDuplicates(["review_id"])
)

# 5) Final column layout; the integer rating takes over the `rating` name.
reviews_clean = df.select(
    "review_id",
    "book_id",
    "user_id",
    col("rating_int").alias("rating"),
    "review_text",
    "n_votes",
    "date_added",
)


## Persist Cleaned Reviews

In [0]:
# Persist the cleaned reviews to the silver layer (overwrite).
#
# `reviews_clean` is lazily derived from a read of processed/reviews/, so
# overwriting that same directory directly is unsafe: the overwrite can remove
# the input files while the job still needs to read them (job failure or data
# loss). The cleaned data is therefore materialised in a staging directory
# first, and the final location is overwritten from that staged copy.
reviews_path = "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/processed/reviews/"
reviews_staging_path = "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/processed/reviews_staging/"

# 1) Materialise the cleaned rows outside of the directory being read.
reviews_clean.write.mode("overwrite").parquet(reviews_staging_path)

# 2) Replace the silver reviews with the staged copy (its lineage no longer
#    touches processed/reviews/).
spark.read.parquet(reviews_staging_path).write.mode("overwrite").parquet(reviews_path)

# Sanity check: re-read from disk and inspect schema and a few rows
reviews_verified = spark.read.parquet(reviews_path)
reviews_verified.printSchema()
reviews_verified.show(5, truncate=False)
print(f"Written cleaned rows: {reviews_verified.count()}")



## Homework Part 1

In [0]:
# HomeWork Part 1
# Build the curated (gold) reviews DataFrame by joining the verified reviews
# with their book and the book's author.

from pyspark.sql.functions import col

# Assign aliases to DataFrames for easier column reference
b = books.alias("b")
a = authors.alias("a")
r = reviews_verified.alias("r")

# Join the dataframes.
# Explicit equality conditions are used instead of name-based (USING) joins:
# a name-based join collapses the key into a single column, so a qualified
# reference such as "a.author_id" may not resolve on every Spark version.
# With explicit conditions both sides keep their qualified key columns and
# every reference in the select below is unambiguous. The joined rows are the
# same inner equi-join result.
ba = b.join(a, col("b.author_id") == col("a.author_id"), "inner")
joined = r.join(ba, col("r.book_id") == col("b.book_id"), "inner")

# curated reviews gold: one row per (review, book, author) match
curated_reviews_gold = joined.select(
    col("r.review_id"),
    col("r.book_id"),
    col("b.title"),
    col("a.author_id"),
    col("a.name"),
    col("r.user_id"),
    col("r.rating"),
    col("r.review_text"),
    col("b.language_code").alias("language"),
    col("r.n_votes"),
    col("r.date_added")
)

# Print the schema and display a few rows to verify the result
print("Curated Reviews Gold DataFrame Schema:")
curated_reviews_gold.printSchema()
print("Curated Reviews Gold DataFrame Sample Rows:")
curated_reviews_gold.show(5, truncate=False)

In [0]:
# Print the DataFrame object itself (its string representation), not its rows.
# NOTE(review): presumably a quick look at the column names/types - use
# .show() or .printSchema() when the data or full schema is needed.
print(curated_reviews_gold)

In [0]:
# Switch to a schema this session is allowed to write to.
spark.sql("USE SCHEMA default")

# Store the gold DataFrame as a managed Delta table, replacing any prior version.
(
    curated_reviews_gold.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("default.curated_reviews")
)

# Check 1: how many rows landed in the table.
spark.sql("SELECT COUNT(*) AS rows FROM default.curated_reviews").show()

# Check 2: the ten rows with the highest date_added, untruncated.
spark.sql("""
  SELECT title, name, rating, language
  FROM default.curated_reviews
  ORDER BY date_added DESC
  LIMIT 10
""").show(truncate=False)


In [0]:
# Export a small CSV sample of the curated reviews table.

# 1️⃣ Load the curated_reviews table from the default schema
curated_reviews = spark.table("default.curated_reviews")

# 2️⃣ Take 100 rows (no ordering is applied, so which rows are picked is arbitrary)
sample_df = curated_reviews.limit(100)

# 3️⃣ Define the output path.
# The sample is written to its own directory NEXT TO processed/reviews/, not
# inside it: processed/reviews/ is a Parquet dataset that earlier cells read
# and overwrite, so CSV files nested in it would break the next Parquet read
# and would be deleted by the next overwrite of the cleaned reviews.
sample_output_path = (
    "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/"
    "processed/curated_reviews_sample/"
)

# 4️⃣ Write the small dataset as a single CSV file (overwrite if exists);
# coalesce(1) forces a single output part file.
sample_df.coalesce(1).write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv(sample_output_path)

print(f"✅ Sample CSV saved to: {sample_output_path}")