In [0]:
"""
Configure Azure Data Lake Access Key

This cell establishes authentication between the Spark session and the Azure 
Data Lake Storage Gen2 account used in the Goodreads project.

It sets the storage account access key required for read and write operations 
through the ABFSS protocol.

Note:
This configuration is session-specific and must be re-run after each cluster restart.
For production environments, secure credential storage methods such as Azure Key Vault 
or Databricks Secrets should be used instead of hardcoding access keys.
"""

spark.conf.set(
    "fs.azure.account.key.goodreadsreviews60302363.dfs.core.windows.net",
    "8aeNipwlgfgeg1YnUzDh8PeVxg0I5MmnwgWEORAqG5WIJ4Q/XsFa5m714y55ZfAzUw3nNaEFM/e8+AStXU0APQ=="
)

In [0]:
"""
Initialize Spark Session and Load Gold-Layer Dataset

This cell creates a Spark session configured for the Goodreads project and 
loads the finalized Gold-layer dataset from Azure Data Lake Storage in Delta format.

Key Steps:
    - Initialize a SparkSession named 'Goodreads Gold Cleaning' for managing 
      distributed data operations.
    - Define the file path to the Gold layer, which stores the cleaned and 
      enriched Goodreads dataset ('features_v1').
    - Load the dataset using the Delta format to ensure ACID compliance and 
      optimized read performance.
    - Display the schema and a sample of records to verify successful loading.

This step confirms that the Gold-layer data is accessible and ready for 
final validation or additional feature engineering.
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName("Goodreads Gold Cleaning").getOrCreate()

gold_path = "abfss://lakehouse@goodreadsreviews60302363.dfs.core.windows.net/gold/features_v1/"
df = spark.read.format("delta").load(gold_path)

df.printSchema()
df.show(5, truncate=False)


root
 |-- review_id: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- language: string (nullable = true)
 |-- n_votes: integer (nullable = true)
 |-- date_added: date (nullable = true)
 |-- review_length: integer (nullable = true)

+--------------------------------+--------+------------------------------------------------------------------+---------+----------------+--------------------------------+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
"""
Compute Book-Level Review Statistics

This cell aggregates review data to generate summary statistics for each book in the dataset.

Operations:
    - Group the dataset by 'book_id' to compute key metrics:
        • 'avg_rating_per_book': The average rating assigned to each book.
        • 'total_reviews_per_book': The total number of reviews recorded for each book.
    - Join the aggregated results back to the main DataFrame to enrich it with 
      book-level summary features.

The resulting dataset provides essential indicators of reader engagement and 
overall sentiment toward each book.
"""

book_agg = df.groupBy("book_id") \
    .agg(
        avg("rating").alias("avg_rating_per_book"),
        count("*").alias("total_reviews_per_book")
    )
df = df.join(book_agg, on="book_id", how="left")

df.select("book_id", "rating", "avg_rating_per_book", "total_reviews_per_book").show(truncate=False)

+--------+------+-------------------+----------------------+
|book_id |rating|avg_rating_per_book|total_reviews_per_book|
+--------+------+-------------------+----------------------+
|30739547|5     |4.747404844290657  |289                   |
|13539044|5     |4.003359462486002  |1786                  |
|29074768|4     |4.09375            |64                    |
|13790759|5     |4.090909090909091  |11                    |
|30372977|5     |4.7478991596638656 |238                   |
|3491640 |3     |3.4705882352941178 |68                    |
|21940255|4     |4.1                |10                    |
|6416365 |5     |3.79               |100                   |
|7937843 |2     |4.027589889640441  |5618                  |
|486749  |5     |4.5                |24                    |
|34458568|5     |5.0                |1                     |
|2076399 |4     |4.0                |2                     |
|6352093 |4     |4.220588235294118  |136                   |
|17398829|5     |3.56363

In [0]:
"""
Compute Author-Level Average Ratings

This cell aggregates the dataset at the author level to evaluate the overall 
reader sentiment toward each author’s body of work.

Operations:
    - Group the data by the 'name' column, representing each author.
    - Calculate the mean rating across all books and reviews associated with 
      each author, stored as 'avg_rating_per_author'.
    - Display a sample of the results to verify correct aggregation.

The resulting summary provides insight into author performance and 
average audience reception across all reviewed titles.
"""

author_agg = df.groupBy("name") \
    .agg(
        avg(col("rating")).alias("avg_rating_per_author")    
    )

author_agg.show(10,truncate=False)

+--------------------+---------------------+
|name                |avg_rating_per_author|
+--------------------+---------------------+
|Liliana Hart        |4.08354114713217     |
|Brooke Borel        |3.4                  |
|Bruce R. Mcconkie   |4.333333333333333    |
|Eric Klinenberg     |3.5223880597014925   |
|Ann Rinaldi         |3.7342105263157896   |
|Brian Michael Bendis|3.6745880312228967   |
|Sachiko Murakami    |3.5                  |
|Kyle Adams          |3.9960629921259843   |
|Mara Purnhagen      |3.7992700729927007   |
|Olivia Judson       |4.040816326530612    |
+--------------------+---------------------+
only showing top 10 rows


In [0]:
"""
Add Word Count Feature to Reviews

This cell introduces a new derived column, 'word_count', which measures the 
length of each review based on the number of words it contains.

Process:
    - The 'split()' function divides the review text into individual words using a space delimiter.
    - The 'size()' function counts the number of resulting elements (words) in each review.

This feature provides a basic measure of review verbosity and can be used for 
text-based analysis, quality assessment, or correlation studies with rating behavior.
"""

# add word column
from pyspark.sql.functions import size, split

df = df.withColumn("word_count", size(split(col("review_text"), " ")))

In [0]:
"""
Compute Comprehensive Word Count Statistics per Book

This cell performs detailed text-based aggregation to quantify the distribution 
and variability of review lengths (in words) for each book.

Steps:
    1. Group the dataset by 'book_id' to compute descriptive statistics for the 'word_count' column:
        • avg_word_count   – Mean number of words per review.
        • min_word_count   – Shortest review length.
        • max_word_count   – Longest review length.
        • stddev_word_count – Standard deviation of review lengths.
        • q1_word_count, median_word_count, q3_word_count – 25th, 50th, and 75th percentiles.
        • total_reviews – Total number of reviews per book.
    2. Display a preview of the aggregated statistics for validation.
    3. Join the aggregated results back to the main DataFrame ('df') to enrich it 
       with book-level textual metrics.

These statistics provide a comprehensive overview of review verbosity patterns 
and variation across different books, supporting deeper sentiment and engagement analysis.
"""

from pyspark.sql.functions import avg, min, max, stddev, count, expr

# 1️⃣ Group by book_id and compute all word count statistics
word_stats = df.groupBy("book_id").agg(
    avg("word_count").alias("avg_word_count"),
    min("word_count").alias("min_word_count"),
    max("word_count").alias("max_word_count"),
    stddev("word_count").alias("stddev_word_count"),
    expr("percentile_approx(word_count, 0.25)").alias("q1_word_count"),
    expr("percentile_approx(word_count, 0.5)").alias("median_word_count"),
    expr("percentile_approx(word_count, 0.75)").alias("q3_word_count"),
    count("*").alias("total_reviews")
)

# 2️⃣ Show the aggregated statistics
word_stats.show(10, truncate=False)

# 3️⃣ Join back to main dataframe for enrichment
df = df.join(word_stats, on="book_id", how="left")

# 4️⃣ Display a compact preview
df.select(
    "book_id", "word_count",
    "avg_word_count", "min_word_count", "max_word_count",
    "stddev_word_count", "q1_word_count", "median_word_count",
    "q3_word_count", "total_reviews"
).show(10, truncate=False)


+--------+------------------+--------------+--------------+------------------+-------------+-----------------+-------------+-------------+
|book_id |avg_word_count    |min_word_count|max_word_count|stddev_word_count |q1_word_count|median_word_count|q3_word_count|total_reviews|
+--------+------------------+--------------+--------------+------------------+-------------+-----------------+-------------+-------------+
|18051981|299.0             |2             |1018          |423.9351365480337 |4            |139              |332          |5            |
|6489997 |41.42857142857143 |2             |169           |36.834184433174876|17           |38               |50           |21           |
|625602  |73.0              |8             |293           |71.75043553874778 |36           |45               |104          |17           |
|25918853|105.0909090909091 |4             |262           |98.54486749238089 |24           |81               |229          |11           |
|18273367|221.5903614457831

In [0]:
"""
Save Enriched Gold-Layer Dataset in Delta Format

This cell writes the fully enriched DataFrame, which now includes statistical 
features and aggregated metrics, back to the Gold layer of the Azure Data Lake 
in Delta format.

Configuration Details:
    - format("delta"): Ensures optimized storage with ACID transaction support.
    - mode("overwrite"): Replaces any existing dataset at the target location.
    - option("overwriteSchema", "true"): Synchronizes the stored schema with the 
      current DataFrame structure.

Destination Path:
    abfss://lakehouse@goodreadsreviews60302363.dfs.core.windows.net/gold/features_v1/

This final step publishes the enhanced, analysis-ready dataset to the Gold layer 
for downstream analytics, visualization, or machine learning tasks.
"""

df.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save("abfss://lakehouse@goodreadsreviews60302363.dfs.core.windows.net/gold/features_v1/")

In [0]:
"""
Verify Saved Gold-Layer Dataset

This cell reloads the saved Delta dataset from the Gold layer to confirm 
successful storage and schema integrity after the enrichment process.

Verification Steps:
    - Load the 'features_v1' dataset from the Gold layer in Delta format.
    - Print the schema to validate column structure and data types.
    - Display a sample of records for visual inspection.
    - Count the total number of records to ensure completeness.

This verification step ensures that the final dataset has been written correctly 
and is fully accessible for subsequent analytical or modeling tasks.
"""

# Reload the saved dataset from Gold
gold_verified = spark.read.format("delta").load(
    "abfss://lakehouse@goodreadsreviews60302363.dfs.core.windows.net/gold/features_v1/"
)

gold_verified.printSchema()
gold_verified.show(10, truncate=False)
gold_verified.count()

root
 |-- book_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- language: string (nullable = true)
 |-- n_votes: integer (nullable = true)
 |-- date_added: date (nullable = true)
 |-- review_length: integer (nullable = true)
 |-- avg_rating_per_book: double (nullable = true)
 |-- total_reviews_per_book: long (nullable = true)
 |-- word_count: integer (nullable = true)
 |-- avg_word_count: double (nullable = true)
 |-- min_word_count: integer (nullable = true)
 |-- max_word_count: integer (nullable = true)
 |-- stddev_word_count: double (nullable = true)
 |-- q1_word_count: integer (nullable = true)
 |-- median_word_count: integer (nullable = true)
 |-- q3_word_count: integer (nullable = true)
 |-- total_reviews: long (nullable = 

14971370