In [0]:
spark.conf.set(
    
)

base = "abfss://lakehouse@goodreadsreviews60105179.dfs.core.windows.net/gold"
input_path  = f"{base}/features_v1"     # input from Lab 3
output_base = f"{base}/features_v2"     # output for Lab 4


In [0]:
# Load cleaned Gold dataset from Lab 3 (features_v1)
df = spark.read.format("delta").load(input_path)

# Quick preview to confirm it's loaded
display(df.limit(5))
df.printSchema()





book_id,review_id,title,author_id,author_name,user_id,rating,review_text,language_code,n_votes,date_added,review_length,review_length_in_words,average_rating,number_of_reviews
7663760,fa241e939d2218940d17cff6fc30bca4,fooling some of the people all of the time: a long short (and now complete) story,1397426,david einhorn,0c5b36407771dfd65acdd812ecd51705,4.0,"investing is an obsession and not a job.i think few exemplify that more than david einhorn, one of the smartest fundamental investors in the current era. in a world, where short-sellers are castigated as profiteers and trouble makers, mr einhorn demonstrates through his most public short that fundamental short-selling is the only tool to expose fraud and corruption in financial markets.his appeal to regulators to close loopholes that allow companies to feed off tax-payer dollars is genuine but it's difficult to be an optimist when incentives are so perverse despite all the outrage in the aftermath of the financial crisis.all i can say is that mr einhorn and his ilk, continue to be the de-facto regulators of a broken financial system and if they profit from that, well isn't it just right when you see the breadth of the work they put into doing so.",eng,2071,2015-04-07,860,145,3.125,8
22387890,2628b9004c4b6710aa0c20ce094da2c1,deep water,5341919,coral moore,b78a9143ca2f0c4c7361694dc6cb0500,3.0,i liked this story despite not being entirely sold on the attraction between mario and jordan which result pretty much in insta-love. but it was still a nice short and sweet story.,eng,30,2014-06-02,180,32,3.5,2
1096390,ec6f00823d47459dc2c70fea9559c605,the uncommon reader,11781,alan bennett,c3fa377cfc84401747630b98f92758c9,4.0,cute story with lots of british humor on the perils of reading too much. like that could ever happen!,eng,17355,2010-01-05,101,19,3.9038031319910513,447
13166894,e995dd91ce98a48306e5be8d15f97c8d,death at seaworld: shamu and the dark side of killer whales in captivity,6435477,david kirby,ddc44923909c38b4d149a38431105943,4.0,"this was a good book with a very important message that everybody should at least be aware of. while i agree with a lot of the position this book took,i do have a few notes: - it clearly had an agenda, which is okay, but even though i share the same position on whales in captivity, it is quite biased and only gives a small glimpse into the other side of the debate. - it get's extremely repetitive. there were points where i had to check to make sure i hadn't lost my place because it felt like i had just read the exact same thing in previous chapters. - personally, while i was completely interested (especially being that orcas are my favourite animal), there were points, particularly when discussing the legal battle issues where i started to glaze over a little bit and speed read to get through it. all in all though, this was a very thought-provoking read that i overall very much enjoyed.",eng,15,2013-07-10,907,166,4.0,1
25430624,d6120da6fdef79aaa9e11c623e4b9559,abc dream,8108153,kim krans,419dfd723edeb5e27f50aa2382f9aa86,5.0,"concept: 5 stars art: 5 stars alphabet book with marvelous illustrations that include various items for each letter. in each illustration, readers discern all the words they can that begin with the featured letter, then check their answers against the list in the back of the book. fun, interactive way to learn the alphabet.",,122,2017-01-02,329,54,4.545454545454546,11


root
 |-- book_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- author_name: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- review_text: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- n_votes: integer (nullable = true)
 |-- date_added: date (nullable = true)
 |-- review_length: integer (nullable = true)
 |-- review_length_in_words: integer (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- number_of_reviews: long (nullable = true)



# Splitting the dataset 

In [0]:
# Split the data into train (70%), validation (15%), and test (15%)
train_df, val_df, test_df = df.randomSplit([0.7, 0.15, 0.15], seed=60105179)

# Save these splits in the Gold layer under features_v2
train_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(f"{output_base}/train_raw")
val_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(f"{output_base}/val_raw")
test_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(f"{output_base}/test_raw")

# Print record counts to confirm
print(f"Train rows: {train_df.count()}")
print(f"Validation rows: {val_df.count()}")
print(f"Test rows: {test_df.count()}")


Train rows: 6759016
Validation rows: 1446141
Test rows: 1447906


# Text Cleaning and Normalization

In [0]:
%pip install emoji


Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# --- Import required libraries ---
import re, string, emoji
from pyspark.sql.functions import udf, col, length
from pyspark.sql.types import StringType

# --- Step 1: Define the text cleaning function ---
def clean_text(t):
    if t is None:
        return ""
    t = t.lower()  # convert to lowercase
    t = re.sub(r'(https?://\S+|www\.\S+)', ' <URL> ', t)  # replace URLs
    t = emoji.replace_emoji(t, replace=' <EMOJI> ')        # replace emojis
    t = re.sub(r'\b\d+(\.\d+)?\b', ' <NUM> ', t)           # replace numbers
    t = t.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    t = re.sub(r'\s+', ' ', t).strip()                     # remove extra spaces
    return t

# --- Step 2: Register the UDF ---
clean_udf = udf(clean_text, StringType())

# --- Step 3: Apply cleaning directly to 'review_text' column ---
train_df = train_df.withColumn("review_text", clean_udf(col("review_text")))

# --- Step 4: Filter out empty or very short reviews (<10 characters) ---
train_df = train_df.filter(length(col("review_text")) >= 10)

# --- Step 5: Preview cleaned text ---
display(train_df.select("review_text").limit(5))


review_text
spoiler alert girl meets lion lion meets burglar burglar meets nasty end girl cleans up evidence the end
a little gruesome the robber gets munch out of existence if it doesnt got over your head but my son loved this book the drawings are fun and the rhyming story is cute
an important word to the church you can be an obstacle or you can be a nurturer of gifts and callings of young people called to ministry my full review is found here URL
jocelyn dreams of max almost every night but last night was different its christmas morning and last nights dreams weigh heavy on her mind as she waits for her visiting old college room mate to wake to talk to her first a ghost here crazy grandmothers ghost warning and wanting to help her then three visitors after that and not all were welcomed guests past present and future men collide in one night oh this novella is a very interesting and sexy vampire twist on a christmas carol oh the hints in this novella there is something special about jocelyn what im curious max is remembered and im curious how he manages to do this the current boyfriend that cute guy named chad from the dance in of course i tryyeah its him yum jocelyn has been trying to be stronger since max shes started selfdefense classes and fighting better along with becoming more physically fit than she ever has i have to say i love the description of the ghost yes its someone she knew shes the antithesis of the fairy godmother id always wished for all she needs now is a lit cigar between her fingers to totally bastardize that particular childhood fantasy the telling of this story starts with jocelyn waiting for kait then when she comes out we get the front row seat of the happenings as we are there with her then we are back to the morning and kait leaves and jocelyn gets ready for christmas at her mothers and waits for her boyfriend we seen clues that last night might not have been a dream but then things change and answers arent connecting with what happened so maybe it all was after being out with the girls drinking and all im left curious as to what was real and what was a dream i think its all real but there is more than real and not here theres secrets i want to know what they are there is something special about jocelyn but im not sure what exactly it is ill definitely be reading the first novel kiss of death very soon to figure it all out this is definitely a romance with intimate moments
NUM out of NUM stars why and how to forgive and reap the benefits october NUM NUM by becca chopra author of the chakra diaries big island of hawaii see all my reviews vine voice this review is from forgive to win end selfsabotage get everything you want paperback buddha said holding on to anger is like grasping a hot coal with the intent of throwing it at someone else you are the one who gets burned counseling a client with this wise saying she retorted forgiveness is not a concept in my religion well we are all holding on to anger of one sort or another against one parent or another an exlover exhusband exemployer etc and forgive to win finally explains fully and completely why its necessary and in our own best interests to let it go whatever your beliefs there are many wonderful books on forgiveness already on my bookshelf but i welcomed this one because it lays out a structured daily program its not just a philosophical treatise the steps suggested help train ourselves to let go of selfsabotaging behavior and learn to love ourselves by loving and forgiving others no matter what theyve done once our selfesteem is raised to the heights possible through selflove theres nothing you cant achieve forgiveness may seem like an easy task but its not and dr jacobson guides the reader in how to forgive his forgiveness diet is a unique set of recommendations to help us establish and maintain the NUM day commitment to acts of kindness and forgiveness that could literally change ones life he also includes forgiveness affirmations and visualizations that i have already used to great benefit looking at it from the vantage point of health anger throws us offbalance and creates tension that can lead to chronic pain and disease forgiveness opens the floodgates of our bodys own healing energy and keeps us grounded alert empowered and able to recognize opportunities for success the last chapter of the book is entitled getting everything you want if you dont already have it its time to get this book namaste becca chopra author of the chakra diaries URL


# Extract text-based features


### Basic Text Features

In [0]:
# --- Basic Text Features ---

from pyspark.sql.functions import size, split, length

# Add columns for number of words and characters
train_df = train_df.withColumn("review_length_words", size(split(col("review_text"), " ")))
train_df = train_df.withColumn("review_length_chars", length(col("review_text")))

# Preview results
display(train_df.select("review_text", "review_length_words", "review_length_chars").limit(5))


review_text,review_length_words,review_length_chars
spoiler alert girl meets lion lion meets burglar burglar meets nasty end girl cleans up evidence the end,18,104
a little gruesome the robber gets munch out of existence if it doesnt got over your head but my son loved this book the drawings are fun and the rhyming story is cute,33,166
an important word to the church you can be an obstacle or you can be a nurturer of gifts and callings of young people called to ministry my full review is found here URL,34,169
jocelyn dreams of max almost every night but last night was different its christmas morning and last nights dreams weigh heavy on her mind as she waits for her visiting old college room mate to wake to talk to her first a ghost here crazy grandmothers ghost warning and wanting to help her then three visitors after that and not all were welcomed guests past present and future men collide in one night oh this novella is a very interesting and sexy vampire twist on a christmas carol oh the hints in this novella there is something special about jocelyn what im curious max is remembered and im curious how he manages to do this the current boyfriend that cute guy named chad from the dance in of course i tryyeah its him yum jocelyn has been trying to be stronger since max shes started selfdefense classes and fighting better along with becoming more physically fit than she ever has i have to say i love the description of the ghost yes its someone she knew shes the antithesis of the fairy godmother id always wished for all she needs now is a lit cigar between her fingers to totally bastardize that particular childhood fantasy the telling of this story starts with jocelyn waiting for kait then when she comes out we get the front row seat of the happenings as we are there with her then we are back to the morning and kait leaves and jocelyn gets ready for christmas at her mothers and waits for her boyfriend we seen clues that last night might not have been a dream but then things change and answers arent connecting with what happened so maybe it all was after being out with the girls drinking and all im left curious as to what was real and what was a dream i think its all real but there is more than real and not here theres secrets i want to know what they are there is something special about jocelyn but im not sure what exactly it is ill definitely be reading the first novel kiss of death very soon to figure it all out this is definitely a romance with intimate moments,372,1992
NUM out of NUM stars why and how to forgive and reap the benefits october NUM NUM by becca chopra author of the chakra diaries big island of hawaii see all my reviews vine voice this review is from forgive to win end selfsabotage get everything you want paperback buddha said holding on to anger is like grasping a hot coal with the intent of throwing it at someone else you are the one who gets burned counseling a client with this wise saying she retorted forgiveness is not a concept in my religion well we are all holding on to anger of one sort or another against one parent or another an exlover exhusband exemployer etc and forgive to win finally explains fully and completely why its necessary and in our own best interests to let it go whatever your beliefs there are many wonderful books on forgiveness already on my bookshelf but i welcomed this one because it lays out a structured daily program its not just a philosophical treatise the steps suggested help train ourselves to let go of selfsabotaging behavior and learn to love ourselves by loving and forgiving others no matter what theyve done once our selfesteem is raised to the heights possible through selflove theres nothing you cant achieve forgiveness may seem like an easy task but its not and dr jacobson guides the reader in how to forgive his forgiveness diet is a unique set of recommendations to help us establish and maintain the NUM day commitment to acts of kindness and forgiveness that could literally change ones life he also includes forgiveness affirmations and visualizations that i have already used to great benefit looking at it from the vantage point of health anger throws us offbalance and creates tension that can lead to chronic pain and disease forgiveness opens the floodgates of our bodys own healing energy and keeps us grounded alert empowered and able to recognize opportunities for success the last chapter of the book is entitled getting everything you want if you dont already have it its time to get this book namaste becca chopra author of the chakra diaries URL,361,2069


### Sentiment Features

In [0]:
%pip install nltk


Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.5 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.7 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━

In [0]:
# --- Sentiment Features ---

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Download VADER lexicon 
nltk.download('vader_lexicon')

# Create the analyzer on the driver
sia = SentimentIntensityAnalyzer()

# Define helper UDFs for each sentiment score
@udf(DoubleType())
def get_sent_pos(text):
    return float(sia.polarity_scores(text)['pos']) if text else None

@udf(DoubleType())
def get_sent_neg(text):
    return float(sia.polarity_scores(text)['neg']) if text else None

@udf(DoubleType())
def get_sent_neu(text):
    return float(sia.polarity_scores(text)['neu']) if text else None

@udf(DoubleType())
def get_sent_compound(text):
    return float(sia.polarity_scores(text)['compound']) if text else None

# Apply UDFs to add sentiment columns
train_df = (
    train_df
    .withColumn("sentiment_pos", get_sent_pos(col("review_text")))
    .withColumn("sentiment_neg", get_sent_neg(col("review_text")))
    .withColumn("sentiment_neu", get_sent_neu(col("review_text")))
    .withColumn("sentiment_compound", get_sent_compound(col("review_text")))
)

# Preview a few rows
display(train_df.select("review_text", "sentiment_pos", "sentiment_neg", "sentiment_neu", "sentiment_compound").limit(5))


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/spark-6a3e73ef-c784-4316-b3b1-4b/nltk_data...


review_text,sentiment_pos,sentiment_neg,sentiment_neu,sentiment_compound
spoiler alert girl meets lion lion meets burglar burglar meets nasty end girl cleans up evidence the end,0.101,0.165,0.734,-0.34
a little gruesome the robber gets munch out of existence if it doesnt got over your head but my son loved this book the drawings are fun and the rhyming story is cute,0.314,0.049,0.637,0.9278
an important word to the church you can be an obstacle or you can be a nurturer of gifts and callings of young people called to ministry my full review is found here URL,0.126,0.067,0.806,0.296
jocelyn dreams of max almost every night but last night was different its christmas morning and last nights dreams weigh heavy on her mind as she waits for her visiting old college room mate to wake to talk to her first a ghost here crazy grandmothers ghost warning and wanting to help her then three visitors after that and not all were welcomed guests past present and future men collide in one night oh this novella is a very interesting and sexy vampire twist on a christmas carol oh the hints in this novella there is something special about jocelyn what im curious max is remembered and im curious how he manages to do this the current boyfriend that cute guy named chad from the dance in of course i tryyeah its him yum jocelyn has been trying to be stronger since max shes started selfdefense classes and fighting better along with becoming more physically fit than she ever has i have to say i love the description of the ghost yes its someone she knew shes the antithesis of the fairy godmother id always wished for all she needs now is a lit cigar between her fingers to totally bastardize that particular childhood fantasy the telling of this story starts with jocelyn waiting for kait then when she comes out we get the front row seat of the happenings as we are there with her then we are back to the morning and kait leaves and jocelyn gets ready for christmas at her mothers and waits for her boyfriend we seen clues that last night might not have been a dream but then things change and answers arent connecting with what happened so maybe it all was after being out with the girls drinking and all im left curious as to what was real and what was a dream i think its all real but there is more than real and not here theres secrets i want to know what they are there is something special about jocelyn but im not sure what exactly it is ill definitely be reading the first novel kiss of death very soon to figure it all out this is definitely a romance with intimate moments,0.174,0.104,0.722,0.9877
NUM out of NUM stars why and how to forgive and reap the benefits october NUM NUM by becca chopra author of the chakra diaries big island of hawaii see all my reviews vine voice this review is from forgive to win end selfsabotage get everything you want paperback buddha said holding on to anger is like grasping a hot coal with the intent of throwing it at someone else you are the one who gets burned counseling a client with this wise saying she retorted forgiveness is not a concept in my religion well we are all holding on to anger of one sort or another against one parent or another an exlover exhusband exemployer etc and forgive to win finally explains fully and completely why its necessary and in our own best interests to let it go whatever your beliefs there are many wonderful books on forgiveness already on my bookshelf but i welcomed this one because it lays out a structured daily program its not just a philosophical treatise the steps suggested help train ourselves to let go of selfsabotaging behavior and learn to love ourselves by loving and forgiving others no matter what theyve done once our selfesteem is raised to the heights possible through selflove theres nothing you cant achieve forgiveness may seem like an easy task but its not and dr jacobson guides the reader in how to forgive his forgiveness diet is a unique set of recommendations to help us establish and maintain the NUM day commitment to acts of kindness and forgiveness that could literally change ones life he also includes forgiveness affirmations and visualizations that i have already used to great benefit looking at it from the vantage point of health anger throws us offbalance and creates tension that can lead to chronic pain and disease forgiveness opens the floodgates of our bodys own healing energy and keeps us grounded alert empowered and able to recognize opportunities for success the last chapter of the book is entitled getting everything you want if you dont already have it its time to get this book namaste becca chopra author of the chakra diaries URL,0.26,0.045,0.695,0.9979


### TF-IDF Features

In [0]:
%pip install scikit-learn

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# --- TF-IDF / CountVectorizer Feature Extraction using scikit-learn ---
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pandas as pd

# Step 1: Convert Spark DataFrame to Pandas 
train_pdf = train_df.select("review_text").sample(fraction=0.1, seed=60105179).toPandas()

print("Sample size:", len(train_pdf))

# Step 2: Initialize CountVectorizer
count_vect = CountVectorizer(
    max_features=5000,        # top N terms
    stop_words='english',     # remove filler words
    ngram_range=(1, 2)        # unigrams and bigrams
)

# Step 3: Fit and transform
X_counts = count_vect.fit_transform(train_pdf["review_text"])

# Step 4: Transform counts into TF-IDF
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# Step 5: Convert to DataFrame
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=count_vect.get_feature_names_out()
)

print("TF-IDF matrix shape:", tfidf_df.shape)
tfidf_df.head(5)


Sample size: 671251
TF-IDF matrix shape: (671251, 5000)


Unnamed: 0,19th,1st,20th,2nd,3rd,aaron,abandoned,abby,aber,abilities,ability,able,abrupt,absolute,absolutely,absolutely love,absolutely loved,abuse,abused,abusive,academic,academy,accept,acceptance,accepted,accepting,access,accessible,accident,accidentally,accomplished,according,account,accounts,accurate,accused,achieve,act,acted,acting,...,year,year old,yearold,years,years ago,years later,years old,yes,yg,ykwn,yn,yo,york,york city,youd,youll,young,young adult,young girl,young man,young readers,young woman,younger,youre,youre going,youre looking,youth,youve,youve read,za,zach,ze,zero,zijn,zoe,zombie,zombies,zone,zu,zum
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.118481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125701,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### b. Sentiment Features

In [0]:
# Step 1 – Install required libraries
%pip install -q sentence-transformers


[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# --- Transformer-based Embeddings (Sentence-BERT) ---


# Step 2 – Import the model
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# Step 3 – Convert a safe sample of the training data to Pandas
train_sample = train_df.sample(fraction=0.1, seed=60105179).toPandas()

print("Sample size for embeddings:", len(train_sample))

# Step 4 – Load a pre-trained Sentence-BERT model
# (DistilBERT version is small and fast)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 5 – Generate embeddings for each review
embeddings = model.encode(train_sample["review_text"].tolist(), show_progress_bar=True)

# Step 6 – Store embeddings as a list of vectors
train_sample["bert_embedding"] = embeddings.tolist()

# Step 7 – Preview results
train_sample[["review_text", "bert_embedding"]].head(2)


Sample size for embeddings: 671251


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/20977 [00:00<?, ?it/s]

Unnamed: 0,review_text,bert_embedding
0,entertaining story but a bit of a mess when it...,"[-0.01206240151077509, 0.007737279869616032, -..."
1,emperors edge is the second book from lindsay ...,"[-0.04844294488430023, -0.10820366442203522, 0..."


### Additional features

To include additional features, i engineered simple metrics that capture writing style, tone, and behavior:

Average Word Length – measures writing complexity.

Unique Word Ratio – indicates vocabulary diversity.

Exclamation Count – reflects emotional intensity.

Contains URL – flags reviews mentioning links.

Sentiment Label – derived from compound sentiment score to classify overall polarity.

In [0]:
from pyspark.sql.functions import col, length, split, lower, regexp_replace, size, udf
from pyspark.sql.types import StringType, DoubleType

# --- 1 Average Word Length ---
train_df = train_df.withColumn(
    "avg_word_length",
    (length(col("review_text")) / (size(split(col("review_text"), " ")) + 1))
)

# --- 2️ Unique Word Ratio ---
train_df = train_df.withColumn(
    "unique_word_ratio",
    (size(split(lower(regexp_replace(col("review_text"), "[^a-zA-Z\\s]", "")), " ")) /
     (size(split(col("review_text"), " ")) + 1))
)

# --- 3️ Exclamation Count ---
train_df = train_df.withColumn(
    "exclamation_count",
    length(col("review_text")) - length(regexp_replace(col("review_text"), "!", ""))
)

# --- 4️ Contains URL Flag ---
train_df = train_df.withColumn(
    "contains_url",
    (col("review_text").rlike("http|www")).cast("int")
)

# --- 5️ Sentiment Label (from compound score) ---
def label_sentiment(c):
    if c is None:
        return None
    elif c >= 0.05:
        return "positive"
    elif c <= -0.05:
        return "negative"
    else:
        return "neutral"

label_udf = udf(label_sentiment, StringType())
train_df = train_df.withColumn("sentiment_label", label_udf(col("sentiment_compound")))

# --- Preview results ---
display(train_df.select(
    "review_text", 
    "avg_word_length", 
    "unique_word_ratio", 
    "exclamation_count", 
    "contains_url",
    "sentiment_label"
).limit(5))


review_text,avg_word_length,unique_word_ratio,exclamation_count,contains_url,sentiment_label
spoiler alert girl meets lion lion meets burglar burglar meets nasty end girl cleans up evidence the end,5.473684210526316,0.9473684210526316,0,0,negative
a little gruesome the robber gets munch out of existence if it doesnt got over your head but my son loved this book the drawings are fun and the rhyming story is cute,4.882352941176471,0.9705882352941176,0,0,positive
an important word to the church you can be an obstacle or you can be a nurturer of gifts and callings of young people called to ministry my full review is found here URL,4.828571428571428,0.9714285714285714,0,0,positive
jocelyn dreams of max almost every night but last night was different its christmas morning and last nights dreams weigh heavy on her mind as she waits for her visiting old college room mate to wake to talk to her first a ghost here crazy grandmothers ghost warning and wanting to help her then three visitors after that and not all were welcomed guests past present and future men collide in one night oh this novella is a very interesting and sexy vampire twist on a christmas carol oh the hints in this novella there is something special about jocelyn what im curious max is remembered and im curious how he manages to do this the current boyfriend that cute guy named chad from the dance in of course i tryyeah its him yum jocelyn has been trying to be stronger since max shes started selfdefense classes and fighting better along with becoming more physically fit than she ever has i have to say i love the description of the ghost yes its someone she knew shes the antithesis of the fairy godmother id always wished for all she needs now is a lit cigar between her fingers to totally bastardize that particular childhood fantasy the telling of this story starts with jocelyn waiting for kait then when she comes out we get the front row seat of the happenings as we are there with her then we are back to the morning and kait leaves and jocelyn gets ready for christmas at her mothers and waits for her boyfriend we seen clues that last night might not have been a dream but then things change and answers arent connecting with what happened so maybe it all was after being out with the girls drinking and all im left curious as to what was real and what was a dream i think its all real but there is more than real and not here theres secrets i want to know what they are there is something special about jocelyn but im not sure what exactly it is ill definitely be reading the first novel kiss of death very soon to figure it all out this is definitely a romance with intimate moments,5.340482573726542,0.9973190348525468,0,0,positive
NUM out of NUM stars why and how to forgive and reap the benefits october NUM NUM by becca chopra author of the chakra diaries big island of hawaii see all my reviews vine voice this review is from forgive to win end selfsabotage get everything you want paperback buddha said holding on to anger is like grasping a hot coal with the intent of throwing it at someone else you are the one who gets burned counseling a client with this wise saying she retorted forgiveness is not a concept in my religion well we are all holding on to anger of one sort or another against one parent or another an exlover exhusband exemployer etc and forgive to win finally explains fully and completely why its necessary and in our own best interests to let it go whatever your beliefs there are many wonderful books on forgiveness already on my bookshelf but i welcomed this one because it lays out a structured daily program its not just a philosophical treatise the steps suggested help train ourselves to let go of selfsabotaging behavior and learn to love ourselves by loving and forgiving others no matter what theyve done once our selfesteem is raised to the heights possible through selflove theres nothing you cant achieve forgiveness may seem like an easy task but its not and dr jacobson guides the reader in how to forgive his forgiveness diet is a unique set of recommendations to help us establish and maintain the NUM day commitment to acts of kindness and forgiveness that could literally change ones life he also includes forgiveness affirmations and visualizations that i have already used to great benefit looking at it from the vantage point of health anger throws us offbalance and creates tension that can lead to chronic pain and disease forgiveness opens the floodgates of our bodys own healing energy and keeps us grounded alert empowered and able to recognize opportunities for success the last chapter of the book is entitled getting everything you want if you dont already have it its time to get this book namaste becca chopra author of the chakra diaries URL,5.7154696132596685,0.9972375690607734,0,0,positive


# Combined Feature Set and Output


In [0]:
# =====================================================
# IV. Combined Feature Set and Output (Optimized Version)
# =====================================================

from pyspark.sql.functions import col
from pyspark.sql import SparkSession

# Step 1: Select key metadata columns
metadata_df = train_df.select("review_id", "book_id", "rating")

# Step 2: Select engineered numeric features
numeric_features_df = train_df.select(
    "review_id",
    "review_length_words",
    "review_length_chars",
    "sentiment_pos",
    "sentiment_neg",
    "sentiment_neu",
    "sentiment_compound",
    "avg_word_length",
    "unique_word_ratio",
    "exclamation_count",
    "contains_url"
)

# Step 3: Convert TF-IDF (Pandas → Spark)
# Make sure tfidf_df includes a review_id column
tfidf_spark_df = spark.createDataFrame(tfidf_df)

# Step 4: Convert BERT embeddings (Pandas → Spark)
bert_spark_df = spark.createDataFrame(train_sample[["review_id", "bert_embedding"]])

# Step 5: Combine all features on review_id (inner joins)
combined_df = (
    metadata_df
    .join(numeric_features_df, "review_id", "inner")
    .join(tfidf_spark_df, "review_id", "inner")
    .join(bert_spark_df, "review_id", "inner")
)

# Step 6: Save to Gold layer safely
output_base = "abfss://lakehouse@goodreadsreviews60105179.dfs.core.windows.net/gold/features_v2"

combined_df.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save(f"{output_base}/train_final")

print("✅ Combined feature dataset saved successfully to features_v2/train_final")

# Step 7: Lightweight sanity check (NO full count)
print(f"Number of columns: {len(combined_df.columns)}")
print("Sample records:")
display(combined_df.limit(5))


[0;31m---------------------------------------------------------------------------[0m
[0;31mThe Python process exited with exit code 137 (SIGKILL: Killed). This may have been caused by an OOM error. Check your command's memory usage.[0m
[0;31m[0m
[0;31m[0m
[0;31m[0m
[0;31mThe last 10 KB of the process's stderr and stdout can be found below. See driver logs for full logs.[0m
[0;31m---------------------------------------------------------------------------[0m
[0;31mLast messages on stderr:[0m
[0;31mWed Nov 12 21:32:15 2025 Connection to spark from PID  3015[0m
[0;31mWed Nov 12 21:32:16 2025 Initialized gateway on port 38051[0m
[0;31m  """The sequence number of this run attempt for a triggered job run. The initial attempt of a run[0m
[0;31m  """The sequence number of this run attempt for a triggered job run. The initial attempt of a run[0m
[0;31m  """The sequence number of this run attempt for a triggered job run. The initial attempt of a run[0m
[0;31mWed Nov 12 