## Connect to ADLS Gen2 (Storage Account)

In [0]:
# Connect to ADLS Gen2 (Storage Account)
# The storage account key is read from a Databricks secret scope instead of
# being pasted into the notebook, so it does not end up in source control or
# in exported copies of this notebook.
# NOTE(review): SECRET_SCOPE and SECRET_KEY_NAME are placeholders — create the
# scope/secret (or point these at an existing one) before running this cell.
# The key that used to be hard-coded here should be treated as leaked and
# rotated in the storage account.
STORAGE_ACCOUNT = "goodreadsreviews60300294"
SECRET_SCOPE = "goodreads-lab"
SECRET_KEY_NAME = "adls-account-key"

spark.conf.set(
    f"fs.azure.account.key.{STORAGE_ACCOUNT}.dfs.core.windows.net",
    dbutils.secrets.get(scope=SECRET_SCOPE, key=SECRET_KEY_NAME),
)


## Load the curated Gold dataset from previous Lab (Lab3)

In [0]:
# Location of the curated Gold Delta table produced in the previous lab.
gold_curated_path = "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/gold/curated_reviews/"

# Read the Delta table into the DataFrame that the later cells start from.
delta_reader = spark.read.format("delta")
df = delta_reader.load(gold_curated_path)

# Quick look: row count, column types, and five untruncated rows.
curated_row_count = df.count()
print("Total rows in curated_reviews:", curated_row_count)
df.printSchema()
df.show(5, truncate=False)

In [0]:
# import necessary libraries
from pyspark.sql.functions import col, length, udf, split, size
from pyspark.sql.types import StringType
import re

## Splitting the dataset 

In [0]:
# Split the curated dataset into train / validation / test (70% / 15% / 15%).
# A fixed seed (42) is passed to randomSplit.
split_weights = [0.7, 0.15, 0.15]
train_df, val_df, test_df = df.randomSplit(split_weights, seed=42)

# Report the size of each split.
for label, split_df in (
    ("Train rows:", train_df),
    ("Val rows:  ", val_df),
    ("Test rows: ", test_df),
):
    print(label, split_df.count())

# Every split is stored in its own folder below features_v2/.
base_path = "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/gold/features_v2/"

train_path = base_path + "train"
val_path = base_path + "val"
test_path = base_path + "test"

# Write each split as Delta, overwriting whatever is already at the target path.
for split_df, target_path in (
    (train_df, train_path),
    (val_df, val_path),
    (test_df, test_path),
):
    split_df.write.format("delta").mode("overwrite").save(target_path)

print("✅ Saved train/val/test splits under features_v2/")

## Quick check that the split files were written
#### Not required by the lab, but added as a sanity check

In [0]:
# List the entries directly under features_v2/ and render them as a table.
features_v2_listing = dbutils.fs.ls(base_path)
display(features_v2_listing)

## Load the TRAIN Split from the Gold Layer

In [0]:
# 1) Read the TRAIN split back from features_v2.
#    The paths are redefined here so this cell does not rely on the split cell's variables.
base_path = "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/gold/features_v2/"
train_path = base_path + "train"

delta_reader = spark.read.format("delta")
train_df = delta_reader.load(train_path)

# Row count plus a preview of the id and raw text (text cut at 150 characters).
original_train_rows = train_df.count()
print("Original TRAIN rows:", original_train_rows)
train_df.select("review_id", "review_text").show(5, truncate=150)


## Function to clean text

In [0]:
# 2) Define a Python function to clean text
def clean_text(text):
    """Normalise a raw review string to lowercase a-z words separated by single spaces.

    Steps, in order: lowercase; replace URLs (``http...`` / ``www....``) with
    the token ``url``; replace runs of digits with the token ``number``; blank
    out non-ASCII characters (emojis etc.); blank out everything that is not a
    lowercase letter or whitespace; collapse whitespace runs; trim the ends.

    Returns None when ``text`` is None, otherwise the cleaned string (which
    may be empty).
    """
    if text is None:
        return None

    # (pattern, replacement) pairs applied top to bottom. Order matters: the
    # placeholders have to be inserted before digits/punctuation are stripped.
    substitutions = (
        (r"http\S+|www\.\S+", " url "),   # URLs -> placeholder
        (r"\d+", " number "),             # numbers -> placeholder
        (r"[^\x00-\x7F]+", " "),          # emojis / non-ASCII
        (r"[^a-z\s]", " "),               # punctuation and anything else non-letter
        (r"\s+", " "),                    # whitespace runs -> one space
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

## Apply Text Cleaning and Normalize Review Text

In [0]:
# 3) Wrap clean_text as a Spark UDF that returns a string column
clean_text_udf = udf(clean_text, StringType())

# 4) Add the cleaned text next to the raw review text
train_with_clean_text = train_df.withColumn(
    "review_text_clean", clean_text_udf(col("review_text"))
)

# 5) Keep only rows whose cleaned text is present and at least 10 characters long
cleaned_text = col("review_text_clean")
has_clean_text = cleaned_text.isNotNull()
is_long_enough = length(cleaned_text) >= 10
train_clean = train_with_clean_text.filter(has_clean_text & is_long_enough)

cleaned_train_rows = train_clean.count()
print("Cleaned TRAIN rows:", cleaned_train_rows)

# 6) Side-by-side preview of raw vs cleaned text (cut at 50 characters)
preview_columns = ["review_id", "review_text", "review_text_clean"]
train_clean.select(*preview_columns).show(10, truncate=50)

## Extract Basic Text Features

In [0]:
# STEP 5A: Add basic text features to train_clean
clean_text_col = col("review_text_clean")

train_basic = (
    train_clean
    # 1) length of the cleaned text in characters
    .withColumn("review_length_chars", length(clean_text_col))
    # 2) number of pieces obtained by splitting the cleaned text on single spaces
    .withColumn("review_length_words", size(split(clean_text_col, " ")))
)

basic_feature_rows = train_basic.count()
print("Rows after adding basic features:", basic_feature_rows)

basic_preview_columns = [
    "review_id",
    "review_text_clean",
    "review_length_chars",
    "review_length_words",
]
train_basic.select(*basic_preview_columns).show(10, truncate=80)


## Installing NLTK + VADER

In [0]:
# Install NLTK into the notebook's Python environment.
%pip install nltk

import nltk
# Fetch the "vader_lexicon" resource used for the sentiment features.
nltk.download("vader_lexicon")

## Sentiment features

In [0]:
# STEP 5B: Sentiment features using NLTK VADER

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StructType, StructField, DoubleType

# 1) Create VADER analyzer (on driver)
#    Kept as a module-level name: get_sentiment_scores reads `analyzer` directly.
analyzer = SentimentIntensityAnalyzer()

# 2) Score one text with VADER and return the four scores as a tuple
def get_sentiment_scores(text):
    """Return the VADER ``(pos, neg, neu, compound)`` scores of ``text`` as floats.

    A None input yields all-zero scores instead of calling the analyzer.
    Relies on the module-level ``analyzer`` instance.
    """
    if text is None:
        return (0.0, 0.0, 0.0, 0.0)
    polarity = analyzer.polarity_scores(text)
    return tuple(float(polarity[key]) for key in ("pos", "neg", "neu", "compound"))

# 3) Struct schema of the UDF result: four non-nullable doubles
sentiment_fields = ("pos", "neg", "neu", "compound")
sentiment_schema = StructType(
    [StructField(field, DoubleType(), nullable=False) for field in sentiment_fields]
)

# 4) Register UDF
sentiment_udf = udf(get_sentiment_scores, sentiment_schema)

# 5) Score the cleaned text, flatten the struct into sentiment_* columns,
#    then drop the intermediate struct column.
# NOTE(review): review_text_clean is lowercased with punctuation removed (see
# clean_text); VADER also uses capitalisation and punctuation as cues, so
# scoring the raw review_text may be worth considering — confirm with the lab spec.
train_sent = train_basic.withColumn("sentiment", sentiment_udf(col("review_text_clean")))
for field in sentiment_fields:
    train_sent = train_sent.withColumn(f"sentiment_{field}", col(f"sentiment.{field}"))
train_sent = train_sent.drop("sentiment")

sentiment_rows = train_sent.count()
print("Rows after adding sentiment features:", sentiment_rows)

sentiment_preview_columns = [
    "review_id",
    "review_text_clean",
    "sentiment_pos",
    "sentiment_neg",
    "sentiment_neu",
    "sentiment_compound",
]
train_sent.select(*sentiment_preview_columns).show(10, truncate=80)


## install libraries

In [0]:
# Install scikit-learn, which provides the TfidfVectorizer imported for the TF-IDF features.
%pip install scikit-learn

## TF-IDF features

In [0]:
# STEP 5C: TF-IDF features on review_text_clean (TRAIN)

from pyspark.sql.functions import monotonically_increasing_id, col
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Tunables for this step. TF-IDF is fitted with scikit-learn, so the text is
# first collected to a Pandas DataFrame via toPandas().
# NOTE: step 7 is an INNER join on row_id, so only the rows sampled here end
# up in train_tfidf (and in everything derived from it).
TFIDF_SAMPLE_ROWS = 100     # rows collected with limit(...).toPandas()
TFIDF_MAX_FEATURES = 100    # vocabulary size kept by the vectorizer

# 1) Start from train_sent (has cleaned text + basic + sentiment features)
#    and tag every row with an id used to join the TF-IDF columns back.
train_for_tfidf = train_sent.withColumn("row_id", monotonically_increasing_id())

# 2) Move the needed columns to Pandas (capped at TFIDF_SAMPLE_ROWS rows)
pdf = (
    train_for_tfidf
    .select("row_id", "review_text_clean")
    .limit(TFIDF_SAMPLE_ROWS)
    .toPandas()
)

# 3) Prepare TF-IDF vectorizer
tfidf = TfidfVectorizer(
    max_features=TFIDF_MAX_FEATURES,  # keep only the top-N terms
    ngram_range=(1, 2),               # unigrams + bigrams
    stop_words="english"              # remove common English stopwords
)

# Replace NaN with empty string just in case
texts = pdf["review_text_clean"].fillna("")

# 4) Fit TF-IDF on TRAIN text
X = tfidf.fit_transform(texts)

# 5) Build a Pandas DataFrame with one column per term.
#    Bigram terms contain a space, so some column names do too.
feature_names = tfidf.get_feature_names_out()
tfidf_cols = [f"tfidf_{w}" for w in feature_names]

tfidf_pdf = pd.DataFrame(X.toarray(), columns=tfidf_cols)
tfidf_pdf["row_id"] = pdf["row_id"].values

# 6) Convert TF-IDF features back to Spark
tfidf_spark = spark.createDataFrame(tfidf_pdf)

# 7) Join TF-IDF features back to the TRAIN dataframe (inner join: unsampled rows are dropped)
train_tfidf = train_for_tfidf.join(tfidf_spark, on="row_id", how="inner")

print("Rows in train_tfidf:", train_tfidf.count())
print("Number of TF-IDF feature columns:", len(tfidf_cols))

# Show a sample with a few TF-IDF columns
sample_cols = ["review_id", "review_text_clean"] + tfidf_cols[:10]

train_tfidf.select(sample_cols).show(5, truncate=80)


## install libraries

In [0]:
# Install sentence-transformers, which provides the SentenceTransformer model used for the embeddings.
%pip install sentence-transformers

## Sentence-BERT Embeddings

In [0]:
# STEP 5D: Sentence-BERT Embeddings

from sentence_transformers import SentenceTransformer
from pyspark.sql.functions import col, monotonically_increasing_id
import pandas as pd
import numpy as np

# Rows collected to Pandas for encoding. Step 7 is an INNER join on row_id, so
# only these rows end up in train_final.
EMBED_SAMPLE_ROWS = 100

# 1) Load the model once (on driver)
model = SentenceTransformer('all-MiniLM-L6-v2')   # 384-dimensional embeddings

# 2) Reuse the row_id that train_tfidf already carries; add one only if absent.
#    Regenerating monotonically_increasing_id() here would be evaluated on top
#    of a join that Spark recomputes lazily, so the ids seen by toPandas() and
#    by the join below are not guaranteed to agree.
if "row_id" in train_tfidf.columns:
    train_embed = train_tfidf
else:
    train_embed = train_tfidf.withColumn("row_id", monotonically_increasing_id())

# 3) Select only what we need
pdf = (
    train_embed
    .select("row_id", "review_text_clean")
    .limit(EMBED_SAMPLE_ROWS)
    .toPandas()
)

# np.vstack fails with an unhelpful message on an empty list; fail early instead.
if pdf.empty:
    raise ValueError("train_tfidf produced no rows; nothing to embed")

# Replace any None with " " to avoid errors
pdf["review_text_clean"] = pdf["review_text_clean"].fillna(" ")

# 4) Encode in batches to avoid GPU/CPU memory issues
batch_size = 256
embeddings_list = []

for i in range(0, len(pdf), batch_size):
    batch_texts = pdf["review_text_clean"].iloc[i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch_texts)
    embeddings_list.append(batch_embeddings)

# Stack all batches together -> one row per review, one column per dimension
embeddings = np.vstack(embeddings_list)

# 5) Build Pandas DataFrame
embed_cols = [f"embed_{i}" for i in range(embeddings.shape[1])]
embed_pdf = pd.DataFrame(embeddings, columns=embed_cols)

# Add row_id back (same row order as pdf)
embed_pdf["row_id"] = pdf["row_id"].values

# 6) Convert back to Spark
embed_spark = spark.createDataFrame(embed_pdf)

# 7) Join embeddings with the training dataset (inner join on row_id)
train_final = train_embed.join(embed_spark, on="row_id", how="inner")

print("Rows in final train dataset:", train_final.count())
print("Embedding columns:", len(embed_cols))


## Sanitize Column Names for Delta Compatibility

In [0]:
# Start from the final training dataframe; columns are renamed on this
# variable so that train_final itself is left untouched.
clean_train_final = train_final

# Function to make column names Delta-safe
def sanitize_col(name):
    """Return ``name`` with characters that are unsafe in Delta column names replaced.

    Surrounding whitespace is trimmed, each of `` ,;{}()\\n\\t=`` becomes an
    underscore, and every run of underscores (including ones already present
    in the name) is squeezed to a single underscore.
    """
    trimmed = name.strip()
    underscored = re.sub(r"[ ,;{}()\n\t=]", "_", trimmed)
    return re.sub(r"_+", "_", underscored)

# Work out which columns actually need a new name, then rename only those.
renames = {
    current: sanitize_col(current)
    for current in clean_train_final.columns
    if sanitize_col(current) != current
}
for current, safe in renames.items():
    clean_train_final = clean_train_final.withColumnRenamed(current, safe)

# Optional: quickly inspect some of the TF-IDF columns
print("Sample columns after sanitizing:")
tfidf_column_names = [c for c in clean_train_final.columns if c.startswith("tfidf_")]
print(tfidf_column_names[:20])


## Save final TRAIN features to Gold layer

In [0]:
# STEP 6 & 7: Persist the finished TRAIN feature set to the Gold layer

features_train_path = "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/gold/features_v2/train_features/"

# Delta format, overwriting whatever is already stored at the target path.
train_features_writer = clean_train_final.write.format("delta").mode("overwrite")
train_features_writer.save(features_train_path)

print("✅ Saved final training features to:")
print(features_train_path)


## Register final TRAIN features as a managed Delta table

In [0]:
features_train_path = "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/gold/features_v2/train_features/"

# 1) Load the feature set back from its Delta location in ADLS
delta_reader = spark.read.format("delta")
df_features = delta_reader.load(features_train_path)

# 2) Switch to the 'default' schema, drop any existing table of the same name,
#    and write the features as a table there
spark.sql("USE SCHEMA default")
spark.sql("DROP TABLE IF EXISTS goodreads_train_features")

table_writer = df_features.write.format("delta").mode("overwrite")
table_writer.saveAsTable("default.goodreads_train_features")

# 3) Quick checks: total row count, then the ten longest reviews by word count
row_count_sql = "SELECT COUNT(*) AS rows FROM default.goodreads_train_features"
spark.sql(row_count_sql).show()

longest_reviews_sql = """
  SELECT 
    review_id,
    rating,
    review_length_words,
    sentiment_compound
  FROM default.goodreads_train_features
  ORDER BY review_length_words DESC
  LIMIT 10
"""
spark.sql(longest_reviews_sql).show(truncate=False)


## Save the features to a CSV file

In [0]:
csv_path = "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/gold/features_v2_sample/"

# coalesce(1) reduces the data to a single partition before writing;
# the header option adds the column names as the first row.
(
    df_features
    .coalesce(1)
    .write
    .option("header", "true")
    .mode("overwrite")
    .csv(csv_path)
)

print("Saved CSV to:", csv_path)


## rename the csv file

In [0]:
# Folder the CSV export was written to, and the stable file name we want.
sample_dir = "abfss://lakehouse@goodreadsreviews60300294.dfs.core.windows.net/gold/features_v2_sample/"
new_file = sample_dir + "data_lab4.csv"

# Spark names its output part-<n>-tid-<random>...csv and the random part
# changes on every write, so look the file up instead of hard-coding its name.
part_files = [
    entry.path
    for entry in dbutils.fs.ls(sample_dir)
    if entry.name.startswith("part-") and entry.name.endswith(".csv")
]

# Exactly one part file is expected because the export coalesces to one partition.
if len(part_files) != 1:
    raise RuntimeError(
        f"Expected exactly one part-*.csv file in {sample_dir}, found {len(part_files)}"
    )

old_file = part_files[0]
dbutils.fs.mv(old_file, new_file)


## Check what the folder contains

In [0]:
# List the entries in the CSV export folder and render them as a table.
csv_folder_listing = dbutils.fs.ls(csv_path)
display(csv_folder_listing)