In [0]:
# --- Configuration -----------------------------------------------------------
# Catalog and schema identifiers; the table names below are built from these.
catalog_name = "big_data_project"
silver_schema_name = "silver"
gold_schema_name = "gold"

# Fully qualified names in <catalog>.<schema>.<table> form:
#   silver_table - input read by this notebook
#   gold_table   - output name consumed by the later write step
gold_table = f"{catalog_name}.{gold_schema_name}.products_with_features"
silver_table = f"{catalog_name}.{silver_schema_name}.products_cleaned"

# --- Load --------------------------------------------------------------------
# Get a DataFrame handle on the silver table.
silver_df = spark.table(silver_table)
print(f"Successfully loaded Silver table: {silver_table}")

# Show only a handful of rows as a sanity check rather than the whole table.
silver_preview_df = silver_df.limit(5)
display(silver_preview_df)

In [0]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline

# Text -> TF-IDF feature pipeline. Each stage reads the column written by the
# previous one:
#   searchable_text -> words -> filtered_words -> rawFeatures -> features

# Stage 1: split the raw text column into a list of tokens.
tokenizer = Tokenizer(
    inputCol="searchable_text",
    outputCol="words",
)

# Stage 2: drop stop words from the token list (default stop-word settings).
stopwords_remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)

# Stage 3: hash each remaining token into one of `numFeatures` buckets and
# count occurrences, giving a sparse term-frequency vector.
# NOTE(review): the Spark MLlib guide advises a power of two for numFeatures
# so hashed terms spread evenly over the indices. 20000 is kept here because
# changing it would change the dimension of the stored vectors - confirm with
# downstream consumers before switching to e.g. 2**14 or 2**15.
hashingTF = HashingTF(
    inputCol="filtered_words",
    outputCol="rawFeatures",
    numFeatures=20000,
)

# Stage 4: the only stage that is fitted to data. IDF rescales the raw counts
# so that terms appearing in few rows weigh more than terms appearing in many.
# Its output column, "features", is the final vector.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

# Chain the four stages, in order, into a single Pipeline estimator.
pipeline_stages = [tokenizer, stopwords_remover, hashingTF, idf]
pipeline = Pipeline(stages=pipeline_stages)

print("MLlib pipeline (using TF-IDF) created successfully.")

In [0]:
# Guard against null text before it reaches the pipeline. Tokenizer applies a
# plain string function to every row, so a single null in "searchable_text"
# aborts fit()/transform() with a NullPointerException. Replacing nulls with
# an empty string keeps those rows in the output instead of failing the job;
# when the column has no nulls this is a no-op.
model_input_df = silver_df.fillna({"searchable_text": ""})

# Fit the pipeline. The same DataFrame is used for fitting and transforming
# so the learned IDF weights match the rows being vectorized.
pipeline_model = pipeline.fit(model_input_df)

# Apply the fitted pipeline; this appends the intermediate columns and the
# final 'features' vector column.
vectorized_data_df = pipeline_model.transform(model_input_df)

# Columns kept for the gold table. Intermediate pipeline columns (words,
# filtered_words, rawFeatures) and the raw text are left out on purpose.
gold_columns = [
    "asin",
    "title",
    "productURL",
    "imgUrl",
    "price",
    "stars",
    "category_name",
    "features",  # TF-IDF vector produced by the pipeline
]
gold_df = vectorized_data_df.select(*gold_columns)

print("Model training complete. Data transformed with feature vectors.")
display(gold_df.limit(5))

In [0]:
# Persist the feature-enriched DataFrame under the gold table name.
# Delta format, "overwrite" mode: any existing table contents are replaced.
gold_writer = gold_df.write.format("delta").mode("overwrite")
gold_writer.saveAsTable(gold_table)

print(f"Successfully created Gold table: {gold_table}")