In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Load training data
df = spark.table("workspace.gold.fact_ratings").select("userId", "movieId", "rating")

# Cache for performance
#df.cache() # --> commented because doesn't work in databricks free specifically
print(f"Training data: {df.count()} rows")

# Train/test split
(training, test) = df.randomSplit([0.8, 0.2], seed=42)

# Build ALS model
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=10,
    regParam=0.1,
    rank=10,
    coldStartStrategy="drop",  
    nonnegative=True
)

model = als.fit(training)
print("✅ Model trained")

# Save to a Unity Catalog Volume
model.write().overwrite().save("/Volumes/workspace/default/movielens_raw/models/als_model")
print("✅ Model saved")

# Evaluate
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"📊 RMSE on test set: {rmse:.4f}")

# Unpersist cached data 
#df.unpersist() # --> commented because doesn't work in databricks free specifically


Training data: 32000204 rows
✅ Model trained
✅ Model saved
📊 RMSE on test set: 0.8089


In [0]:
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, row_number
from pyspark.sql import Window

# Pick top 100 power users (or any sample)
sample_users = (
    spark.table("workspace.gold.dim_users")
    .filter("is_power_user = true")
    .orderBy(col("total_ratings").desc())
    .select("userId")
    .limit(100)
)

# Get all movie IDs
all_movies = spark.table("workspace.gold.fact_ratings").select("movieId").distinct()

# Create all user-movie pairs for the sample
user_movie_pairs = sample_users.crossJoin(all_movies)

# Predict
model = ALSModel.load("/Volumes/workspace/default/movielens_raw/models/als_model")
print("✅ Model loaded")
predictions = model.transform(user_movie_pairs).filter(col("prediction").isNotNull())

# Top 10 per user
window = Window.partitionBy("userId").orderBy(col("prediction").desc())

user_recs = (
    predictions
    .withColumn("rank", row_number().over(window))
    .filter(col("rank") <= 10)
    .drop("rank")
)

user_recs.write.format("delta").mode("overwrite").saveAsTable("workspace.gold.user_recommendations")
print(f"✅ Recommendations for {sample_users.count()} users")

# Show sample with movie titles
print("\n--- Top 10 for first user ---")
first_user = user_recs.select("userId").first()[0]
(
    user_recs
    .filter(col("userId") == first_user)
    .join(spark.table("workspace.gold.dim_movies_enriched").select("movieId", "title"), on="movieId")
    .orderBy(col("prediction").desc())
    .show(10, truncate=False)
)

✅ Model loaded
✅ Recommendations for 100 users

--- Top 10 for first user ---
+-------+------+----------+-------------------------------------------------------------------------------------+
|movieId|userId|prediction|title                                                                                |
+-------+------+----------+-------------------------------------------------------------------------------------+
|240070 |5029  |5.7764626 |SpongeBob SquarePants: Heroes of Bikini Bottom (2011)                                |
|240054 |5029  |5.7764626 |SpongeBob SquarePants: Tide and Seek                                                 |
|270306 |5029  |5.7764626 |WWE: The Triumph and Tragedy of World Class Championship Wrestling (2007)            |
|177209 |5029  |5.63747   |Acı Aşk (2009)                                                                       |
|177209 |5029  |5.63747   |Acı Aşk (2009)                                                                       |
|194434 |5