In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=ff45e3c7f2160cddce248603f4ab5ea7e97861b2ec44ff1f2504eb0c05e5fc2f
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

spark = SparkSession.builder \
    .appName("BookRecommendationALS") \
    .getOrCreate()

data = spark.read.csv('book_ratings.csv', header=True, inferSchema=True)

data.printSchema()
data.show(5)

# Define ALS model
als = ALS(
    maxIter=10,  # Number of iterations
    userCol="user_id",  # Column for user ID
    itemCol="book_id",  # Column for item (book) ID
    ratingCol="rating",  # Column for ratings
    coldStartStrategy="drop"  # Drop rows with NaN predictions
)

# Fit the ALS model
model = als.fit(data)

# Predict ratings using the trained model
predictions = model.transform(data)

# Define RegressionEvaluator
evaluator = RegressionEvaluator(
    metricName="rmse",  # Root Mean Squared Error
    labelCol="rating",  # Column for the actual ratings
    predictionCol="prediction"  # Column for predicted ratings
)

# Evaluate the model
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Show predictions for User ID = 53
user_id = 53
user_predictions = predictions.filter(col("user_id") == user_id)
user_predictions = user_predictions.select("book_id", "user_id", "rating", "prediction").orderBy(col("prediction").desc())
user_predictions.show(truncate=False)

# Show 5 recommended books for all users
user_recommendations = model.recommendForAllUsers(5)
user_recommendations.show(truncate=False)

# Show 5 recommended users for all books
item_recommendations = model.recommendForAllItems(5)
item_recommendations.show(truncate=False)

# Stop SparkSession
spark.stop()


root
 |-- book_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows

Root Mean Squared Error (RMSE): 0.5959071011688031
+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|8946   |53     |5     |4.286709  |
|8882   |53     |2     |2.0793421 |
|8336   |53     |1     |1.2187043 |
|8336   |53     |1     |1.2187043 |
+-------+-------+------+----------+

+-------+-----------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                |
+-------+----------------------------------------------------------------