### Recommender System

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spark = SparkSession.builder.appName("Homework_12.1").getOrCreate()

In [4]:
data = spark.read.csv('../Datasets/ratings.csv', inferSchema=True, header=True)

In [5]:
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [6]:
data.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [7]:
training, test = data.randomSplit([0.8, 0.2], seed=42)

In [11]:
# Use coldStartStrategy="drop" in ALS
als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating', coldStartStrategy="drop")

In [12]:
model = als.fit(training)

In [15]:
predictions = model.transform(test)
# Remove NaN predictions before computing RMSE
predictions = predictions.na.drop()

In [16]:
# Evaluate RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Updated RMSE: {rmse:.6f}")

Updated RMSE: 1.088378


In [17]:
# Find the most recommended movie for User (2221610070)
id = 70

In [18]:
user_recommendations = model.recommendForAllUsers(10)

In [19]:
recommended_movies = user_recommendations.filter(user_recommendations.userId == id).select("recommendations").collect()

In [20]:
if recommended_movies:
    most_recommended_movie = recommended_movies[0]["recommendations"][0]["movieId"]
    print(f"Most Recommended Movie ID for User {id}: {most_recommended_movie}")
else:
    print(f"No recommendations found for User {id}")

Most Recommended Movie ID for User 70: 381
