In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col, exp, lit
from pyspark.sql.types import IntegerType


In [6]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("MovieLensALS")
    .getOrCreate()
)

# The file is read as tab-separated with an inferred schema; the columns are
# then renamed by position and both id columns are cast to integers.
data_path = '../data/ml-100k/u.data'
columns = ["userId", "movieId", "rating", "timestamp"]
ratings = (
    spark.read.csv(data_path, sep="\t", inferSchema=True)
    .toDF(*columns)
    .withColumn("userId", col("userId").cast(IntegerType()))
    .withColumn("movieId", col("movieId").cast(IntegerType()))
)

# Recency weight: exp(-(newest timestamp - row timestamp) / 10**6), so the most
# recent rating gets weight 1.0 and older ratings decay towards 0.
# NOTE(review): timestamps are presumably Unix seconds - confirm against the
# dataset documentation.
max_timestamp = ratings.agg({'timestamp': 'max'}).first()[0]
ratings = ratings.withColumn(
    'weight',
    exp(-(lit(max_timestamp) - col('timestamp')) / lit(10**6)),
)

In [7]:
# Number of ratings per user; preview the five most active users.
user_activity = ratings.groupBy("userId").count()
user_activity.orderBy(col("count").desc()).show(5)

# Number of ratings per movie; preview the five most rated movies.
movie_popularity = ratings.groupBy("movieId").count()
movie_popularity.orderBy(col("count").desc()).show(5)

# Ids of users and movies that have at least five ratings each. Both counts are
# taken on the unfiltered ratings.
active_users = user_activity.where(col("count") >= 5).select("userId")
popular_movies = movie_popularity.where(col("count") >= 5).select("movieId")

# Keep only the ratings whose user and movie both pass the threshold.
filtered_ratings = (
    ratings
    .join(active_users, on="userId", how="inner")
    .join(popular_movies, on="movieId", how="inner")
)

print(f"Количество записей после фильтрации: {filtered_ratings.count()}")
filtered_ratings.show(5)



+------+-----+
|userId|count|
+------+-----+
|   405|  737|
|   655|  685|
|    13|  636|
|   450|  540|
|   276|  518|
+------+-----+
only showing top 5 rows

+-------+-----+
|movieId|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
+-------+-----+
only showing top 5 rows

Количество записей после фильтрации: 99287
+-------+------+------+---------+--------------------+
|movieId|userId|rating|timestamp|              weight|
+-------+------+------+---------+--------------------+
|    242|   196|     3|881250949|5.928798377342577E-6|
|    302|   186|     3|891717742| 0.20827499106941064|
|    377|    22|     1|878887116|5.576568655535087E-7|
|     51|   244|     2|880606923|3.113649646606838E-6|
|    346|   166|     1|886397596|0.001018889469706...|
+-------+------+------+---------+--------------------+
only showing top 5 rows



In [8]:
# Mean rating given by each user, computed over the filtered ratings.
avg_ratings = (
    filtered_ratings
    .groupBy("userId")
    .agg({"rating": "avg"})
    .withColumnRenamed("avg(rating)", "avg_user_rating")
)

# Attach each user's mean to their ratings and add the mean-centred rating
# (rating minus that user's average).
normalized_ratings = (
    filtered_ratings
    .join(avg_ratings, on="userId")
    .withColumn("normalized_rating", col("rating") - col("avg_user_rating"))
)


In [13]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# One seed shared by every stochastic step in this cell (the train/test split,
# ALS factor initialisation and cross-validation fold assignment).
SEED = 42

# Hold out 20% of the ratings for the final evaluation.
(train_data, test_data) = normalized_ratings.randomSplit([0.8, 0.2], seed=SEED)

# ALS collaborative-filtering estimator.
# coldStartStrategy="drop" removes rows whose prediction is NaN (users or movies
# not seen during fitting) so the RMSE below stays a finite number.
# NOTE(review): the model is trained on the raw `rating` column; the
# `normalized_rating` column is available in the data but is not used here -
# confirm this is intended.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=SEED,
)

# Hyperparameter grid: 3 ranks x 2 iteration counts x 3 regularisation values.
paramGrid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 20, 30]) \
    .addGrid(als.maxIter, [10, 15]) \
    .addGrid(als.regParam, [0.01, 0.1, 0.5]) \
    .build()

# RMSE between the true rating and the model's prediction.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

# 3-fold cross-validation over the grid.
crossval = CrossValidator(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=SEED,
)

cv_model = crossval.fit(train_data)
best_model = cv_model.bestModel

# Recover the winning hyperparameters through the public API instead of the
# private `_java_obj` handle: avgMetrics[i] is the cross-validated metric for
# paramGrid[i], and the best entry is the smallest one for RMSE (the evaluator
# itself reports whether larger is better).
avg_metrics = cv_model.avgMetrics
pick_best = max if evaluator.isLargerBetter() else min
best_index = pick_best(range(len(avg_metrics)), key=avg_metrics.__getitem__)
best_params = paramGrid[best_index]

# Print best hyperparameters
print(f"Лучший ранг: {best_model.rank}")
print(f"Лучший параметр регуляризации: {best_params[als.regParam]}")
print(f"Лучшее число итераций: {best_params[als.maxIter]}")

# Score the selected model on the held-out test split.
predictions = best_model.transform(test_data)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error: {rmse}")


24/11/25 17:34:00 WARN CacheManager: Asked to cache already cached data.
24/11/25 17:34:00 WARN CacheManager: Asked to cache already cached data.


Лучший ранг: 30
Лучший параметр регуляризации: 0.1
Лучшее число итераций: 15
Root-mean-square error: 0.9152761522573798


In [14]:
# For every user, the 10 movies with the highest predicted rating.
user_recs = best_model.recommendForAllUsers(numItems=10)
user_recs.show(n=5, truncate=False)

# For every movie, the 10 users with the highest predicted rating.
movie_recs = best_model.recommendForAllItems(numUsers=10)
movie_recs.show(n=5, truncate=False)

                                                                                

+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                  |
+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1     |[{1449, 4.967152}, {169, 4.859849}, {1142, 4.8272376}, {408, 4.8224764}, {694, 4.772437}, {127, 4.751755}, {647, 4.7067604}, {50, 4.6776357}, {963, 4.677509}, {114, 4.6643744}] |
|3     |[{320, 4.569999}, {902, 4.5201254}, {1143, 4.1284337}, {205, 4.123393}, {346, 4.066987}, {50, 4.0470595}, {430, 3.9978406}, {172, 3.995446}, {347, 3.9828742}, {340, 3.9738388}] |
|5     |[{169, 4.599252}, {408, 4.5115423}, {968, 4.35653}, {430,



+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|recommendations                                                                                                                                                                 |
+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1      |[{357, 5.046687}, {688, 5.042293}, {849, 5.017312}, {850, 4.9840903}, {810, 4.9291167}, {324, 4.8674173}, {939, 4.85382}, {173, 4.844843}, {16, 4.822598}, {477, 4.7336397}]    |
|3      |[{472, 4.249589}, {332, 4.2368336}, {887, 4.2352324}, {628, 4.2272534}, {388, 4.222994}, {849, 4.1835136}, {372, 4.1828985}, {863, 4.164849}, {348, 4.154648}, {130, 4.1510344}]|
|5      |[{688, 4.715366}, {849, 4.562317}, {462, 4.546496}, {907

                                                                                