In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, exp, lit, expr, array, size, when, array_intersect
from pyspark.sql.types import IntegerType


In [3]:
spark = SparkSession.builder.appName("MovieLensALS").getOrCreate()

data_path = '../data/ml-100k/u.data'
columns = ["userId", "movieId", "rating", "timestamp"]
ratings = spark.read.csv(data_path, sep="\t", inferSchema=True).toDF(*columns)


ratings = ratings.withColumn("userId", col("userId").cast(IntegerType()))
ratings = ratings.withColumn("movieId", col("movieId").cast(IntegerType()))

max_timestamp = ratings.agg({'timestamp': 'max'}).collect()[0][0]
ratings = ratings.withColumn('weight', exp(-(lit(max_timestamp) - col('timestamp')) / lit(10**6)))

24/11/26 19:08:54 WARN Utils: Your hostname, MacBook-Air-Nasdorm.local resolves to a loopback address: 127.0.0.1; using 192.168.186.71 instead (on interface en0)
24/11/26 19:08:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/26 19:08:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Пути к данным
movies_path = '../data/ml-100k/u.item'

# Определяем имена колонок
movies_columns = [
    "movieId", "title", "release_date", "video_release_date", "imdb_url",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Чтение данных
movies = spark.read.csv(movies_path, sep="|", inferSchema=True).toDF(*movies_columns)

# Убедимся, что данные загружены корректно
movies.show(5)


+-------+-----------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|movieId|            title|release_date|video_release_date|            imdb_url|unknown|Action|Adventure|Animation|Children's|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|
+-------+-----------------+------------+------------------+--------------------+-------+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+
|      1| Toy Story (1995)| 01-Jan-1995|              NULL|http://us.imdb.co...|      0|     0|        0|        1|         1|     1|    0|          0|    0|      0|        0|     0|      0|      0|      0|     0|       0|  0|      0|
|      2| GoldenEye (1995)| 01-Jan-1995|              NULL|h

In [5]:
user_activity = ratings.groupBy("userId").count()
user_activity.orderBy("count", ascending=False).show(5)

movie_popularity = ratings.groupBy("movieId").count()
movie_popularity.orderBy("count", ascending=False).show(5)


active_users = user_activity.filter(col("count") >= 5).select("userId")
filtered_ratings = ratings.join(active_users, on="userId", how="inner")

popular_movies = movie_popularity.filter(col("count") >= 5).select("movieId")
filtered_ratings = filtered_ratings.join(popular_movies, on="movieId", how="inner")


print(f"Количество записей после фильтрации: {filtered_ratings.count()}")
filtered_ratings.show(5)



+------+-----+
|userId|count|
+------+-----+
|   405|  737|
|   655|  685|
|    13|  636|
|   450|  540|
|   276|  518|
+------+-----+
only showing top 5 rows

+-------+-----+
|movieId|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
+-------+-----+
only showing top 5 rows

Количество записей после фильтрации: 99287
+-------+------+------+---------+--------------------+
|movieId|userId|rating|timestamp|              weight|
+-------+------+------+---------+--------------------+
|    242|   196|     3|881250949|5.928798377342577E-6|
|    302|   186|     3|891717742| 0.20827499106941064|
|    377|    22|     1|878887116|5.576568655535087E-7|
|     51|   244|     2|880606923|3.113649646606838E-6|
|    346|   166|     1|886397596|0.001018889469706...|
+-------+------+------+---------+--------------------+
only showing top 5 rows



In [6]:
avg_ratings = filtered_ratings.groupBy("userId").avg("rating").withColumnRenamed("avg(rating)", "avg_user_rating")
normalized_ratings = filtered_ratings.join(avg_ratings, on="userId")
normalized_ratings = normalized_ratings.withColumn("normalized_rating", col("rating") - col("avg_user_rating"))


In [7]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Split data into training and testing sets
(train_data, test_data) = normalized_ratings.randomSplit([0.8, 0.2], seed=42)

# Build ALS model
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop"
)

# Define parameter grid for hyperparameter tuning
paramGrid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 20, 30]) \
    .addGrid(als.maxIter, [10, 15]) \
    .addGrid(als.regParam, [0.01, 0.1, 0.5]) \
    .build()

# Define evaluator
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

# Set up cross-validation
crossval = CrossValidator(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)

cv_model = crossval.fit(train_data)
best_model = cv_model.bestModel

# Print best hyperparameters
print(f"Лучший ранг: {best_model.rank}")
print(f"Лучший параметр регуляризации: {best_model._java_obj.parent().getRegParam()}")
print(f"Лучшее число итераций: {best_model._java_obj.parent().getMaxIter()}")

predictions = best_model.transform(test_data)

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error: {rmse}")


24/11/26 19:09:01 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/11/26 19:09:01 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/11/26 19:09:02 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/11/26 19:09:06 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


Лучший ранг: 30
Лучший параметр регуляризации: 0.1
Лучшее число итераций: 15
Root-mean-square error: 0.9138607026034773


In [8]:
# Объединение предсказаний ALS с данными о фильмах
predictions_with_content = best_model.transform(normalized_ratings).join(movies, on="movieId", how="left")

# Добавим популярность фильмов
movie_popularity = ratings.groupBy("movieId").count().withColumnRenamed("count", "popularity")
predictions_with_content = predictions_with_content.join(movie_popularity, on="movieId", how="left")

# Создаем финальный набор данных
feature_columns = [
    "prediction", "popularity", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Собираем все признаки в вектор
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
final_data = assembler.transform(predictions_with_content)

# Проверяем финальную структуру
final_data.select("features", "rating").show(5, truncate=False)


24/11/26 19:10:16 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------------------------------------------------+------+
|features                                                      |rating|
+--------------------------------------------------------------+------+
|(20,[0,1,2,7,9],[3.377946615219116,413.0,1.0,1.0,1.0])        |1     |
|(20,[0,1,6,7],[4.271738052368164,241.0,1.0,1.0])              |5     |
|(20,[0,1,2,3,16,17],[3.540034055709839,151.0,1.0,1.0,1.0,1.0])|4     |
|(20,[0,1,2,3,6,15],[4.324306964874268,324.0,1.0,1.0,1.0,1.0]) |5     |
|(20,[0,1,4,6],[3.9470107555389404,66.0,1.0,1.0])              |4     |
+--------------------------------------------------------------+------+
only showing top 5 rows



In [9]:
from pyspark.sql.functions import col
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Преобразуем данные в pandas
final_data_pd = final_data.select("features", "rating").toPandas()

# Разделяем на X и y
X = np.vstack(final_data_pd["features"].values)
y = final_data_pd["rating"].values

# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучение LightGBM
model = LGBMRegressor()
model.fit(X_train, y_train)

# Предсказания и оценка
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE для гибридной модели: {rmse}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003864 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 528
[LightGBM] [Info] Number of data points in the train set: 79429, number of used features: 20
[LightGBM] [Info] Start training from score 3.534893
RMSE для гибридной модели: 0.666701203264879


In [10]:
# Тестовые данные для конкретного пользователя
test_user = 10
user_data = final_data.filter(col("userId") == test_user).select("features", "movieId")

# Предсказания гибридной модели
user_data_pd = user_data.toPandas()
X_user = np.vstack(user_data_pd["features"].values)
predicted_ratings = model.predict(X_user)

# Сортировка фильмов по предсказанному рейтингу
recommendations = user_data_pd.copy()
recommendations["predicted_rating"] = predicted_ratings
recommendations = recommendations.sort_values(by="predicted_rating", ascending=False)

print("Топ-10 фильмов для пользователя:")
print(recommendations.head(10))


Топ-10 фильмов для пользователя:
                                              features  movieId  \
78   (4.836489677429199, 243.0, 0.0, 0.0, 0.0, 0.0,...      483   
81   (4.72000789642334, 413.0, 1.0, 0.0, 0.0, 0.0, ...      127   
29   (4.753217697143555, 283.0, 0.0, 0.0, 0.0, 0.0,...       64   
94   (4.767781734466553, 264.0, 0.0, 0.0, 0.0, 0.0,...      357   
5    (4.60173225402832, 508.0, 0.0, 0.0, 0.0, 0.0, ...      100   
117  (4.664138317108154, 125.0, 0.0, 0.0, 0.0, 0.0,...      178   
108  (4.6776580810546875, 179.0, 0.0, 0.0, 0.0, 0.0...      480   
83   (4.628182411193848, 390.0, 0.0, 0.0, 0.0, 0.0,...       98   
120  (4.716010570526123, 198.0, 0.0, 0.0, 0.0, 0.0,...      134   
40   (4.65696907043457, 152.0, 1.0, 1.0, 0.0, 0.0, ...      498   

     predicted_rating  
78           4.923915  
81           4.914065  
29           4.903428  
94           4.892967  
5            4.884192  
117          4.874994  
108          4.866599  
83           4.865920  
120          

In [11]:
# Generate top recommendations for each user
user_recs = best_model.recommendForAllUsers(10)
user_recs.show(5, truncate=False)

# Generate top recommendations for each movie
movie_recs = best_model.recommendForAllItems(10)
movie_recs.show(5, truncate=False)

                                                                                

+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                  |
+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1     |[{169, 4.942462}, {408, 4.8926034}, {1449, 4.823004}, {50, 4.7937336}, {56, 4.7381635}, {1142, 4.726892}, {89, 4.7018657}, {285, 4.691173}, {127, 4.68933}, {963, 4.6632724}]    |
|3     |[{320, 4.490426}, {1143, 4.1743}, {902, 4.171051}, {634, 4.093288}, {347, 4.061003}, {180, 4.0443606}, {346, 4.028964}, {340, 4.003672}, {205, 3.9212186}, {187, 3.882751}]      |
|5     |[{169, 4.6475086}, {430, 4.3971024}, {408, 4.3383975}, {4



+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|recommendations                                                                                                                                                                  |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1      |[{810, 5.160222}, {688, 5.0254846}, {357, 5.0184965}, {850, 4.954297}, {849, 4.942036}, {939, 4.920076}, {9, 4.905365}, {173, 4.884948}, {324, 4.8621187}, {477, 4.8317237}]     |
|3      |[{366, 4.46803}, {324, 4.3048806}, {137, 4.2472157}, {332, 4.180016}, {677, 4.1679173}, {372, 4.1348}, {859, 4.1266055}, {777, 4.108748}, {628, 4.1043367}, {472, 4.1040664}]    |
|5      |[{688, 4.6776476}, {849, 4.5631094}, {907, 4.504848

                                                                                