In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType


In [2]:
# Start (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName("MovieLensALS").getOrCreate()

# Location of the tab-separated ratings file and the names to give its four columns.
data_path = '../data/ml-100k/u.data'
columns = ["userId", "movieId", "rating", "timestamp"]

# Read the file with schema inference, name the columns, and make sure both
# id columns are integers (they are later passed to ALS as userCol / itemCol).
ratings = (
    spark.read
    .option("sep", "\t")
    .option("inferSchema", True)
    .csv(data_path)
    .toDF(*columns)
    .withColumn("userId", col("userId").cast(IntegerType()))
    .withColumn("movieId", col("movieId").cast(IntegerType()))
)


24/11/25 13:59:50 WARN Utils: Your hostname, MacBook-Air-Nasdorm.local resolves to a loopback address: 127.0.0.1; using 192.168.93.71 instead (on interface en0)
24/11/25 13:59:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/25 13:59:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Number of ratings per user; preview the five most active users.
user_activity = ratings.groupBy("userId").count()
user_activity.orderBy(col("count").desc()).show(5)

# Number of ratings per movie; preview the five most rated movies.
movie_popularity = ratings.groupBy("movieId").count()
movie_popularity.orderBy(col("count").desc()).show(5)

# Ids of users / movies that have at least 5 ratings. Both counts are taken
# from the unfiltered `ratings` frame.
active_users = user_activity.where(col("count") >= 5).select("userId")
popular_movies = movie_popularity.where(col("count") >= 5).select("movieId")

# Keep only the ratings whose user AND movie pass the threshold.
filtered_ratings = (
    ratings
    .join(active_users, on="userId", how="inner")
    .join(popular_movies, on="movieId", how="inner")
)

# Report how many rows survive the filter and show a small sample.
print(f"Количество записей после фильтрации: {filtered_ratings.count()}")
filtered_ratings.show(5)



+------+-----+
|userId|count|
+------+-----+
|   405|  737|
|   655|  685|
|    13|  636|
|   450|  540|
|   276|  518|
+------+-----+
only showing top 5 rows

+-------+-----+
|movieId|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
+-------+-----+
only showing top 5 rows

Количество записей после фильтрации: 99287
+-------+------+------+---------+
|movieId|userId|rating|timestamp|
+-------+------+------+---------+
|    242|   196|     3|881250949|
|    302|   186|     3|891717742|
|    377|    22|     1|878887116|
|     51|   244|     2|880606923|
|    346|   166|     1|886397596|
+-------+------+------+---------+
only showing top 5 rows



In [4]:
# Mean rating given by each user.
avg_ratings = (
    filtered_ratings
    .groupBy("userId")
    .avg("rating")
    .withColumnRenamed("avg(rating)", "avg_user_rating")
)

# Attach each user's mean to their ratings and add the mean-centred rating
# (rating minus that user's average) as `normalized_rating`.
normalized_ratings = (
    filtered_ratings
    .join(avg_ratings, on="userId", how="inner")
    .withColumn("normalized_rating", col("rating") - col("avg_user_rating"))
)


In [5]:
# One seed shared by the split and the model so a fresh kernel reproduces the
# same RMSE and recommendations.
SEED = 42

# Split data into training and testing sets (80% / 20%).
(train_data, test_data) = normalized_ratings.randomSplit([0.8, 0.2], seed=SEED)

# Build ALS model.
# The seed is set explicitly: without it PySpark falls back to a default
# derived from Python's hash(), which can differ between interpreter runs,
# so the fitted factors (and every number below) would not be reproducible.
# coldStartStrategy="drop" removes test rows that get NaN predictions so the
# RMSE below is not NaN.
# NOTE(review): the model trains on the raw "rating" column; the
# "normalized_rating" column present in normalized_ratings is not used here.
# Confirm that this is intended.
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=SEED,
)
model = als.fit(train_data)

# Evaluate the model: RMSE between the true "rating" and the ALS "prediction"
# on the held-out test set.
predictions = model.transform(test_data)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error: {rmse}")


24/11/25 13:59:58 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/11/25 13:59:58 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/11/25 13:59:58 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Root-mean-square error: 0.9147557331634427


In [6]:
# Build both recommendation frames from the fitted model:
#   user_recs  - the 10 highest-scored movies for every user
#   movie_recs - the 10 highest-scored users for every movie
user_recs = model.recommendForAllUsers(numItems=10)
movie_recs = model.recommendForAllItems(numUsers=10)

# Preview the first five rows of each, without truncating the
# recommendations column.
user_recs.show(5, truncate=False)
movie_recs.show(5, truncate=False)

                                                                                

+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                    |
+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1     |[{1449, 5.1250315}, {169, 4.7479334}, {100, 4.6667585}, {89, 4.6557856}, {408, 4.6466265}, {483, 4.6421537}, {1240, 4.635898}, {178, 4.6087627}, {50, 4.6071877}, {513, 4.6006184}]|
|3     |[{1143, 4.5634527}, {641, 4.5434794}, {902, 4.301408}, {320, 4.2987423}, {50, 4.274526}, {205, 4.1952877}, {187, 4.1595035}, {127, 4.135337}, {1142, 4.08389}, {172, 4.0836616}]   |
|5     |[{115, 4.4412465}, {434, 4.2793055}, {1143, 4.2



+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|recommendations                                                                                                                                                                |
+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1      |[{810, 5.379319}, {357, 5.0077295}, {849, 5.006954}, {507, 4.9135175}, {477, 4.852283}, {261, 4.8506227}, {850, 4.820545}, {688, 4.7923512}, {534, 4.760875}, {152, 4.754964}] |
|3      |[{688, 4.4121485}, {143, 4.410615}, {628, 4.339325}, {472, 4.309032}, {534, 4.2773814}, {677, 4.2552695}, {67, 4.2398486}, {507, 4.192024}, {367, 4.1742344}, {502, 4.160402}] |
|5      |[{688, 4.769601}, {849, 4.7122188}, {462, 4.6811543}, {939, 4

                                                                                

24/11/25 14:00:09 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
