In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, exp, lit
from pyspark.sql.types import IntegerType


In [2]:
# Create (or reuse) the Spark session that backs this notebook.
spark = SparkSession.builder.appName("MovieLensALS").getOrCreate()

# MovieLens 100k ratings file: tab-separated and read without a header row,
# so the column names are assigned explicitly through toDF().
data_path = '../data/ml-100k/u.data'
columns = ["userId", "movieId", "rating", "timestamp"]
ratings = (
    spark.read.csv(data_path, sep="\t", inferSchema=True)
    .toDF(*columns)
    # Force both id columns to IntegerType regardless of what inference chose.
    .withColumn("userId", col("userId").cast(IntegerType()))
    .withColumn("movieId", col("movieId").cast(IntegerType()))
)

# Recency weight: exp(-(newest_timestamp - timestamp) / 10**6), so the most
# recent rating gets weight 1 and older ratings decay towards 0.
# NOTE(review): timestamps are presumably Unix seconds, which would make the
# 10**6 scale roughly 11.6 days -- confirm against the dataset README.
max_timestamp = ratings.agg({'timestamp': 'max'}).first()[0]
ratings = ratings.withColumn(
    'weight',
    exp(-(lit(max_timestamp) - col('timestamp')) / lit(10**6)),
)

24/11/25 17:27:39 WARN Utils: Your hostname, MacBook-Air-Nasdorm.local resolves to a loopback address: 127.0.0.1; using 192.168.93.71 instead (on interface en0)
24/11/25 17:27:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/25 17:27:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Number of ratings per user and per movie (lazy; evaluated by the calls below).
user_activity = ratings.groupBy("userId").count()
movie_popularity = ratings.groupBy("movieId").count()

# Quick look at the five most active users, then the five most rated movies.
user_activity.orderBy(col("count").desc()).show(5)
movie_popularity.orderBy(col("count").desc()).show(5)

# Ids that have at least 5 ratings in the unfiltered data.
active_users = user_activity.where(col("count") >= 5).select("userId")
popular_movies = movie_popularity.where(col("count") >= 5).select("movieId")

# Keep only ratings whose user AND movie pass the threshold. The joins are
# applied in this order so that movieId ends up as the leading column.
filtered_ratings = (
    ratings
    .join(active_users, on="userId", how="inner")
    .join(popular_movies, on="movieId", how="inner")
)

# The message below reads "Number of records after filtering".
print(f"Количество записей после фильтрации: {filtered_ratings.count()}")
filtered_ratings.show(5)



+------+-----+
|userId|count|
+------+-----+
|   405|  737|
|   655|  685|
|    13|  636|
|   450|  540|
|   276|  518|
+------+-----+
only showing top 5 rows

+-------+-----+
|movieId|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
+-------+-----+
only showing top 5 rows

Количество записей после фильтрации: 99287
+-------+------+------+---------+--------------------+
|movieId|userId|rating|timestamp|              weight|
+-------+------+------+---------+--------------------+
|    242|   196|     3|881250949|5.928798377342577E-6|
|    302|   186|     3|891717742| 0.20827499106941064|
|    377|    22|     1|878887116|5.576568655535087E-7|
|     51|   244|     2|880606923|3.113649646606838E-6|
|    346|   166|     1|886397596|0.001018889469706...|
+-------+------+------+---------+--------------------+
only showing top 5 rows



In [4]:
# Mean rating of every user, computed over all filtered ratings.
avg_ratings = (
    filtered_ratings
    .groupBy("userId")
    .agg({"rating": "avg"})
    .withColumnRenamed("avg(rating)", "avg_user_rating")
)

# Attach each user's mean to their ratings and store the mean-centred value
# (rating minus the user's average) in "normalized_rating".
normalized_ratings = (
    filtered_ratings
    .join(avg_ratings, on="userId")
    .withColumn("normalized_rating", col("rating") - col("avg_user_rating"))
)


In [5]:
# One seed shared by the train/test split and by ALS. ALS starts from randomly
# initialised factors, so leaving its seed unset means the fitted model (and
# the RMSE printed below) is not pinned across kernel restarts.
SEED = 42

# Split data into training (80%) and testing (20%) sets
(train_data, test_data) = normalized_ratings.randomSplit([0.8, 0.2], seed=SEED)

# Build ALS model
# NOTE(review): the model is fit on the raw "rating" column; the
# "normalized_rating" and "weight" columns carried in the frame are not passed
# to ALS -- confirm that this is intended.
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # Drop rows whose prediction is NaN (user or movie absent from the
    # training split) so they do not turn the RMSE into NaN.
    coldStartStrategy="drop",
    seed=SEED,
)
model = als.fit(train_data)

# Evaluate the model: RMSE between the predicted and the actual rating on the
# held-out split
predictions = model.transform(test_data)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error: {rmse}")


24/11/25 17:27:47 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/11/25 17:27:47 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/11/25 17:27:47 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Root-mean-square error: 0.9170891528602931


In [6]:
# Build both recommendation frames from the fitted model:
#   user_recs  -- the 10 highest-scoring movies for each user
#   movie_recs -- the 10 highest-scoring users for each movie
user_recs = model.recommendForAllUsers(numItems=10)
movie_recs = model.recommendForAllItems(numUsers=10)

# Print the first five rows of each, without truncating the nested lists.
user_recs.show(n=5, truncate=False)
movie_recs.show(n=5, truncate=False)

                                                                                

+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                   |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1     |[{1449, 5.018088}, {408, 4.993631}, {169, 4.863122}, {114, 4.8179073}, {1344, 4.7656116}, {50, 4.7370343}, {513, 4.734218}, {285, 4.7136936}, {127, 4.652685}, {1251, 4.644831}]  |
|3     |[{1143, 4.684608}, {902, 4.644995}, {320, 4.6175647}, {50, 4.3271046}, {42, 4.2752905}, {172, 4.232882}, {173, 4.2081876}, {641, 4.1955733}, {1063, 4.182619}, {195, 4.139439}]   |
|5     |[{1154, 4.645989}, {745, 4.3309865}, {169, 4.3143377



+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|recommendations                                                                                                                                                                 |
+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1      |[{849, 5.0578346}, {688, 4.93358}, {507, 4.92387}, {810, 4.881707}, {477, 4.878927}, {152, 4.848731}, {261, 4.8326106}, {324, 4.832352}, {850, 4.798274}, {532, 4.796868}]      |
|3      |[{472, 4.6898985}, {434, 4.678906}, {67, 4.668692}, {127, 4.394253}, {367, 4.3927336}, {113, 4.322141}, {550, 4.3025637}, {628, 4.2602863}, {96, 4.221845}, {39, 4.219396}]     |
|5      |[{688, 4.9568906}, {849, 4.6521316}, {507, 4.612246}, {9

                                                                                

24/11/25 17:27:51 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
