In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

spark = SparkSession.builder \
    .appName("MovieLens ALS") \
    .getOrCreate()

schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)
])

ratings = spark.read.csv(
    "ml-100k/u.data",
    sep="\t",
    schema=schema
)

ratings.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows



In [2]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np

train, test = ratings.randomSplit([0.8, 0.2], seed=42)

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=8,
    maxIter=10,
    regParam=0.1,
    coldStartStrategy="drop"
)

model = als.fit(train)

predictions = model.transform(test)

In [3]:
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)
print("RMSE =", rmse)

RMSE = 0.9188151803239948


In [4]:
predictions.select("userId", "movieId", "rating", "prediction").show(10)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   148|      8|     4|  4.041426|
|   148|     56|     5| 3.6975405|
|   148|     71|     5| 3.0938647|
|   148|    133|     5|  2.968647|
|   148|    169|     5| 4.9386854|
|   148|    172|     5|  4.345913|
|   148|    194|     5| 4.0948253|
|   148|    222|     4| 3.7425344|
|   148|    357|     5|  3.476279|
|   148|    432|     5| 3.6443253|
+------+-------+------+----------+
only showing top 10 rows



In [5]:
user_recs = model.recommendForAllUsers(5)
user_recs.show(5, truncate=False)

+------+---------------------------------------------------------------------------------------------+
|userId|recommendations                                                                              |
+------+---------------------------------------------------------------------------------------------+
|1     |[{1449, 4.9850845}, {408, 4.9462695}, {119, 4.897621}, {12, 4.874115}, {919, 4.854416}]      |
|2     |[{1643, 5.633188}, {1398, 4.844986}, {119, 4.808659}, {483, 4.7604294}, {318, 4.7435656}]    |
|3     |[{1607, 4.313891}, {1367, 4.1896677}, {1612, 4.021066}, {1005, 4.0160885}, {1169, 3.9593577}]|
|4     |[{1449, 6.1030593}, {1585, 5.9335213}, {320, 5.8011727}, {1558, 5.722536}, {1193, 5.7190585}]|
|5     |[{50, 4.35868}, {173, 4.3089123}, {408, 4.3013625}, {169, 4.2612476}, {613, 4.2505383}]      |
+------+---------------------------------------------------------------------------------------------+
only showing top 5 rows



In [9]:
from surprise import Dataset, SVD, NMF, KNNBasic
from surprise.model_selection import cross_validate

data = Dataset.load_builtin("ml-100k")

algorithms = {
    "SVD": SVD(),
    "NMF": NMF(),
    "kNN": KNNBasic()
}

for name, algo in algorithms.items():
    print(f"\n{name}")
    cross_validate(
        algo,
        data,
        measures=["RMSE", "MAE"],
        cv=5,
        verbose=True
    )

Dataset ml-100k could not be found. Do you want to download it? [Y/n] 

 y


Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to C:\Users\Milos/.surprise_data/ml-100k

SVD
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9403  0.9360  0.9269  0.9353  0.9365  0.9350  0.0044  
MAE (testset)     0.7428  0.7359  0.7289  0.7376  0.7385  0.7367  0.0045  
Fit time          0.97    0.85    0.97    0.86    0.96    0.92    0.06    
Test time         0.18    0.09    0.16    0.10    0.11    0.13    0.04    

NMF
Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9582  0.9650  0.9699  0.9558  0.9635  0.9625  0.0050  
MAE (testset)     0.7553  0.7555  0.7604  0.7512  0.7589  0.7563  0.0032  
Fit time          1.60    1.57    1.59    1.54    1.52    1.56    0.03    
Test time         0.14    0.08    0