## Collaborative Filtering Recommender System with Alternating Least Squares (ALS)

In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

Each line of the input file `sample_movielens_ratings.txt` has the form `userId::movieId::rating::timestamp`, for example:

`0::2::3::1424380312`

In [2]:

lines = spark.read.text("sample_movielens_ratings.txt").rdd

parts = lines.map(lambda row: row.value.split("::"))

ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
                                     
ratings = spark.createDataFrame(ratingsRDD)

ratings.show()

+-------+------+----------+------+
|movieId|rating| timestamp|userId|
+-------+------+----------+------+
|      2|   3.0|1424380312|     0|
|      3|   1.0|1424380312|     0|
|      5|   2.0|1424380312|     0|
|      9|   4.0|1424380312|     0|
|     11|   1.0|1424380312|     0|
|     12|   2.0|1424380312|     0|
|     15|   1.0|1424380312|     0|
|     17|   1.0|1424380312|     0|
|     19|   1.0|1424380312|     0|
|     21|   1.0|1424380312|     0|
|     23|   1.0|1424380312|     0|
|     26|   3.0|1424380312|     0|
|     27|   1.0|1424380312|     0|
|     28|   1.0|1424380312|     0|
|     29|   1.0|1424380312|     0|
|     30|   1.0|1424380312|     0|
|     31|   1.0|1424380312|     0|
|     34|   1.0|1424380312|     0|
|     37|   1.0|1424380312|     0|
|     41|   2.0|1424380312|     0|
+-------+------+----------+------+
only showing top 20 rows



In [3]:
# Randomly split the ratings into roughly 80% training / 20% test data.
# The fixed seed keeps the split stable across re-runs on the same input,
# so the evaluation below does not change just because the split did.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [4]:
# Build the recommendation model using ALS on the training data.
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics.
# An explicit seed pins the random initialisation so that re-fitting on the
# same training data is repeatable.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)

# Fit the ALS estimator on the training split to obtain the fitted model.
model = als.fit(training)

In [5]:
# Score the held-out test ratings with the fitted model; the transform
# output is evaluated below via its "prediction" column.
predictions = model.transform(test)

# Compare the true "rating" column against "prediction" using RMSE.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)

print("Root-mean-square error = %s" % rmse)

Root-mean-square error = 1.5406795490167329


In [6]:
# How many recommendations to request per user and per movie.
NUM_RECOMMENDATIONS = 10

# Top movie recommendations for every user.
userRecs = model.recommendForAllUsers(NUM_RECOMMENDATIONS)

# Top user recommendations for every movie.
movieRecs = model.recommendForAllItems(NUM_RECOMMENDATIONS)