In [1]:
// Evaluate the notebook-provided `spark` handle on its own; the captured output
// shows the kernel starting the session at this point.
spark

Waiting for a Spark session to start...

In [2]:
// in Scala
import org.apache.spark.ml.recommendation.ALS
// Parse the MovieLens sample file: each line is "user::movie::rating::timestamp".
// The line is split on "::" and every field is cast to its typed column.
val ratings = {
  val ratingsPath = "/user/kranthidr/dataSets/spark-guide/data/sample_movielens_ratings.txt"
  val typedColumns = Seq(
    "cast(col[0] as int) as userId",
    "cast(col[1] as int) as movieId",
    "cast(col[2] as float) as rating",
    "cast(col[3] as long) as timestamp")

  spark.read
    .textFile(ratingsPath)
    .selectExpr("split(value , '::') as col")
    .selectExpr(typedColumns: _*)
}

Waiting for a Spark session to start...

ratings = [userId: int, movieId: int ... 2 more fields]


[userId: int, movieId: int ... 2 more fields]

In [3]:
// Hold out roughly 20% of the ratings for evaluation.
// A fixed seed makes the split (and therefore every metric computed further
// down) repeatable between runs; without it randomSplit picks a new random
// seed each time the cell is executed.
val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2), 42L)

training = [userId: int, movieId: int ... 2 more fields]
test = [userId: int, movieId: int ... 2 more fields]


[userId: int, movieId: int ... 2 more fields]

In [4]:
// Configure the ALS estimator: first bind it to the ratings columns, then set
// the training hyper-parameters. Rows that would get a NaN prediction
// (users/items unseen at training time) are dropped at prediction time.
val als = new ALS()
  .setUserCol("userId")
  .setItemCol("movieId")
  .setRatingCol("rating")
  .setMaxIter(10)
  .setRegParam(0.01)
  .setColdStartStrategy("drop")

als = als_28cbfdbb90a3


als_28cbfdbb90a3

In [5]:
// Print the documentation of every ALS parameter together with its default
// and, where one was set above, its current value.
println(als.explainParams())

alpha: alpha for implicit preference (default: 1.0)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext (default: 10)
coldStartStrategy: strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: nan,drop. (default: nan, current: drop)
finalStorageLevel: StorageLevel for ALS model factors. (default: MEMORY_AND_DISK)
implicitPrefs: whether to use implicit preference (default: false)
intermediateStorageLevel: StorageLevel for intermediate datasets. Cannot be 'NONE'. (default: MEMORY_AND_DISK)
itemCol: column name for item ids. Ids must be within the integer value range. (default: item, current: movieId)
maxIter: maximum number of ite

In [6]:
// Fit the ALS estimator on the training split.
val alsModel = als.fit(training)
// Score the held-out split; the result carries the test columns plus the
// model's output column (read as "prediction" by the evaluation cells below).
val predictions = alsModel.transform(test)

alsModel = als_28cbfdbb90a3
predictions = [userId: int, movieId: int ... 3 more fields]


[userId: int, movieId: int ... 3 more fields]

In [7]:
// COMMAND ----------

// in Scala
// Ask the model for 10 recommendations per user and flatten the resulting
// array so that each (movie, score) recommendation gets its own row.
alsModel
  .recommendForAllUsers(10)
  .selectExpr("userId", "explode(recommendations)")
  .show()

+------+---------------+
|userId|            col|
+------+---------------+
|    28|[76, 6.5622416]|
|    28| [29, 5.886418]|
|    28| [51, 4.975867]|
|    28| [81, 4.947612]|
|    28| [24, 4.660326]|
|    28| [53, 4.616293]|
|    28|[72, 4.5752296]|
|    28| [4, 4.2370954]|
|    28|[49, 3.9188836]|
|    28|[55, 3.7959673]|
|    26| [46, 7.144871]|
|    26|[30, 6.4189463]|
|    26| [94, 6.096276]|
|    26| [74, 5.691221]|
|    26|[22, 5.2658143]|
|    26|[88, 5.2166443]|
|    26| [23, 5.041458]|
|    26| [32, 4.800029]|
|    26|[24, 4.7965646]|
|    26|  [7, 4.787397]|
+------+---------------+
only showing top 20 rows



In [8]:
// Ask the model for 10 recommended users per movie and flatten the resulting
// array so that each (user, score) recommendation gets its own row.
alsModel
  .recommendForAllItems(10)
  .selectExpr("movieId", "explode(recommendations)")
  .show()

+-------+---------------+
|movieId|            col|
+-------+---------------+
|     31|[12, 3.7455027]|
|     31|[15, 3.0927289]|
|     31|[14, 2.9269235]|
|     31| [6, 2.9064713]|
|     31| [7, 2.6342008]|
|     31|[25, 2.2124536]|
|     31|[16, 2.2068238]|
|     31| [8, 1.6802181]|
|     31|[27, 1.4390929]|
|     31| [2, 1.2938327]|
|     85|[16, 5.0699587]|
|     85| [8, 4.8067217]|
|     85|[14, 4.6430855]|
|     85|  [7, 4.333009]|
|     85|[17, 3.6258996]|
|     85| [1, 3.0400844]|
|     85| [6, 2.9789066]|
|     85|[19, 2.3474717]|
|     85|  [3, 2.076093]|
|     85|[20, 2.0634823]|
+-------+---------------+
only showing top 20 rows



In [9]:
// COMMAND ----------

// in Scala
import org.apache.spark.ml.evaluation.RegressionEvaluator
// RMSE between the observed "rating" and the model's "prediction" on the
// held-out predictions.
val evaluator = new RegressionEvaluator()
  .setLabelCol("rating")
  .setPredictionCol("prediction")
  .setMetricName("rmse")

val rmse = evaluator.evaluate(predictions)

println(s"Root-mean-square error = ${rmse}")

Root-mean-square error = 1.8194261925717494


evaluator = regEval_a6e4bc270eb1
rmse = 1.8194261925717494


1.8194261925717494

In [10]:
// COMMAND ----------

// in Scala
import org.apache.spark.mllib.evaluation.{
  RankingMetrics,
  RegressionMetrics}
// Build the (rating, prediction) pairs that RegressionMetrics is constructed
// from below. Both columns are read as floats and widened to Double.
val regComparison = predictions
  .select("rating", "prediction")
  .rdd
  .map { row =>
    val actual = row.getFloat(0).toDouble
    val predicted = row.getFloat(1).toDouble
    (actual, predicted)
  }

regComparison = MapPartitionsRDD[309] at map at <console>:42


MapPartitionsRDD[309] at map at <console>:42

In [11]:
// Wrap the (rating, prediction) pairs in the RDD-based regression metrics helper.
val metrics = new RegressionMetrics(regComparison)

metrics = org.apache.spark.mllib.evaluation.RegressionMetrics@7a93d619


org.apache.spark.mllib.evaluation.RegressionMetrics@7a93d619

In [12]:
// RMSE from RegressionMetrics; the captured outputs show it agreeing with the
// RegressionEvaluator value up to floating-point rounding.
metrics.rootMeanSquaredError

1.819426192571749

In [13]:
// Print every regression metric on its own line, in this fixed order:
// explained variance, MAE, MSE, R^2, RMSE.
Seq(
  metrics.explainedVariance,
  metrics.meanAbsoluteError,
  metrics.meanSquaredError,
  metrics.r2,
  metrics.rootMeanSquaredError
).foreach(println)

1.3804192696533988
1.347436827484674
3.310311670216131
-0.2792667010567924
1.819426192571749


In [14]:
// COMMAND ----------

// in Scala
import org.apache.spark.mllib.evaluation.{RankingMetrics, RegressionMetrics}
import org.apache.spark.sql.functions.{col, expr}
// Ground truth for ranking: per user, the set of held-out movies whose actual
// rating is above 2.5.
val perUserActual = predictions
  .filter("rating > 2.5")
  .groupBy("userId")
  .agg(
    expr("collect_set(movieId) as movies")
  )

perUserActual = [userId: int, movies: array<int>]


[userId: int, movies: array<int>]

In [15]:
// COMMAND ----------

// in Scala
// Per user, the held-out movies ranked by predicted rating, best first.
//
// The ranking is produced inside the aggregation: collect_list gives no
// ordering guarantee once groupBy has shuffled the rows, so sorting the
// DataFrame before grouping does not reliably yield a ranked list. Instead,
// (prediction, movieId) structs are collected and sorted in descending order
// (structs compare field by field, so prediction is the primary key), and the
// movieId field is then projected out of the sorted array.
val perUserPredictions = predictions
  .groupBy("userId")
  .agg(expr("sort_array(collect_list(struct(prediction, movieId)), false) as ranked"))
  .selectExpr("userId", "ranked.movieId as movies")

perUserPredictions = [userId: int, movies: array<int>]


[userId: int, movies: array<int>]

In [16]:
// COMMAND ----------

// in Scala
// Pair each user's relevant movies with that user's predicted ranking
// (truncated to the first 15 entries) and hand the pairs to RankingMetrics.
// After the join on "userId", column 1 holds the actual movies and column 2
// the predicted movies.
val perUserActualvPred = perUserActual
  .join(perUserPredictions, Seq("userId"))
  .map { row =>
    val relevant = row.getSeq[Integer](1).toArray
    val topPredicted = row.getSeq[Integer](2).toArray.take(15)
    (relevant, topPredicted)
  }
val ranks = new RankingMetrics(perUserActualvPred.rdd)

perUserActualvPred = [_1: array<int>, _2: array<int>]
ranks = org.apache.spark.mllib.evaluation.RankingMetrics@2a25c1d7


org.apache.spark.mllib.evaluation.RankingMetrics@2a25c1d7

In [17]:
// COMMAND ----------

// in Scala
// Mean average precision over all users present in the joined ranking data.
ranks.meanAveragePrecision

0.23934657934657927

In [18]:
// Precision at cut-off 5, i.e. computed over the first 5 predicted movies per user.
ranks.precisionAt(5)


// COMMAND ----------

0.4666666666666668