# Import packages

In [None]:
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.types.IntegerType

Intitializing Scala interpreter ...

# Create Class Rating

In [2]:
case class Rating(userId: Int, movieId: Int, rating: Float, timestamp: Long)
def parseRating(str: String): Rating = {
  val fields = str.split("::")
  assert(fields.size == 4)
  Rating(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
}

defined class Rating
parseRating: (str: String)Rating


#  Reading of ratings.cvs and transformed string to int


In [3]:
val df = spark.read.option("header",true)
    .csv("ratings.csv")
  
val ratin = df.withColumn("userId",col("userId").cast(IntegerType))
val rating = ratin.withColumn("movieId",col("movieId").cast(IntegerType))
val ratings = rating.withColumn("rating",col("rating").cast(IntegerType))

df: org.apache.spark.sql.DataFrame = [userId: string, movieId: string ... 2 more fields]
ratin: org.apache.spark.sql.DataFrame = [userId: int, movieId: string ... 2 more fields]
rating: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 2 more fields]
ratings: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 2 more fields]


# Data training 

In [4]:
val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))


training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 2 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: int, movieId: int ... 2 more fields]


# Construction of models

In [5]:
val als = new ALS()
  .setMaxIter(5)
  .setRegParam(0.01)
  .setImplicitPrefs(true)
  .setUserCol("userId")
  .setItemCol("movieId")
  .setRatingCol("rating")

als: org.apache.spark.ml.recommendation.ALS = als_51e279af3b04


In [6]:
val model = als.fit(training)

model: org.apache.spark.ml.recommendation.ALSModel = ALSModel: uid=als_51e279af3b04, rank=10


In [7]:
model.setColdStartStrategy("drop")
val predictions = model.transform(test)

val evaluator = new RegressionEvaluator()
  .setMetricName("rmse")
  .setLabelCol("rating")
  .setPredictionCol("prediction")
val rmse = evaluator.evaluate(predictions)
println(s"Root-mean-square error = $rmse")

// Generate top 10 movie recommendations for each user
val userRecs = model.recommendForAllUsers(10)
// Generate top 10 user recommendations for each movie
val movieRecs = model.recommendForAllItems(10)

userRecs.show(4)

Root-mean-square error = 3.1024561929844605
+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[{318, 0.55660677...|
|   463|[{2571, 0.4156339...|
|   496|[{2571, 0.3543136...|
|   148|[{68954, 0.567288...|
+------+--------------------+
only showing top 4 rows



predictions: org.apache.spark.sql.DataFrame = [userId: int, movieId: int ... 3 more fields]
evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = RegressionEvaluator: uid=regEval_7f9c970b8a72, metricName=rmse, throughOrigin=false
rmse: Double = 3.1024561929844605
userRecs: org.apache.spark.sql.DataFrame = [userId: int, recommendations: array<struct<movieId:int,rating:float>>]
movieRecs: org.apache.spark.sql.DataFrame = [movieId: int, recommendations: array<struct<userId:int,rating:float>>]


# UserRecs: Recommandation of movies to every user
# movieRecs: Recommandation of users to every movie

In [8]:
userRecs.show(10)
movieRecs.show(10)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[{318, 0.55660677...|
|   463|[{2571, 0.4156339...|
|   496|[{2571, 0.3543136...|
|   148|[{68954, 0.567288...|
|   540|[{2571, 0.5999577...|
|   392|[{2571, 0.2923303...|
|   243|[{165, 0.7191207}...|
|    31|[{356, 0.67481244...|
|   516|[{2571, 0.2730559...|
|   580|[{2571, 1.4054843...|
+------+--------------------+
only showing top 10 rows

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[{590, 1.2592335}...|
|   4900|[{111, 0.45070204...|
|   5300|[{387, 0.18794394...|
|   6620|[{474, 1.0671625}...|
|   7340|[{474, 0.85126686...|
|  32460|[{105, 0.5851133}...|
|  54190|[{89, 0.7411296},...|
|    471|[{474, 1.3261654}...|
|   1591|[{274, 1.0213125}...|
| 140541|[{89, 0.42056593}...|
+-------+--------------------+
only showing top 10 rows

