In [1]:
#LINK TO AMAZON DATASET: https://nijianmo.github.io/amazon/index.html#sample-metadata

In [2]:
import numpy as np
import pandas as pd
from random import randint

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, DoubleType, IntegerType
from pyspark.sql import functions as F

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator



spark = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [4]:
df = spark.read.csv('/home/luca/Downloads/ratings_Movies_and_TV.csv')

In [5]:
df.rdd.id()

14

In [6]:
df.schema

StructType(List(StructField(_c0,StringType,true),StructField(_c1,StringType,true),StructField(_c2,StringType,true),StructField(_c3,StringType,true)))

In [7]:
df.tail(10)

[Row(_c0='A17W587EH23J0Q', _c1='B00LT1JHLW', _c2='5.0', _c3='1405641600'),
 Row(_c0='A3E4Q2YOYCKXON', _c1='B00LT1JHLW', _c2='5.0', _c3='1405987200'),
 Row(_c0='A1U1UNV1RLCKRL', _c1='B00LT1JHLW', _c2='3.0', _c3='1406073600'),
 Row(_c0='A14THKG1X8861X', _c1='B00LT1JHLW', _c2='5.0', _c3='1405555200'),
 Row(_c0='A3DE438TF1A958', _c1='B00LT1JHLW', _c2='5.0', _c3='1405728000'),
 Row(_c0='AHCV1RTGY3PJ8', _c1='B00LT1JHLW', _c2='5.0', _c3='1405641600'),
 Row(_c0='A2RWCXDMANY0LW', _c1='B00LT1JHLW', _c2='5.0', _c3='1405987200'),
 Row(_c0='A3V9PIFRME2XCW', _c1='B00LT1JHLW', _c2='5.0', _c3='1405900800'),
 Row(_c0='A3ROPC55BE2OM9', _c1='B00LT1JHLW', _c2='5.0', _c3='1405728000'),
 Row(_c0='A2ARBNMH5Q5YM1', _c1='B00LVGP8EA', _c2='5.0', _c3='1405641600')]

In [8]:
#Changing column names - https://stackoverflow.com/questions/34077353/how-to-change-dataframe-column-names-in-pyspark

df = df.selectExpr("_c0 as ReviewerID", "_c1 as ProductID", "_c2 as Rating", "_c3 as unixReviewTime")

In [9]:
#Items of interest are ReviewerID and ProductID

df.show()

+--------------+----------+------+--------------+
|    ReviewerID| ProductID|Rating|unixReviewTime|
+--------------+----------+------+--------------+
|A3R5OBKS7OM2IR|0000143502|   5.0|    1358380800|
|A3R5OBKS7OM2IR|0000143529|   5.0|    1380672000|
| AH3QC2PC1VTGP|0000143561|   2.0|    1216252800|
|A3LKP6WPMP9UKX|0000143588|   5.0|    1236902400|
| AVIY68KEPQ5ZD|0000143588|   5.0|    1232236800|
|A1CV1WROP5KTTW|0000589012|   5.0|    1309651200|
| AP57WZ2X4G0AA|0000589012|   2.0|    1366675200|
|A3NMBJ2LCRCATT|0000589012|   5.0|    1393804800|
| A5Y15SAOMX6XA|0000589012|   2.0|    1307404800|
|A3P671HJ32TCSF|0000589012|   5.0|    1393718400|
|A3VCKTRD24BG7K|0000589012|   5.0|    1378425600|
| ANF0AGIV0JCH2|0000589012|   5.0|    1308182400|
|A3LDEBLV6MVUBE|0000589012|   5.0|    1208995200|
|A1R2XZWQ6NM5M1|0000589012|   5.0|    1224979200|
|A36L1XGA5AQIJY|0000589012|   1.0|    1393113600|
|A2HWI21H23GDS4|0000589012|   4.0|    1338681600|
|A1DNYFL3RSXRMO|0000589012|   5.0|    1208908800|


In [10]:
df.dtypes

[('ReviewerID', 'string'),
 ('ProductID', 'string'),
 ('Rating', 'string'),
 ('unixReviewTime', 'string')]

In [11]:
df.count()

4607047

In [12]:
# Choose amount of rows for analysis

df = df.take(400000)

In [13]:
df = spark.createDataFrame(df)

In [14]:
df = df.withColumn("ReviewerID", df["ReviewerID"].cast(IntegerType()))
df = df.withColumn("ProductID", df["ProductID"].cast(IntegerType()))
df = df.withColumn("Rating", df["Rating"].cast(IntegerType()))
df = df.withColumn("unixReviewTime", df["unixReviewTime"].cast(IntegerType()))

In [15]:
# Taken from: https://stackoverflow.com/questions/44153575/fill-na-with-random-numbers-in-pyspark
new_df = df.withColumn('ReviewerID', F.coalesce(F.col('ReviewerID'), (F.round(F.rand()*836006)))).collect()

In [16]:
#Items of interest are ReviewerID and ProductID

new_df = spark.createDataFrame(new_df)
new_df.show()

+----------+---------+------+--------------+
|ReviewerID|ProductID|Rating|unixReviewTime|
+----------+---------+------+--------------+
|   69512.0|   143502|     5|    1358380800|
|  770294.0|   143529|     5|    1380672000|
|   21790.0|   143561|     2|    1216252800|
|  386490.0|   143588|     5|    1236902400|
|  236149.0|   143588|     5|    1232236800|
|  827579.0|   589012|     5|    1309651200|
|  784769.0|   589012|     2|    1366675200|
|  330173.0|   589012|     5|    1393804800|
|  368866.0|   589012|     2|    1307404800|
|  575449.0|   589012|     5|    1393718400|
|  262829.0|   589012|     5|    1378425600|
|  633992.0|   589012|     5|    1308182400|
|  165972.0|   589012|     5|    1208995200|
|  396968.0|   589012|     5|    1224979200|
|   81001.0|   589012|     1|    1393113600|
|  509994.0|   589012|     4|    1338681600|
|  760026.0|   589012|     5|    1208908800|
|  354532.0|   589012|     1|    1218412800|
|  494464.0|   589012|     5|    1322956800|
|  648877.

In [17]:
del(df)

In [18]:
new_df.dtypes

[('ReviewerID', 'double'),
 ('ProductID', 'bigint'),
 ('Rating', 'bigint'),
 ('unixReviewTime', 'bigint')]

In [19]:
new_df = new_df.withColumn("ReviewerID", new_df["ReviewerID"].cast(IntegerType()))
new_df = new_df.withColumn("ProductID", new_df["ProductID"].cast(IntegerType()))
new_df = new_df.withColumn("Rating", new_df["Rating"].cast(DoubleType()))
new_df = new_df.withColumn("unixReviewTime", new_df["unixReviewTime"].cast(IntegerType()))

In [20]:
new_df.dtypes

[('ReviewerID', 'int'),
 ('ProductID', 'int'),
 ('Rating', 'double'),
 ('unixReviewTime', 'int')]

In [21]:
new_df.show()

+----------+---------+------+--------------+
|ReviewerID|ProductID|Rating|unixReviewTime|
+----------+---------+------+--------------+
|     69512|   143502|   5.0|    1358380800|
|    770294|   143529|   5.0|    1380672000|
|     21790|   143561|   2.0|    1216252800|
|    386490|   143588|   5.0|    1236902400|
|    236149|   143588|   5.0|    1232236800|
|    827579|   589012|   5.0|    1309651200|
|    784769|   589012|   2.0|    1366675200|
|    330173|   589012|   5.0|    1393804800|
|    368866|   589012|   2.0|    1307404800|
|    575449|   589012|   5.0|    1393718400|
|    262829|   589012|   5.0|    1378425600|
|    633992|   589012|   5.0|    1308182400|
|    165972|   589012|   5.0|    1208995200|
|    396968|   589012|   5.0|    1224979200|
|     81001|   589012|   1.0|    1393113600|
|    509994|   589012|   4.0|    1338681600|
|    760026|   589012|   5.0|    1208908800|
|    354532|   589012|   1.0|    1218412800|
|    494464|   589012|   5.0|    1322956800|
|    64887

In [22]:
#MUST FIT INTEGER RANGE: -2147483648 to 2147483647 - https://spark.apache.org/docs/latest/sql-ref-datatypes.html

new_df.count()

400000

In [23]:
(training, test) = new_df.randomSplit([0.8, 0.2])

In [24]:
als = ALS(userCol="ReviewerID", itemCol="ProductID", ratingCol="Rating",
          coldStartStrategy="nan")
model = als.fit(training)

Py4JJavaError: An error occurred while calling o112.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 12.0 failed 1 times, most recent failure: Lost task 1.0 in stage 12.0 (TID 34) (192.168.0.25 executor driver): org.apache.spark.SparkException: Failed to execute user defined function(ALSModelParams$$Lambda$3152/0x0000000841215840: (int) => int)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$ConcatIterator.hasNext(Iterator.scala:222)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:132)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.IllegalArgumentException: ALS only supports values in Integer range for columns ReviewerID and ProductID. Value null was not numeric.
	at org.apache.spark.ml.recommendation.ALSModelParams.$anonfun$checkedCast$1(ALS.scala:104)
	at org.apache.spark.ml.recommendation.ALSModelParams.$anonfun$checkedCast$1$adapted(ALS.scala:89)
	... 19 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1253)
	at org.apache.spark.ml.recommendation.ALS$.train(ALS.scala:960)
	at org.apache.spark.ml.recommendation.ALS.$anonfun$fit$1(ALS.scala:709)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.recommendation.ALS.fit(ALS.scala:691)
	at org.apache.spark.ml.recommendation.ALS.fit(ALS.scala:593)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(ALSModelParams$$Lambda$3152/0x0000000841215840: (int) => int)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$ConcatIterator.hasNext(Iterator.scala:222)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:132)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: java.lang.IllegalArgumentException: ALS only supports values in Integer range for columns ReviewerID and ProductID. Value null was not numeric.
	at org.apache.spark.ml.recommendation.ALSModelParams.$anonfun$checkedCast$1(ALS.scala:104)
	at org.apache.spark.ml.recommendation.ALSModelParams.$anonfun$checkedCast$1$adapted(ALS.scala:89)
	... 19 more


In [25]:
"""ERROR: java.lang.IllegalArgumentException: 
ALS only supports values in Integer range for columns ReviewerID and ProductID.
Value null was not numeric."""

training.filter("ReviewerID is NULL").show()
training.filter("ProductID is NULL").show()

+----------+---------+------+--------------+
|ReviewerID|ProductID|Rating|unixReviewTime|
+----------+---------+------+--------------+
+----------+---------+------+--------------+

+----------+---------+------+--------------+
|ReviewerID|ProductID|Rating|unixReviewTime|
+----------+---------+------+--------------+
|       179|     null|   4.0|    1386892800|
|      1174|     null|   4.0|    1314835200|
|      1739|     null|   5.0|    1315267200|
|      2155|     null|   5.0|    1361145600|
|      2250|     null|   2.0|    1393977600|
|      2941|     null|   4.0|    1189728000|
|      3162|     null|   5.0|    1102032000|
|      3364|     null|   2.0|    1175040000|
|      3666|     null|   5.0|    1018483200|
|      3813|     null|   5.0|    1118448000|
|      4514|     null|   5.0|    1256774400|
|      4575|     null|   4.0|    1304812800|
|      4931|     null|   4.0|    1370217600|
|      6017|     null|   5.0|    1396137600|
|      6178|     null|   5.0|    1284336000|
|      63

In [26]:
training = training.dropna()
training.filter("ReviewerID is NULL").show()
training.filter("ProductID is NULL").show()

+----------+---------+------+--------------+
|ReviewerID|ProductID|Rating|unixReviewTime|
+----------+---------+------+--------------+
+----------+---------+------+--------------+

+----------+---------+------+--------------+
|ReviewerID|ProductID|Rating|unixReviewTime|
+----------+---------+------+--------------+
+----------+---------+------+--------------+



In [27]:
als = ALS(userCol="ReviewerID", itemCol="ProductID", ratingCol="Rating",
          coldStartStrategy="drop")

In [28]:
param_grid = ParamGridBuilder()\
.addGrid(als.rank, [12, 13, 14])\
.addGrid(als.maxIter, [18, 19, 20])\
.addGrid(als.regParam, [.17, .18, .19])\
.build()

In [29]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Rating", 
                                predictionCol="prediction")

In [30]:
tvs = TrainValidationSplit(estimator=als, estimatorParamMaps=param_grid,
                           evaluator=evaluator)

In [31]:
model = tvs.fit(training)

In [32]:
best_model = model.bestModel

In [33]:
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)

Py4JJavaError: An error occurred while calling o232.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 3882.0 failed 1 times, most recent failure: Lost task 3.0 in stage 3882.0 (TID 24502) (192.168.0.25 executor driver): org.apache.spark.SparkException: Failed to execute user defined function(ALSModelParams$$Lambda$3233/0x00000008412c3840: (int) => int)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$6(ShuffleExchangeExec.scala:291)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$6$adapted(ShuffleExchangeExec.scala:291)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$14(ShuffleExchangeExec.scala:360)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:156)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.IllegalArgumentException: ALS only supports values in Integer range for columns ReviewerID and ProductID. Value null was not numeric.
	at org.apache.spark.ml.recommendation.ALSModelParams.$anonfun$checkedCast$1(ALS.scala:104)
	at org.apache.spark.ml.recommendation.ALSModelParams.$anonfun$checkedCast$1$adapted(ALS.scala:89)
	... 16 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2291)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1183)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1177)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1222)
	at org.apache.spark.mllib.stat.Statistics$.colStats(Statistics.scala:58)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary$lzycompute(RegressionMetrics.scala:70)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.summary(RegressionMetrics.scala:62)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr$lzycompute(RegressionMetrics.scala:74)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.SSerr(RegressionMetrics.scala:74)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.meanSquaredError(RegressionMetrics.scala:106)
	at org.apache.spark.mllib.evaluation.RegressionMetrics.rootMeanSquaredError(RegressionMetrics.scala:115)
	at org.apache.spark.ml.evaluation.RegressionEvaluator.evaluate(RegressionEvaluator.scala:101)
	at jdk.internal.reflect.GeneratedMethodAccessor176.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(ALSModelParams$$Lambda$3233/0x00000008412c3840: (int) => int)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$6(ShuffleExchangeExec.scala:291)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$6$adapted(ShuffleExchangeExec.scala:291)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$14(ShuffleExchangeExec.scala:360)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:156)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: java.lang.IllegalArgumentException: ALS only supports values in Integer range for columns ReviewerID and ProductID. Value null was not numeric.
	at org.apache.spark.ml.recommendation.ALSModelParams.$anonfun$checkedCast$1(ALS.scala:104)
	at org.apache.spark.ml.recommendation.ALSModelParams.$anonfun$checkedCast$1$adapted(ALS.scala:89)
	... 16 more


In [34]:
test.filter("ReviewerID is NULL").show()
test.filter("ProductID is NULL").show()

+----------+---------+------+--------------+
|ReviewerID|ProductID|Rating|unixReviewTime|
+----------+---------+------+--------------+
+----------+---------+------+--------------+

+----------+---------+------+--------------+
|ReviewerID|ProductID|Rating|unixReviewTime|
+----------+---------+------+--------------+
|      1026|     null|   4.0|    1218240000|
|      2689|     null|   3.0|    1127952000|
|      3720|     null|   3.0|    1252368000|
|      5817|     null|   5.0|    1235174400|
|      6771|     null|   5.0|    1195862400|
|      7137|     null|   5.0|    1367798400|
|      8482|     null|   5.0|     958262400|
|      9863|     null|   5.0|    1221782400|
|     11563|     null|   2.0|    1197763200|
|     11564|     null|   5.0|    1372809600|
|     13535|     null|   5.0|    1116979200|
|     14703|     null|   4.0|     951177600|
|     14756|     null|   1.0|    1187481600|
|     15079|     null|   5.0|    1378598400|
|     15178|     null|   5.0|    1122854400|
|     162

In [35]:
test = test.dropna()
test.filter("ReviewerID is NULL").show()
test.filter("ProductID is NULL").show()

+----------+---------+------+--------------+
|ReviewerID|ProductID|Rating|unixReviewTime|
+----------+---------+------+--------------+
+----------+---------+------+--------------+

+----------+---------+------+--------------+
|ReviewerID|ProductID|Rating|unixReviewTime|
+----------+---------+------+--------------+
+----------+---------+------+--------------+



In [36]:
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)

In [37]:
print("RMSE = " + str(rmse))
print("**Best Model**")
print(" Rank:", best_model.rank)
print(" MaxIter:", best_model._java_obj.parent().getMaxIter())
print(" RegParam:", best_model._java_obj.parent().getRegParam())

RMSE = 2.1323883053001094
**Best Model**
 Rank: 14
 MaxIter: 20
 RegParam: 0.19


In [38]:
display(predictions.sort("ReviewerID", "Rating").show(n=40))

+----------+----------+------+--------------+----------+
|ReviewerID| ProductID|Rating|unixReviewTime|prediction|
+----------+----------+------+--------------+----------+
|        46| 783222955|   5.0|    1355356800| 2.5241246|
|       116| 793906091|   1.0|    1091404800| 2.4630482|
|       139| 578002019|   5.0|    1253404800|  4.234728|
|       184| 792836685|   4.0|    1281139200| 1.8477458|
|       245| 790744309|   5.0|    1139356800| 0.5047499|
|       292| 800102150|   5.0|    1369008000|  4.800295|
|       319| 938045245|   1.0|     967939200|-0.7261893|
|       379| 783225482|   5.0|    1376611200|  3.777586|
|       399| 792175220|   5.0|    1036454400|  2.900791|
|       463| 792159659|   1.0|     950140800| 1.7115067|
|       484| 790742403|   5.0|    1224979200|  2.525773|
|       517| 800141660|   5.0|    1383004800| 4.1422935|
|       670| 790743507|   5.0|    1356480000| 2.3599243|
|       873| 790701022|   4.0|    1233964800|  2.727374|
|       907| 800128052|   5.0| 

None