In [21]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS # Alternating least squared
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
spark = SparkSession.builder.appName("app").getOrCreate()

In [5]:
df = spark.read.csv("./Python-and-Spark-for-Big-Data-master/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Recommender_Systems/movielens_ratings.csv",
              inferSchema=True, header=True)

In [6]:
df.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [7]:
df.count()

1501

In [8]:
df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [12]:
df.columns

['movieId', 'rating', 'userId']

In [18]:
df.select([pyspark.sql.functions.sum(df[col].isNull().cast("integer")).alias(col) for col in df.columns]).show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      0|     0|     0|
+-------+------+------+



In [26]:
train_df, test_df = df.randomSplit([0.8, 0.2], seed=1)

In [22]:
als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating')

In [24]:
model = als.fit(train_df)

In [27]:
prediction = model.transform(test_df)

In [28]:
prediction.show()

+-------+------+------+------------+
|movieId|rating|userId|  prediction|
+-------+------+------+------------+
|      0|   3.0|    28|   1.2480105|
|      0|   1.0|    27|  0.83925146|
|      5|   2.0|    22|    1.818285|
|      2|   1.0|    16|  0.68108505|
|      2|   3.0|     6|    1.642898|
|      2|   1.0|     3|    2.638785|
|      2|   1.0|    17|   3.3182456|
|      3|   1.0|     9|   2.2142448|
|      1|   1.0|     4|   1.6648022|
|      2|   4.0|     8|    2.037578|
|      3|   2.0|     8|  0.07634701|
|      7|   1.0|     8|   1.1374704|
|      0|   3.0|    10|   1.0688026|
|      4|   3.0|    10|   1.6321839|
|      5|   1.0|    29|-0.096546054|
|      3|   1.0|    21|   2.3792627|
|      0|   1.0|    11|   1.2660939|
|      4|   1.0|    14|   0.8775302|
|      5|   1.0|    14|   1.7154177|
|      4|   3.0|     2|   2.8543346|
+-------+------+------+------------+
only showing top 20 rows



In [29]:
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="rating")

In [34]:
evaluator.evaluate(prediction)

1.7563629531848577