In [1]:
# Check the environment
!java --version
!python --version

openjdk 11.0.18 2023-01-17
OpenJDK Runtime Environment (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1)
OpenJDK 64-Bit Server VM (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1, mixed mode, sharing)
Python 3.9.16


In [3]:
# Download Apache Spark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=61298d72f5bffc3d56c735c5604bd82f5ee97167102ef11da8de22e12401c4ae
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [4]:
# Initiate the Spark Session
from pyspark.sql import SparkSession

# Create Spark Session/Context
spark = SparkSession.builder \
  .master("local") \
  .appName("Hello PySpark") \
  .config ("spark.some.config.option", "some-value") \
  .getOrCreate()

In [5]:
# Check spark session
print(spark)

<pyspark.sql.session.SparkSession object at 0x7f72c80a79a0>


In [6]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [7]:
!pip install wget

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9676 sha256=56a5eb5e1d047e4efbe2c408b6654913a4dbe89383a597e8bee2fb5ade4c7d50
  Stored in directory: /root/.cache/pip/wheels/04/5f/3e/46cc37c5d698415694d83f607f833f83f0149e49b3af9d0f38
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [8]:
import wget

In [9]:
file = wget.download("https://raw.githubusercontent.com/apache/spark/master/data/mllib/als/sample_movielens_ratings.txt")

In [10]:
lines = spark.read.text("/content/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])


In [19]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=18, regParam=1.0, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)


In [20]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.6507297094819995


In [None]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10).show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{22, 3.687074}, ...|
|    10|[{92, 3.3655536},...|
|     0|[{92, 3.2275326},...|
|     1|[{62, 3.2108698},...|
|    21|[{29, 4.6235642},...|
|    11|[{32, 4.8050656},...|
|    12|[{46, 4.867862}, ...|
|    22|[{88, 4.2667727},...|
|     2|[{93, 4.458805}, ...|
|    13|[{93, 3.1507952},...|
|     3|[{51, 3.9332511},...|
|    23|[{32, 4.9350796},...|
|     4|[{29, 3.5907602},...|
|    24|[{69, 4.4119983},...|
|    14|[{52, 4.4537644},...|
|     5|[{55, 3.9585}, {4...|
|    15|[{46, 4.1165385},...|
|    25|[{47, 2.8637855},...|
|    26|[{88, 4.7184362},...|
|     6|[{25, 3.8741229},...|
+------+--------------------+
only showing top 20 rows



In [None]:
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10).show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{17, 3.8929477},...|
|     40|[{2, 3.5130334}, ...|
|     10|[{12, 3.4377112},...|
|     50|[{23, 3.891326}, ...|
|     80|[{26, 3.395955}, ...|
|     70|[{21, 3.5905616},...|
|     60|[{21, 2.7525523},...|
|     90|[{16, 4.52701}, {...|
|     30|[{26, 4.5177093},...|
|      0|[{28, 2.4868083},...|
|     31|[{12, 2.7964075},...|
|     81|[{28, 3.963631}, ...|
|     91|[{12, 2.6666212},...|
|      1|[{12, 2.700808}, ...|
|     41|[{21, 3.6565263},...|
|     61|[{6, 1.9976146}, ...|
|     51|[{26, 4.6777315},...|
|     21|[{26, 2.623012}, ...|
|     11|[{18, 3.3091846},...|
|     71|[{25, 2.8062804},...|
+-------+--------------------+
only showing top 20 rows



In [None]:
# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10).show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{88, 4.7184362},...|
|    19|[{90, 3.1640222},...|
|    29|[{46, 4.1753683},...|
+------+--------------------+



In [None]:
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10).show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     65|[{23, 4.1052895},...|
|     26|[{15, 2.23296}, {...|
|     29|[{21, 4.6235642},...|
+-------+--------------------+



In a nutshell, as per experimental lower values of RMSE indicate better accuracy of the regression model.


maxIter=5, regParam=0.1 the RMSE is = 1.0536207567527969

maxIter=5, regParam=0.5 the RMSE is = 1.3587065977248067

maxIter=10, regParam=0.1 the RMSE is = 1.0738216177715993

maxIter=10, regParam=0.5 the RMSE is = 1.357096647241042

maxIter=18, regParam=1.0 the RMSE is = 1.6507297094819995

The model that performs better according to the RMSE is the model with maxIter=5, regParam=0.1.