In [1]:
# Install Java
!apt-get install openjdk-11-jdk -y > /dev/null

# Download Spark
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar -xzf spark-3.4.1-bin-hadoop3.tgz

# Install Python dependencies
!pip install -q pyspark


In [2]:
import os
from pyspark.sql import SparkSession

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"
os.environ["PATH"] += f":{os.environ['SPARK_HOME']}/bin"

spark = SparkSession.builder \
    .appName("SparkRecommendationEngine") \
    .master("local[*]") \
    .getOrCreate()


In [3]:
# Download MovieLens 100k dataset
!wget -q http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -q ml-100k.zip -d /content/

# Load MovieLens dataset (users, movies, and ratings)
ratings = spark.read.csv('/content/ml-100k/u.data', sep='\t', inferSchema=True, header=False)
ratings = ratings.withColumnRenamed('_c0', 'userId').withColumnRenamed('_c1', 'movieId').withColumnRenamed('_c2', 'rating').withColumnRenamed('_c3', 'timestamp')

movies = spark.read.csv('/content/ml-100k/u.item', sep='|', inferSchema=True, header=False)
movies = movies.withColumnRenamed('_c0', 'movieId').withColumnRenamed('_c1', 'title')

ratings.show(5)
movies.show(5)


+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
+------+-------+------+---------+
only showing top 5 rows

+-------+-----------------+-----------+----+--------------------+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|movieId|            title|        _c2| _c3|                 _c4|_c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|
+-------+-----------------+-----------+----+--------------------+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|      1| Toy Story (1995)|01-Jan-1995|null|http://us.imdb.co...|  0|  0|  0|  1|  1|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|
|      2| GoldenEye (1995)|01-Jan-1995|null|h

In [4]:
# Join ratings and movies dataset
data = ratings.join(movies, on='movieId').select('userId', 'movieId', 'rating', 'title')

# Show joined data
data.show(5)


+------+-------+------+--------------------+
|userId|movieId|rating|               title|
+------+-------+------+--------------------+
|   196|    242|     3|        Kolya (1996)|
|   186|    302|     3|L.A. Confidential...|
|    22|    377|     1| Heavyweights (1994)|
|   244|     51|     2|Legends of the Fa...|
|   166|    346|     1| Jackie Brown (1997)|
+------+-------+------+--------------------+
only showing top 5 rows



In [5]:
from pyspark.ml.recommendation import ALS

# Split the data into training and test sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# Initialize the ALS model
als = ALS(userCol='userId', itemCol='movieId', ratingCol='rating', coldStartStrategy='drop')

# Train the model
model = als.fit(train_data)

# Make predictions
predictions = model.transform(test_data)

# Show some predictions
predictions.show(5)


+------+-------+------+--------------------+----------+
|userId|movieId|rating|               title|prediction|
+------+-------+------+--------------------+----------+
|   148|     70|     5|Four Weddings and...| 3.2065504|
|   148|     71|     5|Lion King, The (1...| 3.5808196|
|   148|     89|     5| Blade Runner (1982)| 4.1839113|
|   148|    114|     5|Wallace & Gromit:...|  4.797039|
|   148|    177|     2|Good, The Bad and...|  3.337861|
+------+-------+------+--------------------+----------+
only showing top 5 rows



In [6]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE evaluation
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) = {rmse:.3f}")


Root Mean Squared Error (RMSE) = 0.924


In [7]:
# Get top 10 movie recommendations for a user (e.g., user 1)
user_recs = model.recommendForUserSubset(data.filter(data.userId == 1), 10)

# Show recommendations for user 1
user_recs.show(10, truncate=False)


+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                       |
+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1     |[{1589, 5.509641}, {1449, 5.229162}, {613, 5.0571914}, {1463, 5.0504336}, {119, 4.8934026}, {169, 4.8902206}, {1405, 4.8798966}, {302, 4.851243}, {1137, 4.837176}, {1398, 4.8334484}]|
+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [8]:
# Get top 10 movie recommendations for all users
all_user_recs = model.recommendForAllUsers(10)

# Show recommendations for a few users
all_user_recs.show(10, truncate=False)


+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                        |
+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1     |[{1589, 5.509641}, {1449, 5.229162}, {613, 5.0571914}, {1463, 5.0504336}, {119, 4.8934026}, {169, 4.8902206}, {1405, 4.8798966}, {302, 4.851243}, {1137, 4.837176}, {1398, 4.8334484}] |
|3     |[{1591, 4.540423}, {838, 4.365869}, {114, 4.2859}, {1388, 4.256133}, {593, 4.220229}, {320, 4.218633}, {902, 4.194911}, {1368, 4.1844187}, {1143, 4.1816006}, {205, 4.1202497}]        |
|5     |[{838, 4.8842177}, {851, 4.