In [1]:
# findspark.init() runs here, ahead of every pyspark import in this notebook;
# keep this cell first.
# NOTE(review): presumably init() locates the local Spark installation and
# makes pyspark importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() hands back an existing
# session when there is one, otherwise it builds a new one with this app name.
spark = (
    SparkSession.builder
    .appName("Python Spark Recommendation Systems Example")
    .getOrCreate()
)

# Print the session object as a quick check that it was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001A491456BE0>


In [3]:
import os

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import types

# Praproses (Preprocessing)

In [4]:
# Load Data
# The CSV location defaults to the original author's local path, but it can be
# overridden with the RATINGS_CSV environment variable so the notebook runs on
# other machines without editing this cell.
ratings_csv_path = os.environ.get(
    "RATINGS_CSV",
    "D://5115100117//Big Data//Tugas 4 Big Data//ratings.csv",
)

# header=True takes the column names from the first line of the file and
# inferSchema=True asks Spark to infer the column types from the data.
# Rows that contain any empty (null) value are removed by the trailing .na.drop().
ratings = spark.read.csv(ratings_csv_path, header=True, inferSchema=True).na.drop()

In [5]:
# Cast the three columns later passed to ALS (userCol / itemCol / ratingCol)
# to explicit numeric types, one column at a time. "timestamp" is not touched.
for column_name, target_type in (("userId", "int"), ("movieId", "int"), ("rating", "float")):
    ratings = ratings.withColumn(column_name, ratings[column_name].cast(target_type))

# Preview the first rows of the typed frame.
ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
|     1|   1590|   2.5|1256677236|
|     1|   1591|   1.5|1256677475|
|     1|   2134|   4.5|1256677464|
|     1|   2478|   4.0|1256677239|
|     1|   2840|   3.0|1256677500|
|     1|   2986|   2.5|1256677496|
|     1|   3020|   4.0|1256677260|
|     1|   3424|   4.5|1256677444|
|     1|   3698|   3.5|1256677243|
|     1|   3826|   2.0|1256677210|
|     1|   3893|   3.5|1256677486|
|     2|    170|   3.5|1192913581|
|     2|    849|   3.5|1192913537|
|     2|   1186|   3.5|1192913611|
|     2|   1235|   3.0|1192913585|
+------+-------+------+----------+
only showing top 20 rows



# Create Model

In [6]:
# Randomly split the ratings into training and test sets with weights 0.8 / 0.2.
# The seed is fixed so that Restart & Run All reproduces the same split (and
# therefore comparable metrics) instead of a fresh random split on every run.
# NOTE(review): repeatability also assumes the input is read with the same
# partitioning/order on each run — confirm for your environment.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [7]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
# The seed is pinned so that refitting on the same training data is repeatable
# rather than depending on a per-session default.
# NOTE(review): the recommendation scores displayed later in this notebook reach
# roughly 20-40, while the ratings previewed above top out at 4.5. regParam=0.01
# may be too weak a regularisation for sparsely rated items — consider tuning it.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [8]:
# Score the held-out test rows with the fitted model, then compute the RMSE
# between the "rating" column (label) and the "prediction" column.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = {}".format(rmse))

Root-mean-square error = 0.8367711012723046


In [9]:
# Number of recommendations requested per user and per movie.
top_n = 10

# Top-N movie recommendations for each user
userRecs = model.recommendForAllUsers(top_n)
# Top-N user recommendations for each movie
movieRecs = model.recommendForAllItems(top_n)

In [10]:
# Preview the per-user recommendations with show()'s defaults spelled out:
# the first 20 rows, with long cell values truncated.
userRecs.show(n=20, truncate=True)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   148|[[182521, 20.8335...|
|   463|[[182521, 12.6767...|
|   471|[[182521, 15.2072...|
|   496|[[182521, 40.2692...|
|   833|[[182521, 17.0266...|
|  1088|[[182521, 18.0577...|
|  1238|[[114891, 9.96217...|
|  1342|[[95159, 12.43337...|
|  1580|[[182521, 15.6196...|
|  1591|[[182521, 21.7430...|
|  1645|[[182521, 21.8107...|
|  1829|[[182521, 25.2009...|
|  1959|[[182521, 23.6266...|
|  2122|[[182521, 16.3207...|
|  2142|[[134340, 19.3434...|
|  2366|[[182521, 27.4708...|
|  2659|[[182521, 22.9913...|
|  2866|[[182521, 16.3426...|
|  3175|[[182521, 13.0625...|
|  3749|[[182521, 26.1780...|
+------+--------------------+
only showing top 20 rows



In [11]:
# Preview the per-movie recommendations with show()'s defaults spelled out:
# the first 20 rows, with long cell values truncated.
movieRecs.show(n=20, truncate=True)

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|    148|[[62554, 8.840129...|
|    463|[[91645, 8.775639...|
|    471|[[263003, 7.05658...|
|    496|[[87053, 10.37394...|
|    833|[[45743, 7.743263...|
|   1088|[[83527, 7.717355...|
|   1238|[[216879, 7.08419...|
|   1342|[[196206, 7.81074...|
|   1580|[[213167, 6.26365...|
|   1591|[[195814, 7.06737...|
|   1645|[[45577, 7.551851...|
|   1829|[[7415, 7.5539584...|
|   1959|[[41155, 7.072745...|
|   2122|[[68522, 7.945891...|
|   2142|[[169701, 8.28001...|
|   2366|[[22152, 9.07409]...|
|   2659|[[99385, 8.08814]...|
|   2866|[[35635, 6.776875...|
|   3175|[[179827, 7.31754...|
|   3749|[[85393, 11.91140...|
+-------+--------------------+
only showing top 20 rows

