In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('rec').getOrCreate()

25/01/10 08:12:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
from pyspark.ml.recommendation import ALS

In [6]:
from pyspark.ml.evaluation import RegressionEvaluator

In [7]:
data = spark.read.csv('movielens_ratings.csv', inferSchema=True, header=True)

                                                                                

In [8]:
data.head(1)

[Row(movieId=2, rating=3.0, userId=0)]

In [13]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [22]:
training, test = data.randomSplit([0.8,0.2])

In [23]:
als = ALS(maxIter= 5, userCol='userId', itemCol='movieId', ratingCol='rating')

In [24]:
model = als.fit(training)

In [25]:
predictions = model.transform(test)

In [27]:
predictions.show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|      2|   4.0|    28| 1.2507821|
|      7|   1.0|    28| 2.1080952|
|      1|   1.0|    26|0.47395125|
|      3|   1.0|    26| 1.1830662|
|      4|   1.0|    12| 1.3890378|
|      3|   2.0|    22| 1.1688485|
|      1|   1.0|     6| 0.9500195|
|      4|   1.0|     5| 1.8740482|
|      2|   3.0|     9| 2.0168698|
|      3|   1.0|    17|   1.26727|
|      4|   1.0|     9| 1.5125542|
|      0|   1.0|     8| 1.4125558|
|      7|   1.0|     8|0.99924356|
|      2|   4.0|    10| 1.5855509|
|      2|   1.0|    25|0.98678195|
|      7|   1.0|    25| 1.1938453|
|      7|   1.0|    24| 1.9656888|
|      4|   1.0|    29| 1.6397057|
|      2|   4.0|    21| 2.3725717|
|      6|   2.0|    11| 2.0527475|
+-------+------+------+----------+
only showing top 20 rows



In [30]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

In [31]:
rmse = evaluator.evaluate(predictions)

In [32]:
print('RMSE')
print(rmse)

RMSE
1.0328213961070793


In [34]:
single_user = test.filter(test['userId']== 11).select(['movieId','userId'])

In [35]:
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      6|    11|
|     12|    11|
|     19|    11|
|     23|    11|
|     35|    11|
|     39|    11|
|     67|    11|
|     71|    11|
|     78|    11|
|     88|    11|
|     94|    11|
+-------+------+



In [36]:
recomendation = model.transform(single_user)

In [38]:
recomendation.orderBy('prediction',ascending= False).show()



+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|     39|    11|  3.057804|
|     23|    11| 2.9321082|
|     19|    11| 2.7986448|
|     94|    11| 2.5764356|
|     35|    11| 2.5759428|
|     88|    11| 2.1090224|
|      6|    11| 2.0527475|
|     71|    11| 1.9554276|
|     67|    11| 1.1501582|
|     78|    11| 1.0625054|
|     12|    11| 0.7853823|
+-------+------+----------+

