In [1]:
import os

from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('Reco System')
    .getOrCreate()
)

In [2]:
# Location of the MovieLens ratings CSV. The default is the original absolute
# path; set the MOVIELENS_RATINGS_CSV environment variable to point at the file
# on another machine instead of editing this cell.
ratings_path = os.environ.get(
    'MOVIELENS_RATINGS_CSV',
    '/home/ubuntu/Course_Notes/Spark_for_Machine_Learning/Recommender_Systems/movielens_ratings.csv',
)

# header=True takes the column names from the first row; inferSchema=True lets
# Spark derive each column's type from the data.
df = spark.read.csv(ratings_path, inferSchema=True, header=True)

In [3]:
# Print the column names and the types that schema inference assigned.
df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)



In [5]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = df.describe()
summary_stats.show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [6]:
# Hold out roughly 20% of the ratings for evaluation. The seed is fixed so that
# Restart & Run All reproduces the same split; without it every run trains and
# evaluates on different rows and the reported RMSE changes.
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [7]:
from pyspark.ml.recommendation import ALS

In [11]:
# ALS matrix-factorisation recommender over (userId, movieId, rating) rows.
# coldStartStrategy='drop' (available from Spark 2.2) removes rows whose user or
# movie was not seen during fitting; the default ('nan') gives those rows a NaN
# prediction, and a single NaN makes the RMSE computed later in the notebook NaN.
# seed pins the random factor initialisation so that refits are repeatable.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42,
)

In [12]:
# Fit the ALS model on the training split.
model = als.fit(train)

In [13]:
# Score the held-out split; the result carries an added `prediction` column.
preds = model.transform(test)

In [14]:
# Compare actual ratings with predicted ones for the first rows of the test set.
preds.show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|     31|   1.0|    29| 1.9836011|
|     31|   1.0|     0| 0.9678407|
|     31|   1.0|    18|  4.013491|
|     85|   1.0|    12|  1.971451|
|     85|   3.0|     1| 1.0121034|
|     85|   1.0|    13| 3.1084843|
|     85|   5.0|    16|-0.7535914|
|     65|   1.0|    28| 1.3498251|
|     65|   1.0|    16|-2.9281168|
|     65|   1.0|    19| 1.1523376|
|     65|   1.0|     4|  1.236587|
|     65|   1.0|    24|-2.7423232|
|     65|   1.0|     2| 1.9369289|
|     53|   1.0|     9| 3.3617458|
|     53|   5.0|     8| 3.4970675|
|     53|   1.0|     7|  4.167491|
|     53|   3.0|    14| 1.8687303|
|     78|   1.0|    24| 0.3218385|
|     78|   1.0|     2| 1.7553501|
|     34|   3.0|     3| 1.1053158|
+-------+------+------+----------+
only showing top 20 rows



In [15]:
from pyspark.ml.evaluation import RegressionEvaluator

In [22]:
# RMSE between the observed `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction',
)

In [23]:
# Compute the evaluator's configured metric (RMSE) over the test-set predictions.
result = evaluator.evaluate(preds)

In [24]:
# Display the RMSE value computed above.
result

1.8361818136082781

In [30]:
# Candidate (movieId, userId) pairs for user 11, taken from the held-out split.
# The rating column is left out: only the two id columns are kept for scoring.
is_user_11 = test.userId == 11
single_user = test.filter(is_user_11).select('movieId', 'userId')

In [31]:
# Inspect the movies that will be scored for user 11.
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|     18|    11|
|     25|    11|
|     27|    11|
|     30|    11|
|     35|    11|
|     39|    11|
|     45|    11|
|     51|    11|
|     86|    11|
+-------+------+



In [35]:
# Score user 11's candidate movies and list them from the highest to the lowest
# predicted rating.
(
    model.transform(single_user)
    .orderBy('prediction', ascending=False)
    .show()
)

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|     51|    11|  4.849427|
|     39|    11| 4.2310934|
|     18|    11| 3.3515453|
|     30|    11| 3.0112529|
|     45|    11| 2.9636347|
|     35|    11| 1.6733391|
|     86|    11| 1.4062023|
|     27|    11| 1.0328393|
|     25|    11| 1.0035653|
+-------+------+----------+

