### Createing SpackSession

In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Recommendation_System').getOrCreate()

### Importing the Dataset

In [6]:
df = spark.read.csv('./data/12_movielens_ratings.csv',inferSchema=True,header=True)

In [7]:
df.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [8]:
df.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



### Splitting the dataset

In [9]:
training_set , test_set = df.randomSplit([0.8,0.2])

### Createing the model

In [13]:
from pyspark.ml.recommendation import ALS
recommender = ALS(userCol= 'userId', itemCol= 'movieId', ratingCol='rating')
recommender = recommender.fit(training_set)

### Predictiong using the test set

In [15]:
preds = recommender.transform(test_set)

In [16]:
preds.show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|      3|   1.0|    26| 1.1706842|
|      2|   1.0|    12| 0.8300074|
|      6|   1.0|    12|  1.082421|
|      0|   1.0|    13|   0.65486|
|      4|   2.0|    13| 1.8417549|
|      5|   1.0|    13|0.68662375|
|      0|   1.0|     6| 1.2873976|
|      2|   1.0|    16| 1.5040722|
|      0|   1.0|     3|  0.645666|
|      6|   1.0|    20| 1.3907601|
|      1|   1.0|     5| 2.1129293|
|      5|   1.0|     5| 1.3788033|
|      6|   2.0|    19| 1.1254199|
|      3|   1.0|    17|0.98663855|
|      3|   2.0|     8| 1.7552651|
|      2|   4.0|    10| 2.0877345|
|      4|   3.0|    10| 0.9369778|
|      4|   1.0|    24| 1.3532029|
|      4|   1.0|    29| 1.3602777|
|      6|   2.0|    11|  2.126032|
+-------+------+------+----------+
only showing top 20 rows



### Evaluationg the model

In [18]:
# root mean squared error = sqrt(sum(1,n) (|predicted - actual|)^2)
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol='rating')
evaluator.evaluate(preds)

1.0346734185584492

### Making a recommendation

In [19]:
test_set.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      0|   1.0|     3|
|      0|   1.0|     6|
|      0|   1.0|    13|
|      1|   1.0|     5|
|      2|   1.0|    12|
|      2|   1.0|    16|
|      2|   4.0|    10|
|      3|   1.0|    17|
|      3|   1.0|    26|
|      3|   2.0|     8|
|      4|   1.0|    24|
|      4|   1.0|    29|
|      4|   2.0|    13|
|      4|   3.0|    10|
|      5|   1.0|     5|
|      5|   1.0|    13|
|      5|   2.0|     0|
|      6|   1.0|    12|
|      6|   1.0|    20|
|      6|   2.0|    11|
+-------+------+------+
only showing top 20 rows



In [23]:
test_set.filter(test_set['userId'] == 23).show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|     14|   1.0|    23|
|     24|   1.0|    23|
|     25|   1.0|    23|
|     48|   5.0|    23|
|     64|   4.0|    23|
|     72|   1.0|    23|
|     77|   1.0|    23|
|     84|   1.0|    23|
|     87|   3.0|    23|
+-------+------+------+



In [27]:
single_user = test_set.filter(test_set['userId']==23)

In [28]:
single_user.show()

+------+-------+
|userId|movieId|
+------+-------+
|    23|     14|
|    23|     24|
|    23|     25|
|    23|     48|
|    23|     64|
|    23|     72|
|    23|     77|
|    23|     84|
|    23|     87|
+------+-------+



In [30]:
recommendation = recommender.transform(single_user)

In [31]:
recommendation.orderBy('prediction',ascending = False).show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|    23|     64| 3.4092183|
|    23|     48| 2.3060563|
|    23|     87|  2.145935|
|    23|     77| 1.6267176|
|    23|     72| 1.6068959|
|    23|     14| 1.4555113|
|    23|     24| 1.3340608|
|    23|     84|0.80534863|
|    23|     25|0.19607179|
+------+-------+----------+

