### Createing SpackSession

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Recommendation_System').getOrCreate()

### Importing the Dataset

In [2]:
df = spark.read.csv('./data/12_movielens_ratings.csv',inferSchema=True,header=True)

In [3]:
df.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [4]:
df.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



### Splitting the dataset

In [5]:
training_set , test_set = df.randomSplit([0.8,0.2])

### Createing the model

In [6]:
from pyspark.ml.recommendation import ALS
recommender = ALS(userCol= 'userId', itemCol= 'movieId', ratingCol='rating')
recommender = recommender.fit(training_set)

### Predictiong using the test set

In [7]:
preds = recommender.transform(test_set)

In [8]:
preds.show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|      2|   1.0|    26| 2.2714884|
|      3|   1.0|    26| 1.4238312|
|      0|   1.0|    22|0.60962236|
|      5|   2.0|    22| 1.9389005|
|      6|   1.0|     1| 0.9734859|
|      2|   3.0|     6| 1.8363168|
|      6|   1.0|     6|  0.975117|
|      0|   1.0|     3|0.79794395|
|      0|   1.0|     5|0.95573896|
|      1|   1.0|     5| 2.3451767|
|      4|   1.0|    19| 1.4661646|
|      2|   1.0|    15| 1.4333471|
|      3|   1.0|    17|0.89547014|
|      4|   1.0|     9| 1.6242402|
|      2|   4.0|     8| 3.1553376|
|      3|   2.0|     8| 1.7767761|
|      4|   1.0|     7|   1.83481|
|      2|   1.0|    25| 1.7798879|
|      4|   1.0|    29| 1.3929825|
|      4|   3.0|     2| 2.3357182|
+-------+------+------+----------+
only showing top 20 rows



### Evaluationg the model

In [9]:
# root mean squared error = sqrt(sum(1,n) (|predicted - actual|)^2)
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol='rating')
evaluator.evaluate(preds)

1.019557382262578

### Making a recommendation

In [10]:
test_set.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      0|   1.0|     3|
|      0|   1.0|     5|
|      0|   1.0|    22|
|      1|   1.0|     5|
|      2|   1.0|    15|
|      2|   1.0|    25|
|      2|   1.0|    26|
|      2|   3.0|     6|
|      2|   4.0|     8|
|      3|   1.0|    17|
|      3|   1.0|    26|
|      3|   2.0|     8|
|      4|   1.0|     7|
|      4|   1.0|     9|
|      4|   1.0|    19|
|      4|   1.0|    29|
|      4|   3.0|     2|
|      5|   2.0|     0|
|      5|   2.0|    22|
|      6|   1.0|     1|
+-------+------+------+
only showing top 20 rows



In [11]:
test_set.filter(test_set['userId'] == 23).show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|     13|   4.0|    23|
|     22|   2.0|    23|
|     49|   5.0|    23|
|     83|   1.0|    23|
|     87|   3.0|    23|
|     97|   1.0|    23|
+-------+------+------+



In [12]:
single_user = test_set.filter(test_set['userId']==23)

In [13]:
single_user.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|     13|   4.0|    23|
|     22|   2.0|    23|
|     49|   5.0|    23|
|     83|   1.0|    23|
|     87|   3.0|    23|
|     97|   1.0|    23|
+-------+------+------+



In [14]:
recommendation = recommender.transform(single_user)

In [15]:
recommendation.orderBy('prediction',ascending = False).show()

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|     49|   5.0|    23|  3.795911|
|     13|   4.0|    23| 3.1566176|
|     22|   2.0|    23| 2.6542892|
|     97|   1.0|    23| 2.0540893|
|     87|   3.0|    23| 1.2983763|
|     83|   1.0|    23|  0.753503|
+-------+------+------+----------+

