## Project Setup

### Dependencies

In [1]:
import random
import findspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import rank, col, udf
from pyspark.ml.evaluation import RegressionEvaluator

### Spark Setup

In [2]:
# Initialise findspark first so the Spark installation is set up before a
# session is requested.
findspark.init()

# getOrCreate() hands back an already-running session when there is one,
# otherwise it starts a new session under the given application name.
sessionBuilder = SparkSession.builder.appName('TestRecommender')
spark = sessionBuilder.getOrCreate()

### Ratings dataframe

In [3]:
# Read the ratings CSV: the first row is the header and the column types are
# inferred from the data instead of being read as plain strings.
ratings = spark.read.csv('ml-latest-small/ratings.csv', header=True, inferSchema=True)

# Report the row count, then show the inferred schema and a sample of rows.
ratingsCount = ratings.count()
print(f"Total ratings: {ratingsCount}")
ratings.printSchema()
ratings.show()

Total ratings: 100004
root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
|     1|   1287|   2.0|1260759187|
|     1|   1293|   2.0|1260759148|
|     1|   1339|   3.5|1260759125|
|     1|   1343|   2.0|1260759131|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   1953|   4.0|1260759191|
|     1|   2105|   4.0|1260759139|
|     1|   2150|   3.0|1260759194|
|     1|   2193|   2.0|1260759198|
|     1|   2294|   2.0|1260759108|
|     1|   2455|   2.5|1260759113|
|     1|   2968|   1.0|1260759200|
|     1|   3671|   3.0|1260759117|
+------+-------+------+----------+

### Training and test dataset creation

- Training: 75%
- Test: 25%

In [4]:
# Split the ratings into roughly 75% training / 25% test rows.
# randomSplit assigns rows at random, so without a seed every run produces a
# different split and the row counts and metrics below cannot be reproduced.
# A fixed seed keeps the notebook repeatable under Restart & Run All.
(trainingDataset, testDataset) = ratings.randomSplit([0.75, 0.25], seed=42)
trainingCount = trainingDataset.count()
testCount = testDataset.count()
print("Training row count: " + str(trainingCount))
print("Test row count: " + str(testCount))

Training row count: 75307
Test row count: 24697


### Train the model and create top-10 recommendations

In [5]:
# Configure ALS on the (userId, movieId, rating) columns of the training split.
# - coldStartStrategy='drop' removes rows the model cannot score (users or
#   movies it never saw during fit) instead of emitting NaN predictions.
# - seed pins the random initialisation of the factor matrices so that the
#   recommendations and the accuracy metrics are repeatable between runs.
als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating',
          coldStartStrategy='drop', seed=42)
model = als.fit(trainingDataset)

# Ten highest-scoring movies per user, returned as an array of
# (movieId, rating) structs in the `recommendations` column.
userRecs = model.recommendForAllUsers(10)
userRecs.printSchema()
userRecs.show(20)

root
 |-- userId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- movieId: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[7063, 5.815272]...|
|   463|[[946, 5.2240906]...|
|   496|[[69526, 7.290323...|
|   148|[[67255, 6.755663...|
|   540|[[1162, 11.47817]...|
|   392|[[2730, 7.760437]...|
|   243|[[443, 5.37547], ...|
|   623|[[6773, 6.191035]...|
|    31|[[3653, 5.818799]...|
|   516|[[72641, 5.737064...|
|   580|[[6773, 5.3468013...|
|   251|[[3653, 7.0004644...|
|   451|[[299, 8.640146],...|
|    85|[[40629, 11.98269...|
|   137|[[1603, 8.688451]...|
|    65|[[2071, 10.931352...|
|   458|[[2071, 6.248592]...|
|   481|[[1085, 6.3718367...|
|    53|[[4062, 8.568881]...|
|   255|[[6773, 5.810036]...|
+------+--------------------+
only showing top 20 rows



### Display recommendations

In [6]:
# Turn each user's recommendation array into one row per recommended movie.
# posexplode emits two columns: `pos` (0-based position inside the array)
# and `col` (the array element, here a struct with movieId and rating).
explodedRecs = userRecs.select('userId', F.posexplode('recommendations'))

# Unpack the struct fields and convert the 0-based position to a 1-based rank.
topNRecommendations = explodedRecs.select(
    'userId',
    F.col('col.movieId').alias('movieId'),
    F.col('col.rating').alias('prediction'),
    (F.col('pos') + 1).alias('rank'),
)

topNRecommendations.show()

+------+-------+----------+----+
|userId|movieId|prediction|rank|
+------+-------+----------+----+
|   471|   7063|  5.815272|   1|
|   471|   2071|   5.41364|   2|
|   471|  81229|  5.396172|   3|
|   471|   5146|  5.371749|   4|
|   471|   5603|  5.342844|   5|
|   471|   4225| 5.3205905|   6|
|   471|  83318|  5.313382|   7|
|   471|  83359|  5.313382|   8|
|   471|   3478| 5.2698746|   9|
|   471|    314| 5.2604284|  10|
|   463|    946| 5.2240906|   1|
|   463|   3468| 5.2091074|   2|
|   463|   1361|  5.130007|   3|
|   463|  67255| 5.1180673|   4|
|   463|   3357|  5.088714|   5|
|   463|    994|  5.031719|   6|
|   463|   2132| 4.9852486|   7|
|   463|  58303| 4.9835353|   8|
|   463|   1172|  4.966971|   9|
|   463|   3265| 4.9506445|  10|
+------+-------+----------+----+
only showing top 20 rows



### Filter and show recommendations for user 85

In [7]:
# Fetch the single recommendations row belonging to user 85.
user85Recs = userRecs.filter(F.col('userId') == 85).first()

# Load the movie catalogue and reshape it, via pandas, into a plain dict
# keyed by movieId whose values are the remaining columns as a list.
moviesDf = spark.read.csv('ml-latest-small/movies.csv', header=True, inferSchema=True)
movies = moviesDf.toPandas().set_index('movieId').T.to_dict('list')

# Print the catalogue entry for every movie recommended to the user.
for rec in user85Recs.recommendations:
    print(movies[rec['movieId']])

['Pride & Prejudice (2005)', 'Drama|Romance']
['Love Is a Many-Splendored Thing (1955)', 'Drama|Romance|War']
['Blind Side, The  (2009)', 'Drama']
['And the Band Played On (1993)', 'Drama']
['Last Temptation of Christ, The (1988)', 'Drama']
['Help, The (2011)', 'Drama']
['Bamba, La (1987)', 'Drama']
['Queen of the Damned (2002)', 'Fantasy|Horror']
['Aeon Flux (2005)', 'Action|Sci-Fi']
['Dead Man (1995)', 'Drama|Mystery|Western']


## Recommender Accuracy Metrics

### Get predictions for test dataset

In [10]:
# Run the fitted model over the test split; the result carries the model's
# output alongside the original test columns.
testRatingsAndPredictions = model.transform(testDataset)

print('----TEST DATASET----')
testDataset.show()

print('----PREDICTIONS FOR TEST DATASET----')
# Order by user so the predictions line up with the test listing above.
predictionsByUser = testRatingsAndPredictions.orderBy('userId')
predictionsByUser.show()


----TEST DATASET----
+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|   1172|   4.0|1260759205|
|     1|   1339|   3.5|1260759125|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   2294|   2.0|1260759108|
|     2|     52|   3.0| 835356031|
|     2|     62|   3.0| 835355749|
|     2|    144|   3.0| 835356016|
|     2|    165|   3.0| 835355441|
|     2|    208|   3.0| 835355511|
|     2|    222|   5.0| 835355840|
|     2|    235|   3.0| 835355664|
|     2|    248|   3.0| 835355896|
|     2|    265|   5.0| 835355697|
|     2|    273|   4.0| 835355779|
|     2|    300|   3.0| 835355532|
|     2|    314|   4.0| 835356044|
|     2|    319|   1.0| 835355918|
|     2|    350|   4.0| 835355697|
|     2|    364|   3.0| 835355604|
+------+-------+------+----------+
only showing top 20 rows

----PREDICTIONS FOR TEST DATASET----
+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|

### MAE, RMSE

In [11]:
# Both evaluators compare the true `rating` with the model's `prediction`;
# they differ only in the metric they report.
evalColumns = dict(labelCol='rating', predictionCol='prediction')
maeEval = RegressionEvaluator(metricName='mae', **evalColumns)
rmseEval = RegressionEvaluator(metricName='rmse', **evalColumns)

# Evaluate in the same order as before: MAE first, then RMSE.
mae, rmse = (evaluator.evaluate(testRatingsAndPredictions) for evaluator in (maeEval, rmseEval))
print(f'MAE: {mae}')
print(f'RMSE: {rmse}')

MAE: 0.8455554777050429
RMSE: 1.1260865953551071
