## Project Setup

### Dependencies

In [1]:
import findspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession 
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

### Spark Setup

In [2]:
findspark.init()
spark = SparkSession.builder.appName('TestRecommender').getOrCreate()

### Ratings dataframe

In [6]:
ratings = spark.read.option('header', 'true').option("inferSchema", "true").csv('ml-latest-small/ratings.csv')
ratingsCount = ratings.count()
print("Total ratings: " + str(ratingsCount))
ratings.printSchema()
ratings.show()

Total ratings: 100004
root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
|     1|   1287|   2.0|1260759187|
|     1|   1293|   2.0|1260759148|
|     1|   1339|   3.5|1260759125|
|     1|   1343|   2.0|1260759131|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   1953|   4.0|1260759191|
|     1|   2105|   4.0|1260759139|
|     1|   2150|   3.0|1260759194|
|     1|   2193|   2.0|1260759198|
|     1|   2294|   2.0|1260759108|
|     1|   2455|   2.5|1260759113|
|     1|   2968|   1.0|1260759200|
|     1|   3671|   3.0|1260759117|
+------+

### Training, Test Dataset creation
Training: 75%
Test: 25%

In [8]:
(trainingDataset, testDataset) = ratings.randomSplit([0.75, 0.25])
trainingCount = trainingDataset.count()
testCount = testDataset.count()
print("Training row count: " + str(trainingCount))
print("Test row count: " + str(testCount))

Training row count: 74854
Test row count: 25150


### Training the model and create Top 10 recommendations

In [13]:
als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating', coldStartStrategy='drop')
model = als.fit(trainingDataset)
userRecs = model.recommendForAllUsers(10)
userRecs.printSchema()
userRecs.show(20)

root
 |-- userId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- movieId: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[1273, 5.9641657...|
|   463|[[83411, 5.69187]...|
|   496|[[5152, 7.9766097...|
|   148|[[26729, 5.463170...|
|   540|[[26729, 9.037238...|
|   392|[[1057, 8.550612]...|
|   243|[[3414, 5.6985326...|
|   623|[[94466, 6.304226...|
|    31|[[122904, 6.76062...|
|   516|[[3548, 5.847185]...|
|   580|[[1211, 4.9287505...|
|   251|[[3925, 7.8438787...|
|   451|[[5477, 8.435648]...|
|    85|[[2287, 9.49665],...|
|   137|[[3414, 8.761408]...|
|    65|[[7444, 8.162363]...|
|   458|[[1273, 5.980074]...|
|   481|[[1111, 6.1750154...|
|    53|[[71899, 8.89332]...|
|   255|[[2467, 6.5249333...|
+------+--------------------+
only showing top 20 rows



### Display recommendations

In [14]:
topNRecommendations = userRecs.withColumn('rec_array', F.explode('recommendations'))\
                              .withColumn('movieId', F.col('rec_array.movieId'))\
                              .withColumn('prediction', F.col('rec_array.rating'))\
                              .select('userId', 'movieId', 'prediction')

topNRecommendations.show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   471|   1273| 5.9641657|
|   471|   1354|   5.84264|
|   471|  94466| 5.8166423|
|   471|   3925|  5.805952|
|   471|   8638|  5.715417|
|   471|   2203|   5.70847|
|   471|   3260|  5.687847|
|   471|   3265| 5.6527867|
|   471|   3163|  5.642231|
|   471|   3414|   5.59465|
|   463|  83411|   5.69187|
|   463|  67504|   5.69187|
|   463|  83359|   5.69187|
|   463|   7063| 5.4603014|
|   463|   2318| 5.4062123|
|   463|   4914| 5.1894364|
|   463|  26712| 5.1789856|
|   463|   7771| 5.1621046|
|   463|   3633| 5.1548343|
|   463|  25769|  5.133811|
+------+-------+----------+
only showing top 20 rows



### Filter and show recommendations for user 85

In [15]:
user85Recs = userRecs.filter(userRecs['userId'] == 85).first()
movies = spark.read.option('header', 'true').option("inferSchema", "true").csv('ml-latest-small/movies.csv').toPandas().set_index('movieId').T.to_dict('list')
for recommendation in user85Recs.recommendations:
    print(movies[recommendation['movieId']])

['Them! (1954)', 'Horror|Sci-Fi|Thriller']
['Towering Inferno, The (1974)', 'Action|Adventure|Drama|Thriller']
['Amateur (1994)', 'Crime|Drama|Thriller']
['Clue (1985)', 'Comedy|Crime|Mystery|Thriller']
['Pillow Book, The (1996)', 'Drama|Romance']
['State and Main (2000)', 'Comedy|Drama']
['Hearts of Darkness: A Filmmakers Apocalypse (1991)', 'Documentary']
['G.I. Joe: The Rise of Cobra (2009)', 'Action|Adventure|Sci-Fi|Thriller']
['Paris Is Burning (1990)', 'Documentary']
['For Your Eyes Only (1981)', 'Action|Adventure|Thriller']


## Recommender Metrics

### Get predictions for test dataset

In [21]:
testRatingsAndPredictions = model.transform(testDataset)
testRatingsAndPredictions.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|    30|    463|   4.0| 945277405| 2.9271078|
|   311|    463|   3.0| 898008246| 2.4776416|
|   440|    471|   3.0| 835337519|  2.185658|
|   292|    471|   3.5|1140049920|  4.057743|
|   306|    471|   3.0| 939718996| 3.5492811|
|   607|    471|   4.0|1118247731| 3.6709409|
|   585|    471|   4.0| 975363578|  4.372205|
|   195|    471|   3.0| 976289176| 3.1663375|
|   487|    471|   4.0| 832837388| 4.2497973|
|   105|    471|   4.0|1085574088|  3.841051|
|   296|    833|   4.5|1298158960|  4.722431|
|   294|    833|   2.0|1047074195| 1.3547952|
|    52|   1088|   4.0|1231766626|  2.179169|
|   363|   1088|   2.0| 942345287| 3.2759418|
|    15|   1088|   2.0|1122576683| 2.7004256|
|   547|   1088|   5.0|1384462255| 2.4072495|
|   387|   1088|   4.0| 974790964| 3.9749198|
|    97|   1088|   2.0|1460342716| 1.0973232|
|   509|   1088|   2.0|1093295913|

### MAE, RMSE

In [28]:
maeEval = RegressionEvaluator(metricName='mae', labelCol='rating', predictionCol='prediction')
rmseEval = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
mae = maeEval.evaluate(testRatingsAndPredictions)
rmse = rmseEval.evaluate(testRatingsAndPredictions)
print('MAE: ' + str(mae))
print('RMSE: '+ str(rmse))

MAE: 0.8523705336377525
RMSE: 1.1318586171329739


### Top-N Recommendations evaluation Metrics

#### Hit Rate

In [31]:
hitRecommendations = topNRecommendations.alias('a')\
                                        .join(testDataset.alias('b'), (F.col('a.userId') == F.col('b.userId')) &
                                                                      (F.col('a.movieId') == F.col('b.movieId')))\
                                        .select('a.userId', 'a.movieId', 'prediction', 'rating')

hitRate = hitRecommendations.count()/testDataset.count()
print('HitRate: '+ str(hitRate))

HitRate: 0.0018687872763419482


#### Cumulative Hit Rate

In [33]:
ratingCutOff = 3.0
cumulativeHitRecommendations = hitRecommendations.filter(F.col('rating') >= ratingCutOff)
cumulativeHitRate = cumulativeHitRecommendations.count()/testDataset.count()
print('CumulativeHitRate: '+ str(cumulativeHitRate))

CumulativeHitRate: 0.0016699801192842942


#### RatingHitRate

In [38]:
hitsGroupedByRating = hitRecommendations.groupBy(['rating']).agg(F.count(F.lit(1)).alias("total_hits"))
testsGroupedByRating = testDataset.groupBy('rating').agg(F.count(F.lit(1)).alias("total_recs"))
        
histPerRating = hitsGroupedByRating.alias('a')\
                                   .join(testsGroupedByRating.alias('b'), F.col('a.rating') == F.col('b.rating'))\
                                   .select('a.rating', 'total_recs', 'total_hits')\
        
ratingHitRate = histPerRating.withColumn('hit_rate', F.col('total_hits') / F.col('total_recs'))\
                             .select('rating', 'hit_rate').sort(F.desc("rating"))
ratingHitRate.show()

+------+--------------------+
|rating|            hit_rate|
+------+--------------------+
|   5.0|0.002140754616002...|
|   4.5|0.002048131080389145|
|   4.0|0.002612042892493...|
|   3.5|0.001126126126126...|
|   3.0|0.001606103192130...|
|   2.5|8.849557522123894E-4|
|   2.0|0.001622498647917...|
|   1.0|0.001148105625717566|
+------+--------------------+



In [None]:
class RecommenderMetrics:
    
    def MAE(valuesAndPredictions, labelCol='rating', predictionCol='prediction'):
        mae = RegressionEvaluator(metricName='mae', labelCol=labelCol, predictionCol=predictionCol)
        return mae.evaluate(valuesAndPredictions)
    
    
    def RMSE(valuesAndPredictions, labelCol='rating', predictionCol='prediction'):
        rmse = RegressionEvaluator(metricName='rmse', labelCol=labelCol, predictionCol=predictionCol)
        return rmse.evaluate(valuesAndPredictions)
    
    
    def HitRate(topNPredicted, leftOutPredicted):
        hitRecommendations = topNPredicted.alias('a')\
                                          .join(leftOutPredicted.alias('b'), (F.col('a.userId') == F.col('b.userId')) &
                                                                             (F.col('a.movieId') == F.col('b.movieId')))\
                                          .select('a.userId', 'a.movieId', 'prediction', 'rating')
        
        return hitRecommendations.count()/leftOutPredicted.count()
    
    
    def CumulativeHitRate(topNPredicted, leftOutPredicted, ratingCutOff=0):
        hitRecommendations = topNPredicted.alias('a')\
                                          .join(leftOutPredicted.alias('b'), (F.col('a.userId') == F.col('b.userId')) &
                                                                             (F.col('a.movieId') == F.col('b.movieId')))\
                                          .select('a.userId', 'a.movieId', 'prediction', F.col('rating').cast('float'))
        
        cumulativeHitRecommendations = hitRecommendations.filter(F.col('rating') >= ratingCutOff)
        return cumulativeHitRecommendations.count()/leftOutPredicted.count()
    
    
    def RatingHitRate(topNPredicted, leftOutPredicted):
        hitRecommendations = topNPredicted.alias('a')\
                                          .join(leftOutPredicted.alias('b'), (F.col('a.userId') == F.col('b.userId')) &
                                                                             (F.col('a.movieId') == F.col('b.movieId')))\
                                          .select('a.userId', 'a.movieId', 'prediction', 'rating')
        
        hitsGroupedByRating = hitRecommendations.groupBy(['rating']).agg(F.count(F.lit(1)).alias("total_hits"))
        testsGroupedByRating = test.groupBy('rating').agg(F.count(F.lit(1)).alias("total_recs"))
        
        histPerRating = hitsGroupedByRating.alias('a')\
                                .join(testsGroupedByRating.alias('b'), F.col('a.rating') == F.col('b.rating'))\
                                .select('a.rating', 'total_recs', 'total_hits')\
        
        ratingHitRate = histPerRating.withColumn('hit_rate', F.col('total_hits') / F.col('total_recs'))\
                                     .select('rating', 'hit_rate')
                                 
        return ratingHitRate
    
    
    def AverageReciprocalHitrank(topNPredicted, leftOutPredicted):
        return 0
    
    
    def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
        return 0
    
    
    def Diversity(topNPredicted, simsAlgo):
        return 0
    
    
    def Novelty(topNPredicted, rankings):
        return 0
        