## Rating Hit Rate

In [1]:
import random
import findspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import rank, col, udf
from pyspark.ml.evaluation import RegressionEvaluator

findspark.init()
spark = SparkSession.builder.appName('TestRecommender')\
                    .config('spark.executors.cores', 4)\
                    .getOrCreate()
ratings = spark.read.option('header', 'true').option("inferSchema", "true").csv('ml-latest-small/ratings.csv')

In [2]:
hitCount = 0

def createSeed(x):
    return random.randint(1,21)*x

randomSeed=udf(lambda x: createSeed(x), IntegerType())

window = Window.partitionBy(ratings['userId']).orderBy(randomSeed('timestamp'))
onlyBestRatings = ratings.where('rating > 3.5')

leftOutTestDataset = onlyBestRatings.select('*', F.rank().over(window).alias('rank'))\
                                    .where((F.col('rank') <=1) )\
                                    .orderBy('userId').drop('rank').cache()

trainingDataset = ratings.subtract(leftOutTestDataset).cache()

als = ALS(maxIter=20, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating', coldStartStrategy='drop')
model = als.fit(trainingDataset)

leftOutPredictions = model.transform(leftOutTestDataset)

userRecs = model.recommendForAllUsers(30)
topNRecommendations = userRecs.select('userId', F.posexplode('recommendations'))\
                              .select('userId',
                                       F.col('col.movieId').alias('movieId'),
                                       F.col('col.rating').alias('prediction'),
                                       (F.col('pos') + 1).alias('rank'))

hitRecommendations = topNRecommendations.alias('a')\
                                        .join(leftOutPredictions.alias('b'),\
                                              (F.col('a.userId') == F.col('b.userId')) &
                                              (F.col('a.movieId') == F.col('b.movieId')))\
                                        .select('a.userId', 'a.movieId', 'b.prediction', 'rating', 'rank')


In [3]:
testsGroupedByRating = leftOutPredictions.groupBy('rating').agg(F.count(F.lit(1)).alias("total_left_out"))
testsGroupedByRating.show()

+------+--------------+
|rating|total_left_out|
+------+--------------+
|   4.5|            82|
|   4.0|           381|
|   5.0|           219|
+------+--------------+



In [4]:
hitsGroupedByRating = hitRecommendations.groupBy('rating').agg(F.count(F.lit(1)).alias("total_hits"))
hitsGroupedByRating.show()

+------+----------+
|rating|total_hits|
+------+----------+
|   4.5|         2|
|   4.0|         1|
|   5.0|         2|
+------+----------+



In [17]:
ratingHitRate = testsGroupedByRating.alias('a')\
                                    .join(hitsGroupedByRating.alias('b'), (F.col('a.rating') == F.col('b.rating')))\
                                    .select('a.rating',\
                                            'b.total_hits',\
                                            'a.total_left_out',\
                                            (F.col('b.total_hits')/F.col('a.total_left_out')).alias('rating_hit_rate'))
                                          
ratingHitRate.show()

+------+----------+--------------+--------------------+
|rating|total_hits|total_left_out|     rating_hit_rate|
+------+----------+--------------+--------------------+
|   4.5|         2|            82|0.024390243902439025|
|   4.0|         1|           381|0.002624671916010...|
|   5.0|         2|           219|  0.0091324200913242|
+------+----------+--------------+--------------------+

