## Cumulative Hit Rate

In [None]:
import random
import findspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import rank, col, udf
from pyspark.ml.evaluation import RegressionEvaluator

# Make the local Spark installation discoverable, then start (or reuse) the
# session shared by every cell below.
findspark.init()
spark = (
    SparkSession.builder
    .appName('TestRecommender')
    # The per-executor core count is configured through `spark.executor.cores`;
    # the previously used `spark.executors.cores` is not a Spark property.
    .config('spark.executor.cores', 4)
    .getOrCreate()
)

# Load the ratings CSV with a header row and inferred column types. Later
# cells read its userId, movieId, rating and timestamp columns.
ratings = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('ml-latest-small/ratings.csv')
)

In [None]:
# Running total of hits, accumulated across every evaluation iteration.
hitCount = 0
# Predicted-rating threshold used when filtering hits and reported in the
# per-iteration message.
cutOff = 2.5

def createSeed(x):
    """Return ``x`` scaled by a random integer drawn from 1 to 21 inclusive."""
    multiplier = random.randint(1, 21)
    return multiplier * x

# Spark UDF wrapper around createSeed, declared to return a 32-bit IntegerType.
# NOTE(review): createSeed multiplies its input by up to 21, so large inputs
# (e.g. epoch-second timestamps) can exceed the IntegerType range - confirm the
# declared return type is wide enough for the columns this is applied to.
randomSeed = udf(f=lambda value: createSeed(value), returnType=IntegerType())

# Leave-one-out evaluation. Each iteration holds out one randomly chosen
# top-rated movie per user, trains ALS on the remaining ratings, and counts how
# many held-out movies appear in that user's top-30 recommendations with a
# predicted rating above `cutOff`. The per-iteration counts are added to the
# module-level `hitCount`.

# The pool of hold-out candidates does not depend on the iteration, so it is
# built once instead of on every pass.
onlyBestRatings = ratings.where('rating > 4.5')

for x in range(1, 20):
    # Shuffle each user's top-rated movies with Spark's native random column
    # rather than a Python UDF; seeding with the iteration number gives every
    # iteration its own draw.
    window = Window.partitionBy('userId').orderBy(F.rand(seed=x))

    # row_number() never produces ties (rank() can), so exactly one row per
    # user is held out. The result is cached because it is reused by both
    # the subtract and the transform below.
    leftOutTestDataset = (
        onlyBestRatings
        .select('*', F.row_number().over(window).alias('rank'))
        .where(F.col('rank') == 1)
        .drop('rank')
        .cache()
    )

    # Everything except the held-out rows is used for training.
    trainingDataset = ratings.subtract(leftOutTestDataset).cache()

    als = ALS(maxIter=20, regParam=0.01, userCol='userId', itemCol='movieId',
              ratingCol='rating', coldStartStrategy='drop')
    model = als.fit(trainingDataset)

    # Predicted ratings for the held-out (user, movie) pairs.
    leftOutPredictions = model.transform(leftOutTestDataset)

    # Flatten the per-user recommendation arrays into one row per
    # recommended movie, keeping the 1-based position as `rank`.
    userRecs = model.recommendForAllUsers(30)
    topNRecommendations = (
        userRecs
        .select('userId', F.posexplode('recommendations'))
        .select('userId',
                F.col('col.movieId').alias('movieId'),
                F.col('col.rating').alias('prediction'),
                (F.col('pos') + 1).alias('rank'))
    )

    # A hit is a held-out movie that also appears in the user's top-N list.
    hitRecommendations = (
        topNRecommendations.alias('a')
        .join(leftOutPredictions.alias('b'),
              (F.col('a.userId') == F.col('b.userId')) &
              (F.col('a.movieId') == F.col('b.movieId')))
        .select('a.userId', 'a.movieId', 'b.prediction', 'rating', 'rank')
    )

    hitRecommendationsWithCutOff = hitRecommendations.where(F.col('prediction') > cutOff)

    # Count the cut-off-filtered hits: the message below reports hits above
    # `cutOff`, so the unfiltered frame must not be the one that is counted.
    count = hitRecommendationsWithCutOff.count()
    hitCount += count
    print("**** Hit count for matches higher than rating prediction {} in iteration {} is {}, Now Total Hits are {}".format(cutOff, x, count, hitCount))

    # Only these two DataFrames were cached, so only they need releasing.
    trainingDataset.unpersist()
    leftOutTestDataset.unpersist()
    

In [None]:
# Hard-coded figures; presumably 9 hits over 671 users taken from an earlier
# run of the evaluation loop - TODO confirm against actual output.
cumulativeHitRate17 = 9 / 671
print(f'Cumulative Hit Rate For Iteration 17: {cumulativeHitRate17}')

#### Example Response