## Hit Rate

In [6]:
import random
import findspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import rank, col, udf
from pyspark.ml.evaluation import RegressionEvaluator

findspark.init()
spark = SparkSession.builder.appName('TestRecommender')\
                    .config('spark.executors.cores', 4)\
                    .getOrCreate()
ratings = spark.read.option('header', 'true').option("inferSchema", "true").csv('ml-latest-small/ratings.csv')

In [39]:
hitCount = 0

def createSeed(x):
    return random.randint(1,21)*x

randomSeed=udf(lambda x: createSeed(x), IntegerType())

for x in range(1, 20):
    window = Window.partitionBy(ratings['userId']).orderBy(randomSeed('timestamp'))
    
    onlyBestRatings = ratings.where('rating > 4.5')
    
    leftOutTestDataset = onlyBestRatings.select('*', F.rank().over(window).alias('rank'))\
                                        .where((F.col('rank') <=1) )\
                                        .orderBy('userId').drop('rank').cache()

    trainingDataset = ratings.subtract(leftOutTestDataset).cache()

    als = ALS(maxIter=20, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating', coldStartStrategy='drop')
    model = als.fit(trainingDataset)
    userRecs = model.recommendForAllUsers(30)
    topNRecommendations = userRecs.select('userId', F.posexplode('recommendations'))\
                                  .select('userId',
                                          F.col('col.movieId').alias('movieId'),
                                          F.col('col.rating').alias('prediction'),
                                          (F.col('pos') + 1).alias('rank'))

    hitRecommendations = topNRecommendations.alias('a')\
                                            .join(leftOutTestDataset.alias('b'),
                                                  (F.col('a.userId') == F.col('b.userId')) &
                                                  (F.col('a.movieId') == F.col('b.movieId')))\
                                            .select('a.userId', 'a.movieId', 'prediction', 'rating', 'rank')

    count = hitRecommendations.count()
    hitCount += count
    print("**** Hit Count for iteration {} is {}, Now Total Hits are {}".format(str(x), str(count),str(hitCount)))
    
    hitRecommendations.unpersist()
    topNRecommendations.unpersist()
    userRecs.unpersist()
    trainingDataset.unpersist()
    leftOutTestDataset.unpersist()
    


**** Hit Count for iteration 1 is 2, Now Total Hits are 2
**** Hit Count for iteration 2 is 3, Now Total Hits are 5
**** Hit Count for iteration 3 is 3, Now Total Hits are 8
**** Hit Count for iteration 4 is 7, Now Total Hits are 15
**** Hit Count for iteration 5 is 4, Now Total Hits are 19
**** Hit Count for iteration 6 is 4, Now Total Hits are 23
**** Hit Count for iteration 7 is 5, Now Total Hits are 28
**** Hit Count for iteration 8 is 8, Now Total Hits are 36
**** Hit Count for iteration 9 is 2, Now Total Hits are 38
**** Hit Count for iteration 10 is 2, Now Total Hits are 40
**** Hit Count for iteration 11 is 5, Now Total Hits are 45
**** Hit Count for iteration 12 is 3, Now Total Hits are 48
**** Hit Count for iteration 13 is 6, Now Total Hits are 54
**** Hit Count for iteration 14 is 8, Now Total Hits are 62
**** Hit Count for iteration 15 is 3, Now Total Hits are 65
**** Hit Count for iteration 16 is 3, Now Total Hits are 68
**** Hit Count for iteration 17 is 2, Now Total Hits

In [44]:
hitRate18 = 10/671
print('Hit Rate For Iteration 19: ' + str(hitRate18))

Hit Rate For Iteration 19: 0.014903129657228018


#### Example Result: