# RPG Recommender with Spark

This is based on [Building a Movie Recommender](https://www.codementor.io/jadianes/building-a-recommender-with-apache-spark-python-example-app-part1-du1083qbw), but with data from [Drive Thru RPG](https://www.drivethrurpg.com).  The fact that this runs on my laptop shows me that the 17688 data points I have do NOT require Spark to make a recommender, and I may build one with more standard Python packages as well (scikit-learn, etc.).

In [1]:
import math
import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.mllib.recommendation import ALS


# Create (or reuse) the Spark session for this notebook.
# 'local[3]' runs Spark locally; use local[7] on the ec2 instance.
# The chain is parenthesised instead of using backslash continuations, so no
# stray trailing backslash can splice the statement into the following line.
spark = (
    ps.sql.SparkSession.builder
    .master('local[3]')
    .appName('rpg_rec')
    .getOrCreate()
)

In [2]:
# Read the pipe-delimited ratings file into a DataFrame.  The file is read
# without a header row and Spark is asked to infer the column types.
df = spark.read.csv(
    '../data/ratings.csv',
    sep='|',
    header=False,
    inferSchema=True,
)

In [3]:
# Work on the DataFrame's underlying RDD; the ALS calls below take RDDs.
data_rdd = df.rdd

# Train / validation / test split with weights 6:2:2.  A fixed seed keeps the
# split -- and every error figure computed from it -- reproducible between
# runs, matching the seeded 7:3 split used for the final model.
training_RDD, validation_RDD, test_RDD = data_rdd.randomSplit([6, 2, 2], seed=0)

# Keep only the first two fields of each held-out row (presumably the user and
# item ids -- the third field is treated as the rating further down); this is
# the shape handed to predictAll().
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

In [6]:
# Hyper-parameters for the ALS rank search.
# Plain int literals and range() are used so this cell runs unchanged on both
# Python 2 and Python 3 (the `5L` literal and xrange() are Python 2 only).
seed = 5
iterations = 10
regularization_parameter = 0.1  # this likely needs to change...
ranks = list(range(22, 27))     # candidate ranks 22..26 inclusive
errors = [0] * len(ranks)       # one error slot per candidate rank
err = 0                         # running counter advanced by the rank-search loop
tolerance = 0.02

# keep track of places where this failed
rank_fails = []

In [7]:
# Rank search: train one ALS model per candidate rank, score it by RMSE on the
# validation set, and remember the rank with the lowest error.
import sys

min_error = float('inf')
best_rank = -1        # stays -1 if every candidate rank fails
best_iteration = -1
for idx, rank in enumerate(ranks):
    # `last` names the stage being attempted, so a failure can be attributed
    # to that stage in rank_fails.
    last = ''
    try:
        last = 'model'
        model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations,
                          lambda_=regularization_parameter)
        last = 'predicions'
        # Re-key each prediction as ((field0, field1), predicted) for the join.
        predictions = model.predictAll(validation_for_predict_RDD).map(
            lambda r: ((r[0], r[1]), r[2]))
        last = 'rates & predicions'
        # Join actual ratings with predictions on the same two-field key,
        # giving ((field0, field1), (actual, predicted)).
        rates_and_preds = validation_RDD.map(
            lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        last = 'errors'
        # Root-mean-square error between actual and predicted ratings.
        error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        # Store at this rank's own position so errors[i] always belongs to
        # ranks[i], even when an earlier rank failed and left its slot at 0.
        errors[idx] = error
        err += 1  # number of ranks scored successfully so far
        if error < min_error:
            min_error = error
            best_rank = rank
    except Exception:
        # Best effort: note which rank/stage failed and move on.  Catching
        # Exception (not a bare except) lets Ctrl-C still interrupt the search.
        rank_fails.append((rank, last))
    # Overwrite the same console line with the share of ranks completed.
    sys.stdout.write('Progress: {:.1f}% \r'.format(100.0 * (idx + 1) / len(ranks)))
    sys.stdout.flush()
print('The best model was trained with rank %s' % best_rank)

The best model was trained with rank 22


In [9]:
# Peek at nine joined validation records, each shaped
# ((field0, field1), (actual rating, predicted rating)).
# NOTE: rates_and_preds is reassigned on every pass of the rank loop, so this
# shows the last rank that reached the join, not necessarily best_rank.
rates_and_preds.take(9)

[((137013, 25), (1.0, 0.3238272816323266)),
 ((380085, 29), (1.0, 0.4795896499310039)),
 ((112933, 113), (1.0, 0.6330697347432238)),
 ((29941, 61), (8.0, 1.2302929591526754)),
 ((28129, 105), (2.0, 0.764147055699367)),
 ((158594, 24), (5.0, 0.6748492360319994)),
 ((127362, 88), (1.0, 0.992100461143115)),
 ((181390, 176), (1.0, 0.6513531633433379)),
 ((265518, 0), (1.0, 0.37992156807926386))]

In [10]:
# Re-split the full data set with weights 7:3 (fixed seed, so reproducible)
# and fit the final model using the rank picked by the validation search.
# NOTE(review): this split is drawn afresh from data_rdd, so test_RDD can
# contain rows that were in the training/validation sets used to choose
# best_rank -- the test RMSE may be optimistic; confirm this is intended.
training_RDD, test_RDD = data_rdd.randomSplit([7, 3], seed=0)

complete_model = ALS.train(
    training_RDD,
    best_rank,
    seed=seed,
    iterations=iterations,
    lambda_=regularization_parameter,
)

In [11]:
# Score the final model on the held-out test split.
# Keep only the first two fields of each test row for predictAll().
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

# Key predictions and actual ratings by the same two-field tuple, join them
# into ((field0, field1), (actual, predicted)), and take the RMSE.
predictions = complete_model.predictAll(test_for_predict_RDD).map(
    lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(
    lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

# Parenthesised single-argument print gives the same output on Python 2 and 3.
print('For testing data the RMSE is %s' % error)

For testing data the RMSE is 2.22184651462


In [12]:
# Peek at nine joined test-set records, each shaped
# ((field0, field1), (actual rating, predicted rating)).
rates_and_preds.take(9)

[((9194, 8), (2.0, 0.6468061084298773)),
 ((99916, 162), (1.0, 0.5061060298303088)),
 ((407699, 59), (1.0, 0.42239765038942473)),
 ((112933, 113), (1.0, 2.230568132274757)),
 ((314351, 95), (1.0, 1.179425728776017)),
 ((19948, 22), (1.0, 0.2815130736568328)),
 ((464079, 59), (1.0, 1.0729447496908076)),
 ((20849, 81), (1.0, 0.5638028917598142)),
 ((718572, 2), (1.0, 0.9542448014437668))]