In [1]:
import math
import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.mllib.recommendation import ALS


# create spark session
# use local[7] on ec2 instance
spark = ps.sql.SparkSession.builder \
          .master('local[4]')  \
          .appName('rpg_rec') \
          .getOrCreate() \

In [2]:
# load in the ratings data
df = spark.read.csv('../data/ratings.csv',
                       header=False,
                       sep='|',
                       inferSchema=True)

In [3]:
# convert to rdd...
data_rdd = df.rdd

# tarin/test sets...
training_RDD, validation_RDD, test_RDD = data_rdd.randomSplit([6, 2, 2])
# training_RDD, test_RDD = data_rdd.randomSplit([7, 3])
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

In [4]:
# run a recommender...
# Parameters
seed = 5L
iterations = 10
regularization_parameter = 0.1
ranks = [a for a in xrange(4,30,2)]
errors = [0 for a in ranks]
err = 0
tolerance = 0.02
rank = 1


In [5]:
min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    last = ''
    try:
        last = 'model'
        model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations, lambda_=regularization_parameter)
        last = 'predicions'
        predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
        last = 'rates & predicions'
        rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        last = 'errors'
        error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        errors[err] = error
        err += 1
        print 'For rank %s the RMSE is %s' % (rank, error)
        if error < min_error:
            min_error = error
            best_rank = rank
    except:
        print 'error at {} with rank {}'.format(last, rank)

print 'The best model was trained with rank %s' % best_rank

rank 4 model 
 rank 4 predictions 
 rank 4 errors 
For rank 4 the RMSE is 2.43119483241
rank 6 model 
 rank 6 predictions 
 rank 6 errors 
For rank 6 the RMSE is 2.29656219383
rank 8 model 
 rank 8 predictions 
 rank 8 errors 
For rank 8 the RMSE is 2.28721156374
rank 10 model 
 rank 10 predictions 
 rank 10 errors 
For rank 10 the RMSE is 2.26013621162
rank 12 model 
 rank 12 predictions 
 rank 12 errors 
For rank 12 the RMSE is 2.27579829702
rank 14 model 
 rank 14 predictions 
 rank 14 errors 
For rank 14 the RMSE is 2.27216984801
rank 16 model 
 rank 16 predictions 
 rank 16 errors 
For rank 16 the RMSE is 2.27838878344
rank 18 model 
 rank 18 predictions 
 rank 18 errors 
For rank 18 the RMSE is 2.27564147915
rank 20 model 
 rank 20 predictions 
 rank 20 errors 
For rank 20 the RMSE is 2.25561098541
rank 22 model 
 rank 22 predictions 
 rank 22 errors 
For rank 22 the RMSE is 2.26517728079
rank 24 model 
 rank 24 predictions 
 rank 24 errors 
For rank 24 the RMSE is 2.26543536842


In [6]:
rates_and_preds.take(3)

[((327227, 0), (3.0, 0.6254855446343139)),
 ((148605, 113), (1.0, 0.6525971143271057)),
 ((428735, 72), (1.0, 0.6238654492917837))]

In [7]:
model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
print 'For testing data the RMSE is %s' % (error)

For testing data the RMSE is 2.10823416076


In [8]:
print "There are %s recommendations in the complete dataset" % (data_rdd.count())

There are 17688 recommendations in the complete dataset


In [9]:
rates_and_preds.take(3)

[((3390, 105), (1.0, 0.4579704776990512)),
 ((153821, 83), (1.0, 1.1353818513016556)),
 ((84527, 136), (1.0, 0.4902495764389966))]