In [1]:
from pyspark.mllib.recommendation import ALS
from pyspark.mllib.recommendation import Rating

In [2]:
# Read in the ratings file; each line is "fromUserId,toUserId,rating".
# The sample output of this cell shows ratings as high as 10, so the scale is not 0-9.
RATINGS_PATH = "s3n://insight-spark-after-dark/ratings.csv.gz"
rawRatingsRDD = sc.textFile(RATINGS_PATH)
rawRatingsRDD.take(10)

[u'1,133,8',
 u'1,720,6',
 u'1,971,10',
 u'1,1095,7',
 u'1,1616,10',
 u'1,1978,7',
 u'1,2145,8',
 u'1,2211,8',
 u'1,3751,7',
 u'1,4062,3']

In [None]:
# Total number of raw rating lines read from the input file.
rawRatingsRDD.count()

In [3]:
# Create mllib.recommendation.Rating RDD from raw ratings input data
def rec_tup(row):
    """Parse one "fromUserId,toUserId,rating" CSV line into a Rating.

    The first three comma-separated fields are each converted with int();
    any further fields on the line are ignored.
    """
    fields = row.split(",")
    from_user = int(fields[0])
    to_user = int(fields[1])
    score = int(fields[2])
    return Rating(from_user, to_user, score)

# Parse every raw CSV line into a Rating via rec_tup.
ratingsRDD = rawRatingsRDD.map(rec_tup)

In [4]:
# Separate ratings data into training data (80%) and test data (20%).
# The seed is fixed so that a Restart & Run All on the same input produces the
# same split; without it every run evaluates the model on different rows.
splitRatingsRDD = ratingsRDD.randomSplit([0.8, 0.2], seed=42)
trainingRatingsRDD, knownTestRatingsRDD = splitRatingsRDD

In [5]:
# Train the ALS model using the training data and various model hyperparameters.
# Keyword arguments spell out which hyperparameter each value sets, and the
# fixed seed makes the learned factors repeatable between runs.
model = ALS.train(
    trainingRatingsRDD,
    rank=1,
    iterations=5,
    lambda_=0.01,
    blocks=10,
    seed=42,
)

In [6]:
# Extract the (user, product) pairs of the known test ratings for prediction
def rec_tup(rating_row):
    """Return the (user, product) pair of a rating row, dropping the rating."""
    from_user = rating_row.user
    to_user = rating_row.product
    return from_user, to_user

# Reduce each known test rating to its (user, product) pair.
# Index-based variant kept for reference: lambda r: (r[0], r[1])
knownTestFromToRDD = knownTestRatingsRDD.map(rec_tup)

In [7]:
# Test the model by predicting a rating for the (user, product) pairs of the known test data
actualPredictionsRDD = model.predictAll(knownTestFromToRDD)

# Peek at a few predictions; the sample output shows Rating rows with a float rating.
actualPredictionsRDD.take(10)

[Rating(user=116685, product=193370, rating=5.528812041031813),
 Rating(user=54883, product=108150, rating=8.683123112274984),
 Rating(user=33416, product=108150, rating=6.631991761983102),
 Rating(user=62422, product=108150, rating=8.583980144995792),
 Rating(user=55650, product=28730, rating=7.393950083170125),
 Rating(user=63293, product=28730, rating=7.613931757845535),
 Rating(user=127227, product=28730, rating=7.507604533863969),
 Rating(user=41675, product=28730, rating=6.404783041859787),
 Rating(user=99848, product=28730, rating=8.706459374215797),
 Rating(user=128881, product=18500, rating=9.486194904775857)]

In [11]:
# Prepare the known test predictions and actual predictions for comparison
def rec_tup(rating_row):
    """Re-shape a (user, product, rating) row into ((user, product), rating).

    The row is accessed by position, so any indexable 3-element row works.
    """
    from_to_key = (rating_row[0], rating_row[1])
    return (from_to_key, rating_row[2])

# Key both RDDs by (user, product) so they can be joined on that pair.
# Note: despite its name, testPredictionsKeyedByFromToRDD holds the known test
# ratings (it is built from knownTestRatingsRDD), not model output.
actualPredictionsKeyedByFromToRDD = actualPredictionsRDD.map(rec_tup)
testPredictionsKeyedByFromToRDD = knownTestRatingsRDD.map(rec_tup)

In [12]:
# Join the known test ratings with the model's predictions on the (user, product) key.
# Each joined value is a (known rating, predicted rating) pair, as the sample output shows.
testRatingsAndActualPredictionsJoinedRDD = testPredictionsKeyedByFromToRDD.join(actualPredictionsKeyedByFromToRDD)
testRatingsAndActualPredictionsJoinedRDD.take(10)

[((81423, 5113), (2, 4.3594495368705)),
 ((89913, 83773), (6, 6.930871660005323)),
 ((52884, 22327), (7, 6.965454999332394)),
 ((63502, 190866), (10, 9.295794187756655)),
 ((85983, 168324), (7, 4.430503718902514)),
 ((108613, 210193), (1, 2.274421179760509)),
 ((95095, 10941), (5, 2.0409082294914214)),
 ((29058, 71636), (10, 7.065374332092119)),
 ((52502, 118322), (10, 8.511748018306434)),
 ((15180, 70961), (3, 3.2493501618394305))]

In [None]:
# Number of known test ratings keyed by (user, product).
# NOTE(review): presumably meant to be compared with the size of the joined RDD
# to see how many test pairs actually received a prediction — confirm.
testPredictionsKeyedByFromToRDD.count()

In [13]:
# Evaluate the model using Mean Absolute Error (MAE) between the known test ratings and the actual predictions 
def rec_tup(record):
    """Return the absolute error for one joined record.

    `record` is (key, (known_rating, predicted_rating)); the key is ignored.
    """
    known_rating = record[1][0]
    predicted_rating = record[1][1]
    return abs(known_rating - predicted_rating)

# Average the per-record absolute errors over all joined (known, predicted) pairs.
meanAbsoluteError = testRatingsAndActualPredictionsJoinedRDD.map(rec_tup).mean()

# Function-call form prints the same text on Python 2 and also parses on Python 3.
print(meanAbsoluteError)

2.7117627896
