# **Recommendation Engine**

localhost performance [link](http://localhost:4040/jobs/)

In [7]:
import os
import math
import numpy as np

from operator import add

from pyspark.mllib.recommendation import ALS
from pyspark.mllib.recommendation import Rating

from pyspark import SparkContext, SparkConf

from pyspark.sql import SQLContext

import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def cosineSim(a,b):
    return 2.0*(1.0-np.dot(a,b.T)/np.linalg.norm(a)/np.linalg.norm(b))


class RecommendationEngine(object):
    """A collaborative filtering recommender engine
       http://spark.apache.org/docs/latest/mllib-collaborative-filtering.html
       http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.recommendation.ALS
    """
    def __init__(self, ratings_file, movies_file, rank, seed, iterations, regularization_parameter):
        """Init the recommendation engine given a Spark context and a dataset path
        """
        self.sc = sc#SparkContext(appName="Engine2")

        ''' Load ratings data for later use '''
        rawRatings = self.sc.textFile(ratings_file,2)
        #rawRatings_header = rawRatings.take(1)[0]  ## no need if there are no headers.
        #self.parsedRatings = rawRatings.filter(lambda x: x!=rawRatings_header)\
        
        self.ratings_RDD =rawRatings.map(lambda line: line.split("::")).map(lambda x: Rating(int(x[0]),int(x[1]),float(x[2]))).cache()
        #self.ratings_RDD =rawRatings.map(lambda line: line.split("\t")).map(lambda x: Rating(int(x[0]),int(x[1]),float(x[2]))).cache()
            
         
        ''' Load movies data for later use '''
        rawMovies = self.sc.textFile(movies_file,2)
        #rawMovies_header = rawMovies.take(1)[0]
        #self.paredMovies_RDD = rawMovies.filter(lambda x: x!=rawMovies_header)\
        
        self.movies_RDD = rawMovies.map(lambda line: line.split("::")).map(lambda x: (int(x[0]),x[1])).cache()
        #self.movies_RDD = rawMovies.map(lambda line: line.split("|")).map(lambda x: (int(x[0]),x[1])).cache()
        print self.ratings_RDD.take(2)
        print self.movies_RDD.take(2)
             
        self.count_and_average_ratings()
        
        '''  separate the data to training, validation and test sets'''
        weights = [.6, .2, .2]
        seed = 42
        # Use randomSplit with weights and seed
        self.trainData, self.valData, self.testData = self.ratings_RDD.randomSplit(weights, seed)
        
 
        ''' Train the model and default parameters'''
        self.rank = rank
        self.seed = seed
        self.iterations = iterations
        self.regularization_parameter = regularization_parameter
        
        self.train_model() 
        print 'model is done'


    def count_and_average_ratings(self):
        """Updates the movies ratings counts from the current data self.ratings_RDD"""
        movie_ID_counts = dict(self.ratings_RDD.map(lambda x: (x[1], 1)).reduceByKey(add).collect())
        """note: self.ratings_RDD doesn't have complete movie list since not all movies are rated."""
        for movie, title in self.movies_RDD.collect():
                if movie not in movie_ID_counts: movie_ID_counts[movie] = 0

        self.movie_ID_avgRating_RDD = self.ratings_RDD.map(lambda x: (x[1], x[2]/movie_ID_counts[x[1]])).\
                    reduceByKey(add)
        
        self.movie_ID_counts = movie_ID_counts
 

    def train_model(self):
        """Train the ALS model with the current dataset
           http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.recommendation.ALS
        """
        print self.rank, self.seed, self.iterations, self.regularization_parameter
        #self.model = ALS.train(self.trainData, rank = self.rank, seed = self.seed,
        #                       iterations=self.iterations, lambda_= self.regularization_parameter)
        self.model_grid_search()
        
        
        
    def model_grid_search(self):
        '''the grid search on parameters for ALS model'''
        ranks = [4, 8, 12]
        errors = []
        min_error = float('inf')
        best_rank = -1
        best_reg = -1.0
        best_iteration = -1
        best_model = None
        '''here we want to do grid search'''
        X_val = self.valData.map(lambda x: (x[0], x[1]))
        for rank in ranks:
            for reg in [1e-3, 0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 1.0]:
                model = ALS.train(self.trainData, rank, seed=self.seed, iterations=self.iterations, lambda_=reg)
                predictions = model.predictAll(X_val).map(lambda r: ((r[0], r[1]), r[2]))
                rates_and_preds = self.valData.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
                error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
                errors.append(error)
                print 'For rank %s, reg %s, the RMSE is %s' % (rank, reg, error)
                if error < min_error:
                    min_error = error
                    best_model = model
                    best_rank = rank
                    best_reg = reg
        '''the grid search is done'''
        print 'The best model was trained with rank %s and reg %s' % (best_rank, best_reg)
        print 'min error is ', min_error
        self.rank = best_rank
        self.regularization_parameter = best_reg
        self.model = best_model
        
    
    def add_ratings(self, ratings):
        """Add additional movie ratings in the format (user_id, movie_id, rating)
        """
        newUser_ratings_RDD = self.sc.parallelize(ratings)               # Convert ratings to an RDD
        #self.ratings_RDD = self.ratings_RDD.union(newUser_ratings_RDD)
        self.trainData = self.trainData.union(newUser_ratings_RDD)       # Add new ratings to the existing ones
        # Re-compute movie ratings count
        self.count_and_average_ratings()
        # Re-train the ALS model with the new ratings
        self.train_model()
        print 'new model training is done'
        return newUser_ratings_RDD

    
    def ratings_accuracy(self, test_ratings_RDD):
        """Gets predictions for a given (userID, movieID) formatted RDD
        this is for checking the prediction accuracy
        """        
        #print test_ratings_RDD.take(2), 'ok'
        predicted_ratings_RDD = self.get_predict_ratings(test_ratings_RDD)
        ratings_and_preds = test_ratings_RDD.map(lambda x: ((int(x[0]), int(x[1])), float(x[2]))).\
                join(predicted_ratings_RDD)
        error = math.sqrt(ratings_and_preds.map(lambda x: (x[1][0] - x[1][1])**2).mean())
        print 'the RMSE for the model is %s' %  error    

    
    def get_predict_ratings(self, user_movie_RDD):
        """Gets predictions for a given (userID, movieID) formatted RDD
        this is for checking the prediction accuracy
        """
        newMovie_rating = user_movie_RDD.map(lambda x: (x[0], x[1]))
        predicted_rating_RDD = self.model.predictAll(newMovie_rating).map(lambda x: ((x[0], x[1]), x[2]))
        return predicted_rating_RDD


    def recommend_top_movies(self, user_movies_RDD):
        """Recommends up to movies_count top UNrated movies to user_id
        """
        print user_movies_RDD.take(10)
        # Get pairs of (userID, movieID) for user_id unrated movies
        
        the_user = user_movies_RDD.map(lambda x: x[0]).take(1)[0]
        print the_user
        
        user_rated_movies_ids = user_movies_RDD.map(lambda x: x[1]).collect() # get rated movie IDs, a list
        user_unrated_movies_RDD = self.movies_RDD.filter(lambda x: x[0] not in user_rated_movies_ids).\
                    map(lambda x: (the_user, x[0]))
        
        #print user_unrated_movies_RDD.take(10)
        # Get predicted ratings
        print self.get_predict_ratings(user_unrated_movies_RDD).take(5)
        
        """note in the lambda function module is not applicable, i.e. (lambda x: self..()) doesn' work!"""
        movie_ID_counts = self.movie_ID_counts
        
        ratings = self.get_predict_ratings(user_unrated_movies_RDD).\
              filter(lambda x: movie_ID_counts[x[0][1]] > 200).sortBy(lambda x: -x[1])
        movie_titles = dict(self.movies_RDD.collect())
        #print movie_titles
        
        sortedRatings_recommd = ratings.map(lambda x: (x[0][1], movie_titles[x[0][1]], movie_ID_counts[x[0][1]], x[1]))
        
        sqlContext = SQLContext(self.sc)
        schema = sqlContext.createDataFrame(sortedRatings_recommd)
        schema.registerTempTable("recommend_table")
        pd = sqlContext.sql("SELECT * from recommend_table limit 10")
        pd.show()

## **1M rating data**

### **Training a model**

In [8]:
rank = 8
seed = 5L
iterations = 6
regularization_parameter = 0.06
ratings_file = 'data/ratings_1m.csv'
movies_file = 'data/movies_1m.csv'
engine = RecommendationEngine(ratings_file, movies_file, rank, seed, iterations, regularization_parameter)

[Rating(user=1, product=1193, rating=5.0), Rating(user=1, product=661, rating=3.0)]
[(1, u'Toy Story (1995)'), (2, u'Jumanji (1995)')]
8 42 6 0.06
For rank 4, reg 0.001, the RMSE is 0.922987744342
For rank 4, reg 0.01, the RMSE is 0.895466287827
For rank 4, reg 0.03, the RMSE is 0.887010542566
For rank 4, reg 0.06, the RMSE is 0.883851963524
For rank 4, reg 0.1, the RMSE is 0.88669284454
For rank 4, reg 0.15, the RMSE is 0.89688534893
For rank 4, reg 0.2, the RMSE is 0.911706442458
For rank 4, reg 1.0, the RMSE is 1.35149487788
For rank 8, reg 0.001, the RMSE is 0.975113517888
For rank 8, reg 0.01, the RMSE is 0.916084211348
For rank 8, reg 0.03, the RMSE is 0.888832865595
For rank 8, reg 0.06, the RMSE is 0.877317619184
For rank 8, reg 0.1, the RMSE is 0.880366475884
For rank 8, reg 0.15, the RMSE is 0.896353898071
For rank 8, reg 0.2, the RMSE is 0.913529607823
For rank 8, reg 1.0, the RMSE is 1.35157030746
For rank 12, reg 0.001, the RMSE is 1.07823264395
For rank 12, reg 0.01, the 

In [9]:
engine.ratings_accuracy(engine.testData)

the RMSE for the model is 0.879790106735


In [114]:
'''specify a user:'''

userID = 550

user_and_movie = engine.ratings_RDD.filter(lambda x: x[0]==userID)
predictions = engine.get_predict_ratings(user_and_movie)
print predictions.take(3)

[((550, 1084), 3.7456910703564064), ((550, 3702), 3.1341887208267254), ((550, 1894), 2.8582829021343246)]


### **Compare predictions and variables**

In [127]:
print 'positive rating'
print user_and_movie.map(lambda x: ((int(x[0]), int(x[1])), float(x[2]))).join(predictions).\
                filter(lambda x: x[1][0] > 4).take(5)
print
print 'negative rating'
print user_and_movie.map(lambda x: ((int(x[0]), int(x[1])), float(x[2]))).join(predictions).\
                filter(lambda x: x[1][0] < 2).take(5)

positive rating
[((550, 1223), (5.0, 4.338440475035002)), ((550, 2987), (5.0, 3.7390153955532703)), ((550, 1603), (5.0, 3.137756037619561)), ((550, 2265), (5.0, 2.6132217180317077)), ((550, 3052), (5.0, 4.048984187669675))]

negative rating
[((550, 2448), (1.0, 2.0447964146147912)), ((550, 1014), (1.0, 2.783482625902344)), ((550, 2405), (1.0, 3.184010599410342)), ((550, 3608), (1.0, 3.1193326777993504)), ((550, 2364), (1.0, 2.6580056009088473))]


In [116]:
engine.model.userFeatures().take(2)

[(1,
  array('d', [-0.9535652995109558, -1.2470885515213013, 0.001925684162415564, 1.9395447969436646, -0.024253690615296364, -1.2255768775939941, 0.8556669354438782, -0.39938247203826904])),
 (2,
  array('d', [-0.540212094783783, -1.2368093729019165, 0.5662597417831421, 1.8081927299499512, 0.40515658259391785, -0.5440500974655151, 0.9798134565353394, -0.8582444190979004]))]

### **Predict unrated movies' ratings**

In [77]:
engine.recommend_top_movies(user_and_movie)

[Rating(user=550, product=2987, rating=5.0), Rating(user=550, product=1248, rating=3.0), Rating(user=550, product=715, rating=3.0), Rating(user=550, product=2988, rating=3.0), Rating(user=550, product=571, rating=3.0), Rating(user=550, product=3934, rating=4.0), Rating(user=550, product=2052, rating=2.0), Rating(user=550, product=1250, rating=3.0), Rating(user=550, product=2053, rating=2.0), Rating(user=550, product=3793, rating=4.0)]
550
[((550, 384), 1.9965907831703913), ((550, 3007), 3.5269112527425586), ((550, 667), 2.3247788835275194), ((550, 1053), 2.4286723949838613), ((550, 2493), 3.8307446433748735)]
_1   _2                   _3   _4                
1178 Paths of Glory (1... 230  4.48076210206332  
2762 Sixth Sense, The ... 2459 4.462026137726065 
3089 Bicycle Thief, Th... 252  4.438253502137965 
2692 Run Lola Run (Lol... 1072 4.421856850388411 
3949 Requiem for a Dre... 304  4.40822166253173  
1269 Arsenic and Old L... 672  4.391349556740996 
898  Philadelphia Stor... 582  4.

In [32]:
engine.model.recommendUsersForProducts(userID).take(4)

AttributeError: 'MatrixFactorizationModel' object has no attribute 'recommendUsersForProducts'

### **Item-item similarity**

In [117]:
#print type(engine.model.productFeatures()) 
#http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.recommendation.MatrixFactorizationModel.productFeatures

'''meaning rating 5'''
post1_ID = 1223
post2_ID = 3052

post1_movie = engine.model.productFeatures().filter(lambda x: x[0]==post1_ID).collect()
post2_movie = engine.model.productFeatures().filter(lambda x: x[0]==post2_ID).collect()

'''meaning rating =1'''
negt1_ID = 2448
negt2_ID = 2364

negt1_movie = engine.model.productFeatures().filter(lambda x: x[0]==negt1_ID).collect()
negt2_movie = engine.model.productFeatures().filter(lambda x: x[0]==negt2_ID).collect()

'''high predicted rating'''

pred1_ID = 1178
pred2_ID = 2762

pred1_movie = engine.model.productFeatures().filter(lambda x: x[0]==pred1_ID).collect()
pred2_movie = engine.model.productFeatures().filter(lambda x: x[0]==pred2_ID).collect()


post1_movie = np.array(post1_movie[0][1].tolist())
post2_movie = np.array(post2_movie[0][1].tolist())

negt1_movie = np.array(negt1_movie[0][1].tolist())
negt2_movie = np.array(negt2_movie[0][1].tolist())

pred1_movie = np.array(pred1_movie[0][1].tolist())
pred2_movie = np.array(pred2_movie[0][1].tolist())

print 'should be relatively small'
print cosineSim(post1_movie,post2_movie), cosineSim(negt1_movie,negt2_movie), cosineSim(pred1_movie,pred2_movie)
print
print 'should be relatively large'
print cosineSim(post1_movie,negt1_movie), cosineSim(post1_movie,negt2_movie), cosineSim(post2_movie,negt1_movie),\
                    cosineSim(post2_movie,negt2_movie)
print 
print 'should be relatively small for the first two'    
print cosineSim(post1_movie,pred1_movie), cosineSim(post1_movie,pred2_movie), cosineSim(negt1_movie,pred1_movie)

should be relatively small
0.383843817711 1.00215988188 0.426481568587

should be relatively large
1.2227077752 0.739782649754 1.1763356083 1.03340273845

should be relatively small for the first two
0.0591170404825 0.274441017481 1.51995750537


In [155]:
print engine.movie_ID_counts[51]

0


In [122]:
user_features = np.array(engine.model.userFeatures().filter(lambda x: x[0]==userID).collect()[0][1].tolist())
print user_features

[-0.57763553 -0.59553528  0.02866167  1.93420017  0.45201772 -1.25490928
 -0.01410904 -1.14327478]


#### **the user-500 should be similar to the moveies he/she rated with high socres:**

In [124]:
print cosineSim(post1_movie, user_features.T), cosineSim(post2_movie, user_features.T)

0.113211930747 0.246259350281


#### ** Whereas the user-500 should be far similar to the moveies he/she rated with low socres:**

In [125]:
print cosineSim(negt1_movie, user_features.T), cosineSim(negt2_movie, user_features.T)

0.972324926964 0.721913547653


##**100k data**

### **Training models**

In [2]:
rank = 8
seed = 5L
iterations = 6
regularization_parameter = 0.06
ratings_file = 'data/ratings_100k.csv'
movies_file = 'data/movies_100k.csv'
engine_100k = RecommendationEngine(ratings_file, movies_file, rank, seed, iterations, regularization_parameter)

[Rating(user=196, product=242, rating=3.0), Rating(user=186, product=302, rating=3.0)]
[(1, u'Toy Story (1995)'), (2, u'GoldenEye (1995)')]
8 42 6 0.06
For rank 4, reg 0.001, the RMSE is 1.11801578448
For rank 4, reg 0.01, the RMSE is 1.02726783982
For rank 4, reg 0.03, the RMSE is 0.988689096927
For rank 4, reg 0.06, the RMSE is 0.959936904874
For rank 4, reg 0.1, the RMSE is 0.947575079548
For rank 4, reg 0.15, the RMSE is 0.947851716712
For rank 4, reg 0.2, the RMSE is 0.956440227836
For rank 4, reg 1.0, the RMSE is 1.38091279695
For rank 8, reg 0.001, the RMSE is 1.3401866124
For rank 8, reg 0.01, the RMSE is 1.11944012977
For rank 8, reg 0.03, the RMSE is 1.03021556546
For rank 8, reg 0.06, the RMSE is 0.980759802157
For rank 8, reg 0.1, the RMSE is 0.955833042749
For rank 8, reg 0.15, the RMSE is 0.950613839413
For rank 8, reg 0.2, the RMSE is 0.95912901433
For rank 8, reg 1.0, the RMSE is 1.38091978011
For rank 12, reg 0.001, the RMSE is 1.52374275132
For rank 12, reg 0.01, the 

In [4]:
engine_100k.ratings_accuracy(engine_100k.testData)

the RMSE for the model is 0.94104599962


In [133]:
'''specify a user:'''

userID = 8

user_and_movie = engine_100k.ratings_RDD.filter(lambda x: x[0]==userID)
predictions = engine_100k.get_predict_ratings(user_and_movie)
print predictions.take(3)

[((8, 338), 2.9505647236305528), ((8, 79), 4.195204064854052), ((8, 259), 2.6662304335158393)]


### **Compare predictions and variables**

In [134]:
print 'positive rating'
print user_and_movie.map(lambda x: ((int(x[0]), int(x[1])), float(x[2]))).join(predictions).\
                filter(lambda x: x[1][0] > 4).take(5)
print
print 'negative rating'
print user_and_movie.map(lambda x: ((int(x[0]), int(x[1])), float(x[2]))).join(predictions).\
                filter(lambda x: x[1][0] < 2).take(5)

positive rating
[((8, 56), (5.0, 4.288234332092983)), ((8, 258), (5.0, 4.082631274802845)), ((8, 127), (5.0, 4.60160047218472)), ((8, 188), (5.0, 3.9144191981544147)), ((8, 176), (5.0, 4.1482046392356))]

negative rating
[((8, 385), (1.0, 3.562812001730816)), ((8, 688), (1.0, 1.2239246254269873)), ((8, 259), (1.0, 2.6662304335158393)), ((8, 457), (1.0, 1.397414128548851)), ((8, 687), (1.0, 1.5768167099718895))]


### **Predict unrated movies' ratings**

In [135]:
engine_100k.recommend_top_movies(user_and_movie)

[Rating(user=8, product=338, rating=4.0), Rating(user=8, product=550, rating=3.0), Rating(user=8, product=22, rating=5.0), Rating(user=8, product=50, rating=5.0), Rating(user=8, product=182, rating=5.0), Rating(user=8, product=79, rating=4.0), Rating(user=8, product=294, rating=3.0), Rating(user=8, product=457, rating=1.0), Rating(user=8, product=385, rating=1.0), Rating(user=8, product=89, rating=4.0)]
8
[((8, 1084), 4.402929256556682), ((8, 1410), 2.4213114219677316), ((8, 667), 2.5564203216125194), ((8, 1053), 2.8039730597771806), ((8, 466), 4.028442384637303)]
_1  _2                   _3  _4                
483 Casablanca (1942)    243 4.688850275964523 
64  Shawshank Redempt... 283 4.568973973076136 
12  Usual Suspects, T... 267 4.554684663622932 
318 Schindler's List ... 298 4.521245199772927 
515 Boot, Das (1981)     201 4.49900452109575  
357 One Flew Over the... 264 4.495014566434263 
603 Rear Window (1954)   209 4.481007287656523 
98  Silence of the La... 390 4.47739754766855

### **Item-item similarity**

In [137]:
'''meaning rating 5'''
post1_ID = 56
post2_ID = 127

post1_movie = engine_100k.model.productFeatures().filter(lambda x: x[0]==post1_ID).collect()
post2_movie = engine_100k.model.productFeatures().filter(lambda x: x[0]==post2_ID).collect()

'''meaning rating =1'''
negt1_ID = 688
negt2_ID = 457

negt1_movie = engine_100k.model.productFeatures().filter(lambda x: x[0]==negt1_ID).collect()
negt2_movie = engine_100k.model.productFeatures().filter(lambda x: x[0]==negt2_ID).collect()

'''high predicted rating'''

pred1_ID = 176   ## predicted post
pred2_ID = 687   ## predicted neg

pred1_movie = engine_100k.model.productFeatures().filter(lambda x: x[0]==pred1_ID).collect()
pred2_movie = engine_100k.model.productFeatures().filter(lambda x: x[0]==pred2_ID).collect()


post1_movie = np.array(post1_movie[0][1].tolist())
post2_movie = np.array(post2_movie[0][1].tolist())

negt1_movie = np.array(negt1_movie[0][1].tolist())
negt2_movie = np.array(negt2_movie[0][1].tolist())

pred1_movie = np.array(pred1_movie[0][1].tolist())
pred2_movie = np.array(pred2_movie[0][1].tolist())


print 'should be relatively small for the first one (neg-neg is not necessary close!)'
print cosineSim(post1_movie,post2_movie), cosineSim(negt1_movie,negt2_movie)
print
print 'should be relatively large'
print cosineSim(post1_movie,negt1_movie), cosineSim(post1_movie,negt2_movie), cosineSim(post2_movie,negt1_movie),\
                    cosineSim(post2_movie,negt2_movie)
print 
print 'should be relatively large'    
print cosineSim(pred1_movie,pred2_movie)


should be relatively small for the first one (neg-neg is not necessary close!)
0.0188153553801 0.662125921388

should be relatively large
1.10847133836 1.18978141822 0.992705525385 0.950500438739

should be relatively large
0.743705023122


In [140]:
user_features = np.array(engine_100k.model.userFeatures().filter(lambda x: x[0]==userID).collect()[0][1].tolist())
print user_features

[-1.23683262 -1.74224102 -0.03648547  0.66756839]


In [141]:
print cosineSim(post1_movie, user_features.T), cosineSim(post2_movie, user_features.T)

0.11664627668 0.0517930143915


In [142]:
print cosineSim(negt1_movie, user_features.T), cosineSim(negt2_movie, user_features.T)

1.11541520943 0.719516809585


##**Now assume we have a new user who has rated**

In [10]:
newUser_ratings = [
     Rating(0,260,4), # Star Wars (1977)
     Rating(0,1,3), # Toy Story (1995)
     Rating(0,16,3), # Casino (1995)
     Rating(0,25,4), # Leaving Las Vegas (1995)
     Rating(0,32,4), # Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
     Rating(0,335,1), # Underneath (1995)
     Rating(0,379,1), # Timecop (1994)
     Rating(0,296,3), # Pulp Fiction (1994)
     Rating(0,858,5) , # Godfather, The (1972)
     Rating(0,50,4) # Usual Suspects, The (1995)
    ]

In [11]:
newUser_ratings_RDD = engine.add_ratings(newUser_ratings)
## newUser_ratings_RDD is only new user's rating RDD.

8 42 6 0.06
For rank 4, reg 0.001, the RMSE is 0.904163396926
For rank 4, reg 0.01, the RMSE is 0.888543319507
For rank 4, reg 0.03, the RMSE is 0.882919134443
For rank 4, reg 0.06, the RMSE is 0.880583352363
For rank 4, reg 0.1, the RMSE is 0.883409649717
For rank 4, reg 0.15, the RMSE is 0.893538980472
For rank 4, reg 0.2, the RMSE is 0.908244980544
For rank 4, reg 1.0, the RMSE is 1.35152666631
For rank 8, reg 0.001, the RMSE is 0.983675007946
For rank 8, reg 0.01, the RMSE is 0.915240361196
For rank 8, reg 0.03, the RMSE is 0.889541114291
For rank 8, reg 0.06, the RMSE is 0.881373993055
For rank 8, reg 0.1, the RMSE is 0.88776764348
For rank 8, reg 0.15, the RMSE is 0.905968770826
For rank 8, reg 0.2, the RMSE is 0.92433251938
For rank 8, reg 1.0, the RMSE is 1.35225410714
For rank 12, reg 0.001, the RMSE is 1.06518487852
For rank 12, reg 0.01, the RMSE is 0.945138960457
For rank 12, reg 0.03, the RMSE is 0.899947183589
For rank 12, reg 0.06, the RMSE is 0.881755523356
For rank 12,

In [12]:
engine.valData = engine.valData.union(newUser_ratings_RDD)
engine.ratings_accuracy(engine.valData)

the RMSE for the model is 0.880572076121


In [52]:
newUser_predictedRatings_RDD = engine.get_predict_ratings(newUser_ratings_RDD)
print newUser_predictedRatings_RDD.take(10)

[((0, 1), 3.3395732691672184), ((0, 379), 1.4788310248877243), ((0, 858), 3.9573811205889897), ((0, 296), 3.7549873617519243), ((0, 50), 3.8390424479761176), ((0, 16), 3.027568750552986), ((0, 260), 3.650261669516125), ((0, 25), 3.2876578137207826), ((0, 335), 1.9651433896832557), ((0, 32), 3.12707648540166)]


In [53]:
print newUser_ratings_RDD.map(lambda x: ((int(x[0]), int(x[1])), float(x[2]))).join(newUser_predictedRatings_RDD).take(10)

[((0, 32), (4.0, 3.12707648540166)), ((0, 858), (5.0, 3.9573811205889897)), ((0, 50), (4.0, 3.8390424479761176)), ((0, 16), (3.0, 3.027568750552986)), ((0, 260), (4.0, 3.650261669516125)), ((0, 296), (3.0, 3.7549873617519243)), ((0, 1), (3.0, 3.3395732691672184)), ((0, 25), (4.0, 3.2876578137207826)), ((0, 379), (1.0, 1.4788310248877243)), ((0, 335), (1.0, 1.9651433896832557))]


## **Other tests**

In [103]:
engine.movie_ID_counts[2]

701

In [106]:
engine.movie_ID_avgRating_RDD.take(1)[0][1]/engine.movie_ID_counts[2]

3.20114122681883

In [34]:
r1 = (1, 1, 1.0)
r2 = (1, 2, 2.0)
r3 = (2, 1, 2.0)
ratings = sc.parallelize([r1, r2, r3])
model = ALS.trainImplicit(ratings, 1, seed=10)
model.predict(2, 2)

0.44734950773855076

In [35]:
model = ALS.train(ratings, 1, seed=10)
model.predict(2, 2)

3.7354516875093964

In [36]:
testset = sc.parallelize([(1, 2), (1, 1)])
model = ALS.train(ratings, 2, seed=0)
model.predictAll(testset).collect()

[Rating(user=1, product=1, rating=1.0000205743675004),
 Rating(user=1, product=2, rating=1.9894577401518987)]

In [37]:
model = ALS.train(ratings, 4, seed=10)
model.userFeatures().collect()

[(1,
  array('d', [-0.1722072809934616, -0.2546672224998474, 0.4450570344924927, 0.8685435652732849])),
 (2,
  array('d', [0.03678629547357559, 0.13991205394268036, -0.4741528630256653, 0.8534172177314758]))]

In [42]:
model.productFeatures().collect()

[(1,
  array('d', [0.0753466933965683, 0.2865716814994812, -0.9711728096008301, 1.7479923963546753])),
 (2,
  array('d', [-0.3266752362251282, -0.482692152261734, 0.8424565196037292, 1.6525822877883911]))]

In [41]:
from pyspark.mllib.recommendation import MatrixFactorizationModel
model.recommendUsersForProducts(1).collect()

AttributeError: 'MatrixFactorizationModel' object has no attribute 'recommendUsersForProducts'

In [43]:
model.recommendProductsForUsers(1).collect()

AttributeError: 'MatrixFactorizationModel' object has no attribute 'recommendProductsForUsers'

In [45]:
print SparkContext.version

<property object at 0xb208ad74>
