In [1]:
#Get the RMSE on the MovieLens 1M dataset for collaborative filtering and content-based predictions

In [1]:
import numpy as np
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from math import sqrt
from operator import add

In [2]:
# Register hermes.zip with the SparkContext so the code it contains can be used by Spark tasks.
# The src.* imports below presumably resolve from that archive -- TODO confirm.
sc.addPyFile('hermes.zip')
from src.algorithms import cf
from src.algorithms import performance_metrics
from src.data_prep import movieLens_vectorize as mv

In [3]:
from src.algorithms import content_based

##Bring in data and create a user vector

In [4]:
# Load the MovieLens 1M movie metadata from gzipped JSON.
movies = sqlCtx.read.json('movielens/1m/movielens_1m_movies.json.gz')

In [5]:
# Load the MovieLens 1M ratings from gzipped JSON.
ratings = sqlCtx.read.json('movielens/1m/movielens_1m_ratings.json.gz')

In [6]:
#set up the vectorizer from the ratings and movie data
# NOTE(review): 'ratings' and 'genre' presumably select which kind of user vector and
# content vector get built -- confirm against movieLens_vectorize.
mv_vect = mv.movieLens_vectorize(ratings, movies, 'ratings', 'genre')

In [7]:
#get the user vector
# Judging by the sample printed in the next cell, each record is a 3-tuple,
# presumably (user, movie, rating) -- confirm against get_user_vector.
all_user_ratings = mv_vect.get_user_vector()

In [8]:
# Peek at the first ten records of the user vector.
all_user_ratings.take(10)

[(1, 1193, 5.0),
 (1, 661, 3.0),
 (1, 914, 3.0),
 (1, 3408, 4.0),
 (1, 2355, 5.0),
 (1, 1197, 3.0),
 (1, 1287, 5.0),
 (1, 2804, 5.0),
 (1, 594, 4.0),
 (1, 919, 4.0)]

In [9]:
# Split the ratings 90/10 into training and test sets, using a fixed random seed (11).
train_ratings, test_ratings = all_user_ratings.randomSplit([90, 10], seed=11)

In [11]:
# Report how many ratings ended up in the training and test sets.
print("%d %d" % (train_ratings.count(), test_ratings.count()))

 900025 100184


In [10]:
#get the predicted ratings from collaborative filtering, using only the training split
# NOTE(review): the sample output further down shows predictions above 5.0, i.e. above
# the max_prediction=5 passed to the content-based model -- confirm whether leaving
# the CF predictions unbounded is intended.
predicted = cf.calc_cf_mllib(train_ratings)

In [12]:
# Score the collaborative-filtering predictions against the held-out test ratings.
rmse = performance_metrics.calculate_rmse_using_rdd(test_ratings, predicted)
print(rmse)

0.884895332015


##What do all the RDDs look like, you may ask?

In [13]:
# Show the RDD class of the user vector, followed by a five-record sample.
print(type(all_user_ratings))
print(all_user_ratings.take(5))

<class 'pyspark.rdd.PipelinedRDD'>
[(1, 1193, 5.0), (1, 661, 3.0), (1, 914, 3.0), (1, 3408, 4.0), (1, 2355, 5.0)]


In [14]:
# The CF predictions come back as labelled Rating records, whereas the user vector
# holds plain tuples. The labels are not required (plain tuples, or NamedTuples,
# would do): per the original note, the prediction functions work on either form.
print(type(predicted))
predicted.take(5)

<class 'pyspark.rdd.RDD'>


[Rating(user=4904, product=3456, rating=4.074128624320234),
 Rating(user=3456, product=3456, rating=2.560508811974863),
 Rating(user=3272, product=3456, rating=3.0780781662221983),
 Rating(user=752, product=3456, rating=6.015084508693184),
 Rating(user=4352, product=3456, rating=6.613422183304756)]

##So this works for CF; what does the content-based approach look like?

In [13]:
# Build the per-movie content vectors and redistribute them across 50 partitions.
# Judging by the sample output below, each record is (movie id, 0/1 array);
# the array entries are presumably genre flags -- confirm against get_content_vector.
movie_content_vector = mv_vect.get_content_vector().repartition(50)

In [14]:
# Peek at the first five content vectors.
movie_content_vector.take(5)

[(1, array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
 (2, array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
 (3, array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])),
 (4, array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
 (5, array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))]

In [15]:
#get the predicted ratings from the content-based model, using only the training split
# NOTE(review): max_prediction=5 presumably caps predictions at the top of the rating scale -- confirm.
predicted_2 = content_based.predict(train_ratings, movie_content_vector, max_prediction=5)

In [16]:
# Score the content-based predictions against the same held-out test ratings.
# Note: this re-binds `rmse`, replacing the collaborative-filtering score computed earlier.
rmse = performance_metrics.calculate_rmse_using_rdd(test_ratings, predicted_2)
print(rmse)

1.42863242792


In [ ]:
#So the CF algorithm achieved a lower RMSE (i.e. better accuracy) than the content-based algorithm on this split

In [17]:
# Inspect the content-based predictions: the RDD class, then a five-record sample.
print(type(predicted_2))
predicted_2.take(5)

<class 'pyspark.rdd.PipelinedRDD'>


[(5661, 1, 3.5000000000000004),
 (5151, 1, 3.3509189925119127),
 (4641, 1, 3.4479237576582706),
 (4131, 1, 3.296014729464869),
 (3621, 1, 3.1900915871237006)]