In [1]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from time import *
from math import sqrt


In [2]:
# In this project we built a distributed recommender system using Apache Spark on the Databricks cloud.
# We also implemented a similar recommender in GraphLab, which was run on my desktop. As part of the project we compare these two recommender systems.
# PLEASE NOTE: while creating the notebook on the Databricks integrated platform I was not able to use the markdown feature, hence the formatting of this notebook is a bit off.

In [3]:
# Databricks Setup
# We obtained a community license for Databricks. Using this account I created a cluster with Spark version 2.3.
# Once the cluster was up and running, I used the Databricks integrated platform to create a Python notebook and implemented the recommender system using Apache Spark MLlib algorithms.
# To benchmark the two recommenders we use the time module, which gives the total execution time in seconds.
# This script contains the Python code that was run on the Apache Spark platform.
# The script that was run on the desktop can be found in this location.[]


In [4]:
# Start the wall-clock timer used for the end-to-end benchmark, then open the ratings file as a text RDD.
# u.data was uploaded to the Databricks file store through the Databricks upload feature.
# NOTE(review): presumably this is the 100,000-rating data set — confirm against the count in the next cell.
start = time()
ratings_data = sc.textFile(name="/FileStore/tables/u.data")

In [5]:
# Sanity check: display the number of lines read from the ratings file.
ratings_data.count()

In [6]:
# Parse each tab-separated line of the raw text RDD into an MLlib Rating(user, product, rating).
# Only the first three fields of each line are used; any further fields are ignored.
# NOTE(review): assumes the column order is user id, item id, rating — confirm against the u.data layout.
# Fix: the raw RDD loaded earlier is named `ratings_data`; the original referenced an undefined
# name `data`, which raises NameError on a fresh kernel.
ratings = ratings_data.map(lambda l: l.split('\t'))\
    .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
# Display the number of parsed ratings as a sanity check.
ratings.count()

In [7]:
# Randomly partition the ratings into two splits: the first (weight 0.3) is held out for testing,
# the second (weight 0.7) is used for training. The fixed seed keeps the split repeatable.
ratings_test, ratings_train = ratings.randomSplit([0.3, 0.7], 1)
# Display the size of the training split.
ratings_train.count()

In [8]:
# Fit the recommendation model on the training split with Spark MLlib's Alternating Least Squares (ALS).
# `rank` and `numIterations` are passed to ALS.train as its rank and iteration count.
# NOTE(review): no seed is passed to ALS.train, so results may differ between runs — confirm this is intended.
rank = 10
numIterations = 10
model = ALS.train(ratings_train, rank=rank, iterations=numIterations)

In [9]:
# Evaluate the model on the held-out test split (not the training data).
# 1. Reduce the test ratings to (user, product) pairs and ask the model for predictions.
# 2. Key both the actual and the predicted ratings by (user, product) and join them; any pair
#    that is missing from the predictions is dropped by the join.
# 3. Average the squared differences to get the MSE, then take its square root for the RMSE.
testdata = ratings_test.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings_test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
# Fix: the mean of the squared errors is the MSE. The original stored it under the name RMSE and
# then read an undefined MSE in the print statement, which raised NameError.
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
RMSE = sqrt(MSE)
print("Root Mean Squared Error = " + str(RMSE))

In [10]:
# Stop the wall-clock timer started before the data load and print the elapsed time in seconds.
# The recorded run took about 23 seconds on the Databricks cloud.
end = time()
print(end - start)

In [11]:
# Comparison between the two recommender systems.
# Recommender 1: Built as part of this script. This recommender was built on the Databricks cloud using the Apache Spark MLlib library.
# Recommender 2: Built on my local machine using GraphLab algorithms. Both recommenders use collaborative filtering algorithms.
# Recommender 1 is definitely faster than Recommender 2: it took 23 secs, whereas Recommender 2 took 32 secs. Also, in the Databricks community edition I was allowed to create only one cluster; this difference would likely have been amplified if more than one cluster had been available.
# Recommender 1 has an RMSE of 1.12, whereas Recommender 2 has a value of 1.01. Although neither recommender's RMSE is good, it looks like Recommender 2 provided better predictions. The poor RMSE values could be attributed to data sparsity.

# Based on this project, Apache Spark is really fast on larger datasets, especially when there are multiple clusters in the cloud. I am not so sure about the prediction quality of the MLlib recommender algorithm, as GraphLab provided better prediction accuracy (lower RMSE) on the same data.
