In [1]:
import os
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS


In [2]:
# initialize SparkSession
sc=SparkContext()
spark=SparkSession(sc)

In [3]:
# Create DataFrames
path = os.getcwd()
df_M = spark.read.format("csv").load(path + "/data/movies.csv")
df_R = spark.read.format("csv").load(path + "/data/ratings.csv")

In [4]:
#rename columns
df2= df_R.withColumnRenamed("_c0", "userId").withColumnRenamed("_c1", "movieId").withColumnRenamed("_c2", "rating").withColumnRenamed("_c3","timestamp")
df1= df_M.withColumnRenamed("_c0", "movieId").withColumnRenamed("_c1", "title").withColumnRenamed("_c2", "genre")

# Join DataFrames
df1=df1.join(df2,on="movieId")

# Create Train/Test split
split = df1.randomSplit([0.8,0.2],seed=1)
train = split[0]
test = split[1]

# casting
train=train.withColumn("userId", train["userId"].cast("int"))\
           .withColumn("movieId", train["movieId"].cast("int"))\
           .withColumn("rating", train["rating"].cast("float"))

test=test.withColumn("userId", test["userId"].cast("int"))\
         .withColumn("movieId", test["movieId"].cast("int"))\
         .withColumn("rating", test["rating"].cast("float"))

# remove columns where movieId, userId, or rating is null
train=train.filter(train.movieId.isNotNull())
train=train.filter(train.userId.isNotNull())
train=train.filter(train.rating.isNotNull())
test=test.filter(test.movieId.isNotNull())
test=test.filter(test.userId.isNotNull())
test=test.filter(test.rating.isNotNull())



In [5]:
%%timeit
#time the duration for fitting the model

# Alternating Least Squares matrix factorization.
als=ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy= "drop")

# we fit our model
model=als.fit(train)



3.43 s ± 578 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Now actually fit the model

# Alternating Least Squares matrix factorization.
als=ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy= "drop")

# we fit our model
model=als.fit(train)



In [7]:
%%timeit
#time the duration for predicting ratings

#predict the ratings for the respective pairs of movieId and userId included in the test set
predictions = model.transform(test)




23.9 ms ± 4.19 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# Now actually predict the ratings for the respective pairs of movieId and userId included in the test set
predictions = model.transform(test)

In [9]:
# compute the error of our predictions
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.8835261286521537


In [10]:
# Generate top movie recommendation for each user
userRecs = model.recommendForAllUsers(1)