# A Collaborative Filtering Recommender System Using the Alternating Least Squares (ALS) Algorithm

In [0]:
# Importing the necessary libraries

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col,explode

In [0]:
# Get the active Spark session (or create one) under the app name "Project_1"
spark = (
    SparkSession.builder
    .appName("Project_1")
    .getOrCreate()
)

In [0]:
# Input file locations: both CSV files sit in the same upload directory

DATA_DIR = "/FileStore/tables"
path1 = DATA_DIR + "/movies.csv"   # movie metadata (movieId, title, genres)
path2 = DATA_DIR + "/ratings.csv"  # user ratings (userId, movieId, rating, timestamp)

In [0]:
# Load both CSV files with identical reader settings:
# the first line is the header, and column types are inferred from the data

csv_options = {"header": True, "inferSchema": True}
movies = spark.read.csv(path1, **csv_options)
ratings = spark.read.csv(path2, **csv_options)

In [0]:
# Show the inferred schema of each input DataFrame (movies first, then ratings)

for frame in (movies, ratings):
    frame.printSchema()

In [0]:
# Render a preview of each input table (movies first, then ratings)
for frame in (movies, ratings):
    display(frame)

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


userId,movieId,rating,timestamp
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
1,70,3.0,964982400
1,101,5.0,964980868
1,110,4.0,964982176
1,151,5.0,964984041
1,157,5.0,964984100


In [0]:
# Attach movie metadata to every rating.
# Right join: every row of `ratings` is kept; title/genres are null
# for a rating whose movieId has no match in `movies`.
df = movies.join(ratings, on='movieId', how='right')

In [0]:
# Preview the joined table (movie metadata + one row per rating)
display(df)

movieId,title,genres,userId,rating,timestamp
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,964981247
6,Heat (1995),Action|Crime|Thriller,1,4.0,964982224
47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,5.0,964983815
50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,5.0,964982931
70,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller,1,3.0,964982400
101,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance,1,5.0,964980868
110,Braveheart (1995),Action|Drama|War,1,4.0,964982176
151,Rob Roy (1995),Action|Drama|Romance|War,1,5.0,964984041
157,Canadian Bacon (1995),Comedy|War,1,5.0,964984100


In [0]:
# Splitting the data into train and test set (80%:20%)
#
# randomSplit assigns rows at random, so the 80/20 proportions are approximate.
# A fixed seed keeps the split repeatable between runs; without it every
# rerun trains and evaluates on different rows and the RMSE is not comparable.
# NOTE(review): repeatability also assumes the input rows arrive in a stable
# order/partitioning - confirm, or cache `df` before splitting.

SPLIT_SEED = 42
train, test = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [0]:
# Report the row count of the full dataset and of each split
# (each count() call runs its own Spark job)

for label, frame in (
    ("Dataframe size : ", df),
    ("Train size : ", train),
    ("Test size : ", test),
):
    print(label + str(frame.count()))

In [0]:
# Building the recommendation model using the ALS algorithm (it is fitted later on the training data)
#
#   implicitPrefs=False       -> ratings are treated as explicit feedback
#   nonnegative=True          -> latent factors are constrained to be non-negative
#   coldStartStrategy="drop"  -> rows whose prediction would be NaN (user/movie unseen
#                                during fitting) are dropped, so RMSE is never NaN
#   seed                      -> fixes the random factor initialisation so that
#                                reruns produce the same model

ALS_SEED = 42
als = ALS(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    implicitPrefs=False,
    nonnegative=True,
    coldStartStrategy="drop",
    seed=ALS_SEED,
)

In [0]:
# Hyper-parameter search space: 4 ranks x 4 regularisation strengths = 16 combinations

rank_candidates = [10, 50, 100, 150]
reg_param_candidates = [.01, .05, .1, .15]

param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, rank_candidates)
    .addGrid(als.regParam, reg_param_candidates)
    .build()
)

In [0]:
# Root Mean Square Error evaluator: scores the "prediction" column
# against the true value in the "rating" column

evaluator_settings = {
    "metricName": "rmse",
    "labelCol": "rating",
    "predictionCol": "prediction",
}
evaluator = RegressionEvaluator(**evaluator_settings)

In [0]:
# Cross-validation for model performance measurement
#
# 5-fold cross-validation over every parameter combination in `param_grid`,
# scored with `evaluator`. The fixed seed makes the assignment of rows to
# folds repeatable, so the selected best model does not change between reruns
# because of fold shuffling alone.

CV_SEED = 42
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=CV_SEED,
)

In [0]:
# Training the model, then scoring the selected model on the held-out test set

# Runs the cross-validated search on the training split
model = cv.fit(train)
# The model refitted with the best-scoring parameter combination
best_model = model.bestModel
# Adds a "prediction" column to the test rows
test_predictions = best_model.transform(test)
# RMSE between "prediction" and "rating" on the test predictions
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

In [0]:
# The root mean squared error (RMSE) of the best model on the test set
# (lower is better; the value is in the same units as the ratings)

print(RMSE)

In [0]:
# Ask the best model for the highest-scoring movies for every user

NUM_RECS_PER_USER = 3
top_recommendations = best_model.recommendForAllUsers(NUM_RECS_PER_USER)

In [0]:
# One row per user; "recommendations" holds that user's list of (movieId, predicted rating) pairs
display(top_recommendations)

userId,recommendations
1,"List(List(3379, 5.6086335), List(33649, 5.572328), List(132333, 5.542776))"
2,"List(List(33649, 4.6531262), List(3379, 4.6301174), List(74282, 4.5470786))"
3,"List(List(6835, 4.85197), List(5746, 4.85197), List(70946, 4.8093114))"
4,"List(List(4765, 4.7654986), List(25825, 4.7471814), List(1046, 4.7145658))"
5,"List(List(3379, 4.4748516), List(132333, 4.453083), List(5490, 4.453083))"
6,"List(List(33649, 4.794703), List(74282, 4.7539625), List(67618, 4.714146))"
7,"List(List(6818, 4.7023034), List(3379, 4.698065), List(92494, 4.5171165))"
8,"List(List(3379, 4.770545), List(33649, 4.5892406), List(6818, 4.561652))"
9,"List(List(3379, 4.594421), List(6818, 4.5482078), List(33649, 4.5370383))"
10,"List(List(71579, 4.4729085), List(8869, 4.354601), List(113275, 4.3501625))"


In [0]:
# Unnest the recommendation list: one output row per (user, recommended movie),
# with the (movieId, rating) pair placed in the new "movieid_rating" column

exploded_recommendation = explode(col("recommendations"))
top_rec = top_recommendations.withColumn("movieid_rating", exploded_recommendation)
display(top_rec)

userId,recommendations,movieid_rating
1,"List(List(3379, 5.6086335), List(33649, 5.572328), List(5490, 5.542776))","List(3379, 5.6086335)"
1,"List(List(3379, 5.6086335), List(33649, 5.572328), List(5490, 5.542776))","List(33649, 5.572328)"
1,"List(List(3379, 5.6086335), List(33649, 5.572328), List(5490, 5.542776))","List(5490, 5.542776)"
2,"List(List(33649, 4.6531262), List(3379, 4.6301174), List(74282, 4.5470786))","List(33649, 4.6531262)"
2,"List(List(33649, 4.6531262), List(3379, 4.6301174), List(74282, 4.5470786))","List(3379, 4.6301174)"
2,"List(List(33649, 4.6531262), List(3379, 4.6301174), List(74282, 4.5470786))","List(74282, 4.5470786)"
3,"List(List(5746, 4.85197), List(6835, 4.85197), List(70946, 4.8093114))","List(5746, 4.85197)"
3,"List(List(5746, 4.85197), List(6835, 4.85197), List(70946, 4.8093114))","List(6835, 4.85197)"
3,"List(List(5746, 4.85197), List(6835, 4.85197), List(70946, 4.8093114))","List(70946, 4.8093114)"
4,"List(List(4765, 4.7654986), List(25825, 4.7471814), List(1046, 4.7145658))","List(4765, 4.7654986)"


In [0]:
# Each user movie recommendation
#
# Flatten the "movieid_rating" struct into plain columns so the result has
# one row per (userId, movieId, predicted rating).
# display() only renders its argument and returns None, so the DataFrame is
# bound to `user_rec` first and displayed in a separate call; this keeps
# `user_rec` usable by later cells.

user_rec = top_rec.select(
    "userId",
    col("movieid_rating.movieId"),
    col("movieid_rating.rating"),
)
display(user_rec)

userId,movieId,rating
1,3379,5.6086335
1,33649,5.572328
1,8542,5.542776
2,33649,4.6531262
2,3379,4.6301174
2,74282,4.5470786
3,6835,4.85197
3,5746,4.85197
3,70946,4.8093114
4,4765,4.7654986
