In [4]:
import numpy as np
import pyspark as ps
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import udf, col, when
from pyspark.sql import SQLContext
from pyspark.sql import Row

In [5]:
from IPython.display import Image
from IPython.display import display

In [6]:
#creating spark session
spark = ps.sql.SparkSession.builder.master("local").appName("Book Recommendation System").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)



In [8]:
#load the ratings data
ratings_df = spark.read.csv('dataset/ratings.csv', header = True, inferSchema = True)
ratings_df.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [11]:
ratings_df.show()

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
|      1|   2077|     4|
|      1|   2487|     4|
|      1|   2900|     5|
|      1|   3662|     4|
|      1|   3922|     5|
|      1|   5379|     5|
|      1|   5461|     3|
|      1|   5885|     5|
|      1|   6630|     5|
|      1|   7563|     3|
|      1|   9246|     1|
|      1|  10140|     4|
|      1|  10146|     5|
|      1|  10246|     4|
|      1|  10335|     4|
+-------+-------+------+
only showing top 20 rows



In [14]:
#load the books data
books_df = spark.read.csv('dataset/books.csv', header = True, inferSchema = True)
books_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- best_book_id: integer (nullable = true)
 |-- work_id: integer (nullable = true)
 |-- books_count: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: double (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: double (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- work_ratings_count: string (nullable = true)
 |-- work_text_reviews_count: string (nullable = true)
 |-- ratings_1: double (nullable = true)
 |-- ratings_2: integer (nullable = true)
 |-- ratings_3: integer (nullable = true)
 |-- ratings_4: integer (nullable = true)
 |-- ratings_5: integer (nullable = true)
 |-- image_url: string (nullable = true)
 |-- small_image_url: string (nullable = true)


In [16]:
books_df.show(5)

+---+-------+------------+-------+-----------+---------+----------------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------+--------------------+
| id|book_id|best_book_id|work_id|books_count|     isbn|          isbn13|             authors|original_publication_year|      original_title|               title|language_code|average_rating|ratings_count|work_ratings_count|work_text_reviews_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|           image_url|     small_image_url|
+---+-------+------------+-------+-----------+---------+----------------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------

In [17]:
#splitting data into training and validation sets
training_df, validation_df = ratings_df.randomSplit([0.8, 0.2])

In [18]:
#Defining random values of Hyperparameters
iterations = 10
reg_param = 0.1
rank = 4
errors = []
err = 0

In [21]:
#Setting up the ALS algorithm
#Using CrossValidator to find best hyperparameters
als = ALS(maxIter = iterations, regParam = reg_param, rank = rank, userCol = "user_id", itemCol = "book_id", ratingCol = "rating")
paramGrid = ParamGridBuilder().addGrid(als.regParam, [0.1, 0.01, 0.18]).addGrid(als.rank, range(4, 10)).build()


#model = als.fit(training_df)
#predictions = model.transform(validation_df)
#new_preds = predictions.filter(col('prediction')!=np.nan)
evaluator = RegressionEvaluator(metricName = "rmse", labelCol = "rating", predictionCol = "prediction")
crossVal = CrossValidator(estimator = als, estimatorParamMaps = paramGrid, evaluator = evaluator, numFolds = 5)
cvModel = crossVal.fit(training_df)
#print("Root Mean Squared Error Value = " + str(rmse))

In [25]:
# Get best model from CrossValidator
best_model = cvModel.bestModel

# Extract best parameters
best_rank = best_model.rank
best_reg_param = best_model._java_obj.parent().getRegParam()

print("Best Rank:", best_rank)
print("Best Regularization Parameter:", best_reg_param)


Best Rank: 4
Best Regularization Parameter: 0.1


In [28]:
cvModel_pred = cvModel.transform(validation_df)
cvModel_pred = cvModel_pred.filter(col('prediction')!=np.nan)
rmse = evaluator.evaluate(cvModel_pred)
print("Root Mean Squared Error Value = " + str(rmse))


Root Mean Squared Error Value = 0.8900706543806879


In [29]:
predictions = cvModel.transform(validation_df)
predictions.show(n=10)

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|      1|   5461|     3|  4.304631|
|      1|  10610|     5|  4.289937|
|      1|  11854|     4| 3.9292023|
|      1|  12471|     5| 3.9300416|
|      1|  17663|     5|  4.874737|
|      1|  21713|     5| 4.5148163|
|      1|  22602|     4| 3.6862981|
|      1|  23576|     4| 3.6700437|
|      1|  23612|     4| 3.9629426|
|      1|  24326|     5| 4.0609474|
+-------+-------+------+----------+
only showing top 10 rows



In [32]:
#predicting the ratings that a user would give to a certain "book"
predictions.join(books_df, "book_id").select("user_id","title","prediction").show(5)

+-------+--------------------+----------+
|user_id|               title|prediction|
+-------+--------------------+----------+
|  32592|Harry Potter and ...| 3.2834768|
|  32592|The Hitchhiker's ...|  4.148359|
|  32592|I'm a Stranger He...|  4.260467|
|  32592|The Lost Continen...| 3.6175382|
|  35982|J.R.R. Tolkien 4-...| 3.6911342|
+-------+--------------------+----------+
only showing top 5 rows



In [40]:
#Predicting what book a certain user would like and what rating they would give
user_one = predictions.filter(col("user_id")==35982).join(books_df, "book_id").select("user_id","title","prediction", "image_url")
user_one.show()

+-------+--------------------+----------+--------------------+
|user_id|               title|prediction|           image_url|
+-------+--------------------+----------+--------------------+
|  35982|The Language Inst...| 2.6682665|https://s.gr-asse...|
|  35982|  Great Expectations| 2.5731409|https://images.gr...|
|  35982|Veronika Decides ...| 3.6128395|https://s.gr-asse...|
|  35982|The Portrait of a...| 3.6596348|https://images.gr...|
|  35982|J.R.R. Tolkien 4-...| 3.6911342|https://images.gr...|
+-------+--------------------+----------+--------------------+



In [41]:
#displaying the book images predicted for above user
for book in user_one.take(10):
    print(book.title)
    display(Image(url=book.image_url))

The Language Instinct: How the Mind Creates Language


Great Expectations


Veronika Decides to Die


The Portrait of a Lady


J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings


In [47]:
model = cvModel.bestModel
#Generate top 5 book recommendations for each user
userRecommendations = model.recommendForAllUsers(5)
userRecommendations.select("user_id", "recommendations.book_id").show(10, False)


+-------+------------------------------+
|user_id|book_id                       |
+-------+------------------------------+
|1      |[3753, 2636, 3628, 2840, 5207]|
|2      |[4154, 1338, 2236, 3746, 4868]|
|3      |[4868, 7537, 9076, 4336, 9008]|
|4      |[3628, 5207, 1788, 6920, 6590]|
|5      |[192, 6590, 862, 9516, 562]   |
|6      |[9486, 2636, 3753, 5753, 7352]|
|7      |[1788, 3628, 6590, 5580, 7455]|
|8      |[4154, 1338, 4868, 6084, 2236]|
|9      |[3628, 6590, 4868, 1788, 5580]|
|10     |[2236, 6902, 7947, 9076, 1338]|
+-------+------------------------------+
only showing top 10 rows



In [49]:
#Generate top 5 readers for each book
bookRecommendations = model.recommendForAllItems(5)
bookRecommendations.select("book_id", "recommendations.user_id").show(10, False)

+-------+-----------------------------------+
|book_id|user_id                            |
+-------+-----------------------------------+
|1      |[50307, 43442, 49360, 26219, 50580]|
|2      |[50307, 43442, 23662, 49360, 50580]|
|3      |[24063, 41031, 21791, 31685, 50307]|
|4      |[18663, 43442, 46126, 49360, 26219]|
|5      |[43442, 18663, 47161, 46126, 21925]|
|6      |[43442, 50307, 49360, 23662, 47161]|
|7      |[43442, 47161, 23662, 49360, 42139]|
|8      |[43442, 18663, 46126, 47161, 26219]|
|9      |[24063, 50307, 31685, 3655, 22576] |
|10     |[23662, 43442, 50307, 49360, 50580]|
+-------+-----------------------------------+
only showing top 10 rows



In [52]:
#Generate top 10 book recommendations for subset of users
users = ratings_df.select("user_id").distinct().limit(3);
userSubsetRecommendations = model.recommendForUserSubset(users, 10)
userSubsetRecommendations.select("user_id", "recommendations.book_id").show(10, False)

+-------+------------------------------------------------------------+
|user_id|book_id                                                     |
+-------+------------------------------------------------------------+
|32592  |[3753, 2840, 5207, 3628, 2636, 6920, 267, 4483, 1788, 8187] |
|35982  |[1788, 3628, 6590, 5580, 4868, 9566, 4706, 7455, 6435, 5207]|
|19984  |[8548, 7947, 4778, 1308, 6902, 9531, 422, 9569, 8187, 3753] |
+-------+------------------------------------------------------------+



In [None]:
#Generate top 10 user recommendations for subset of books

In [54]:
books = ratings_df.select("book_id").distinct().limit(3);
bookSubsetRecommendations = model.recommendForItemSubset(books, 10)
bookSubsetRecommendations.select("book_id", "recommendations.user_id").show(10, False)

+-------+---------------------------------------------------------------------+
|book_id|user_id                                                              |
+-------+---------------------------------------------------------------------+
|471    |[23662, 43442, 50307, 49360, 31685, 38866, 50580, 46065, 7571, 37076]|
|463    |[50307, 24063, 31685, 23662, 32300, 31764, 11548, 3655, 22576, 2404] |
|148    |[23662, 43442, 49360, 50307, 50580, 7571, 13108, 38866, 51993, 26219]|
+-------+---------------------------------------------------------------------+

