In [1]:
import os
import pyspark
conf = pyspark.SparkConf()
conf.set('spark.ui.proxyBase', '/user/' + os.environ['JUPYTERHUB_USER'] + '/proxy/4041') 
conf.set('spark.sql.repl.eagerEval.enabled', False)
conf.set('spark.driver.memory','4g')
sc = pyspark.SparkContext(conf=conf)
spark = pyspark.SQLContext.getOrCreate(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/21 17:19:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/21 17:19:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
spark

<pyspark.sql.context.SQLContext at 0x7fb70872a620>

## Interaction

In [9]:
interaction = spark.read.json("../shared/finals/swarali/goodreads_interactions_dedup.json")

                                                                                

In [10]:
interaction.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- date_updated: string (nullable = true)
 |-- is_read: boolean (nullable = true)
 |-- rating: long (nullable = true)
 |-- read_at: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_text_incomplete: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- user_id: string (nullable = true)



In [106]:
inter = interaction.cache()

[Stage 1126:(240 + 2) / 579][Stage 1128:> (0 + 0) / 2][Stage 1130:> (0 + 0) / 2]

In [66]:
interaction.limit(2).toPandas()

Unnamed: 0,book_id,date_added,date_updated,is_read,rating,read_at,review_id,review_text_incomplete,started_at,user_id
0,34684622,Tue Oct 17 09:40:11 -0700 2017,Tue Oct 17 09:40:12 -0700 2017,False,0,,a53868823f065a0e20fd4ae98b820674,,,8842281e1d1347389f2ab93d60773d4d
1,34536488,Fri Oct 13 07:19:50 -0700 2017,Fri Oct 13 07:19:50 -0700 2017,False,0,,9f08c5f991f87f3b7ae4ce779c2aac10,,,8842281e1d1347389f2ab93d60773d4d


Collaborative Filtering: Recommends books based on similarities between users' preferences and behaviors.


Content-Based Filtering: Recommends books based on the attributes of the books (genre, author, publication year) and the user's past preferences.

## Collaborative Recommendation System - FIRST 

This was the first trial system that we did on a limited 1000 records data. The FINAL Recommendation system is in the next code block

In [9]:
from pyspark.sql import SparkSession

# Create or get the Spark session
spark = SparkSession.builder.appName("GoodreadsRecommender") \
    .config("spark.kryoserializer.buffer.max", "256m") \
    .getOrCreate()

# The rest of your code here

interaction_1000 = interaction.limit(1000)

23/12/19 18:13:38 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator


# Selecting relevant columns
interactions_df = interaction_1000.select("user_id", "book_id", "rating")
from pyspark.ml.feature import StringIndexer

# Indexing string identifiers
stringIndexer = StringIndexer(inputCols=["user_id", "book_id"], outputCols=["user_id_indexed", "book_id_indexed"])
model = stringIndexer.fit(interactions_df)
interactions_df = model.transform(interactions_df)

                                                                                

In [11]:
# Split the dataset
(train, test) = interactions_df.randomSplit([0.8, 0.2])

# ALS model now uses the indexed columns
als = ALS(maxIter=5, regParam=0.01, userCol="user_id_indexed", itemCol="book_id_indexed", ratingCol="rating")
model = als.fit(train)

# Evaluate the model
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

# Generate top 10 book recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show(truncate = False )

                                                                                

Root-mean-square error = nan




+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id_indexed|recommendations                                                                                                                                                                     |
+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0              |[{986, 4.9943986}, {874, 4.9943986}, {864, 4.9943986}, {862, 4.9943986}, {859, 4.9943986}, {856, 4.9943986}, {855, 4.9943986}, {836, 4.9943986}, {829, 4.9943986}, {824, 4.9943986}]|
|1              |[{471, 4.996102}, {348, 4.996102}, {300, 4.996102}, {188, 4.996102}, {177, 4.996102}, {99, 4.996102}, {3, 4.981745}, {1, 4.981745}, {892, 3.996882}, {883, 3.996882}]               |
+----

                                                                                

In [None]:
result_df = interactions_df.join(original_ids_df, ["user_id", "book_id"])

In [13]:
from pyspark.sql.functions import explode, col

# Explode the recommendations
exploded_recs = userRecs.select("user_id_indexed", explode("recommendations").alias("recommendation"))

# Extract book_id_indexed and rating from the recommendation struct
exploded_recs = exploded_recs.select("user_id_indexed", col("recommendation.book_id_indexed"), col("recommendation.rating"))

# Join with the books dataframe (assuming it has 'book_id_indexed' and 'title')
joined_df = exploded_recs.join(books, exploded_recs.book_id_indexed == books.book_id)

# Select the desired columns
final_df = joined_df.select("user_id_indexed", "title", "rating")

# Show the results
final_df.show()

                                                                                

+---------------+--------------------+---------+
|user_id_indexed|               title|   rating|
+---------------+--------------------+---------+
|              0|     The Kite Runner|4.9943986|
|              0|          The Cobweb|4.9943986|
|              1|Harry Potter and ...| 4.981745|
|              1|The Door Into Summer| 4.996102|
|              1|Pompeii: Lost and...| 3.996882|
|              0|The Trust: The Pr...|4.9943986|
|              1|Harry Potter and ...| 4.981745|
|              1|     Of Mice and Men| 3.996882|
|              1|The Player's Hand...| 4.996102|
|              0|       The Alchemist|4.9943986|
|              0|Odalisque (The Ba...|4.9943986|
+---------------+--------------------+---------+



## Collaborative Recommendation System - FINAL 

In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col


In [93]:
interaction_df = interaction

In [108]:
interaction_df_ordered = interaction_df.limit(10000)

[Stage 1126:(287 + 2) / 579][Stage 1128:> (0 + 0) / 2][Stage 1130:> (0 + 0) / 2]

In [66]:
interaction_df = interaction.limit(100000)
interaction_df = interaction_df.filter(interaction_df.rating != 0)
print(interaction_df.count())
user_id_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_index")
user_id_indexer_model = user_id_indexer.fit(interaction_df)  # Save the fitted model
interaction_df = user_id_indexer_model.transform(interaction_df)

interaction_df = interaction_df.withColumn("book_id", col("book_id").cast("integer"))\
                               .withColumn("rating", col("rating").cast("float"))
interaction_df = interaction_df.na.drop(subset=["user_id_index", "book_id", "rating"])


                                                                                

44045


                                                                                

In [69]:
labels = user_id_indexer_model.labels

In [70]:
print(labels)

['a5ba0a7f4f5175ae9d2ce997f790dcb9', 'cd6522e9018f2f77332ec74f928f8c45', '01ec1a320ffded6b2dd47833f2c8e4fb', '4a44f603cc3df339acc48590044a2db0', 'dcaf63d82422e351590aba2b36950a17', '453e6a280947ac1f82a27b91e5fb89bb', '83d6e6f80d7c32c6676b3ab3b01543cd', 'dea5bda3eaee375c4ab59fd399bc37db', 'ced7b8e0a3340e8af27f2663f442c3bb', '0ef32090550901ead25cb0ea21c4d36b', '559d843b319087e12f48282e386e401f', 'a34e60cb88cea96fd767a9db81f6f5c8', '4672eb229c808b792b8ea95f01f19784', 'f67cc8786583247a1fbf3ee190cbbbc2', '8842281e1d1347389f2ab93d60773d4d', 'f18d9dc6a1e14cf6355094283298c13c', 'dc3763cdb9b2cae805882878eebb6a32', 'bafc2d50014200cda7cb2b6acd60cd73', '3cb38531c6aab7bcae3c26455b763fe6', 'f8a89075dc6de14857561522e729f82c', '220ef9c058a2132e6a9827f93a821d87', 'c7b9a63678911865302a204f45d6cc2d', '0c07498e94309381e4f79c9176f57462', '040b31603912dc03f19e0b76d58c3660', '12f32d2823b2c92362573740a2baa553', 'd37b46b2190ed7c518259f29b47a9b36', '478f3accbbb16d817fd90732215b6391', 'dfe9eff9fee4377ac76d17a1af

In [71]:
from pyspark.sql import Row

rows = [Row(user_id_index=i, original_user_id=label) for i, label in enumerate(labels)]
original_user_ids_df = spark.createDataFrame(rows)


In [72]:
# Split the data into training and testing sets
(training, test) = interaction_df.randomSplit([0.8, 0.2])

# ALS model setup
als = ALS(maxIter=5, regParam=0.01, userCol="user_id_index", itemCol="book_id", ratingCol="rating", nonnegative=True)

# Fit the model on training data
model = als.fit(training)

# Predictions on test data
predictions = model.transform(test)


                                                                                

In [49]:
predictions.select('book_id','user_id','user_id_index','rating','prediction').show(20)

                                                                                

+-------+--------------------+-------------+------+----------+
|book_id|             user_id|user_id_index|rating|prediction|
+-------+--------------------+-------------+------+----------+
|      5|72fb0d0087d28c832...|          1.0|   5.0|  4.560216|
|      1|8842281e1d1347389...|          0.0|   5.0| 1.0394906|
|      2|8842281e1d1347389...|          0.0|   5.0| 1.0394906|
|     21|8842281e1d1347389...|          0.0|   5.0|       NaN|
|     30|8842281e1d1347389...|          0.0|   5.0|       NaN|
|    112|8842281e1d1347389...|          0.0|   4.0|       NaN|
|    350|8842281e1d1347389...|          0.0|   5.0|       NaN|
|    361|8842281e1d1347389...|          0.0|   5.0|       NaN|
|    685|8842281e1d1347389...|          0.0|   4.0|       NaN|
|   1202|8842281e1d1347389...|          0.0|   4.0|       NaN|
|   1852|8842281e1d1347389...|          0.0|   5.0|       NaN|
|   1893|8842281e1d1347389...|          0.0|   5.0|       NaN|
|   1898|8842281e1d1347389...|          0.0|   4.0|    

In [74]:
predictions = predictions.na.drop(subset='prediction')


[Stage 834:(71 + 2) / 579][Stage 849:> (0 + 0) / 10][Stage 850:> (0 + 0) / 10]

In [75]:
# Evaluate the model using RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")


                                                                                

Root-mean-square error = 1.616451337731502


In [76]:
# Number of recommendations to generate for each user
num_recommendations = 5  # Adjust as needed

# Generate recommendations for each user
user_recommendations = model.recommendForAllUsers(num_recommendations)

In [54]:
user_recommendations.show(10,truncate = False)



+-------------+---------------------------------------------------------------------------------------------------------+
|user_id_index|recommendations                                                                                          |
+-------------+---------------------------------------------------------------------------------------------------------+
|0            |[{460548, 5.0003824}, {304889, 5.0003824}, {128029, 5.0003824}, {119322, 5.0003824}, {99107, 5.0003824}] |
|1            |[{18767539, 4.995974}, {17939501, 4.995974}, {2718668, 4.995974}, {2088385, 4.995974}, {15881, 4.995974}]|
+-------------+---------------------------------------------------------------------------------------------------------+



                                                                                

In [77]:
from pyspark.sql.functions import explode, col

# Explode the recommendations
recommendations = user_recommendations.select("user_id_index", explode("recommendations").alias("recommendation"))
recommendations = recommendations.select("user_id_index", col("recommendation.book_id").alias("book_id"), col("recommendation.rating").alias("rating"))

In [78]:
# Join with the original user IDs
recommendations_with_user = recommendations.join(original_user_ids_df, "user_id_index")
#recommendations_with_user.show(10)

In [79]:
# Assuming 'books_df' contains columns 'book_id', 'title' (book name), and 'url'
final_recommendations = recommendations_with_user.join(books, "book_id")

In [80]:
final_output = final_recommendations.select(
    col("original_user_id"),
    col("book_id"),
    col("title").alias("book_name"),
    col("url"),
    col("image_url"),
    col("rating")
)

# Display the final output
final_output.show(1,truncate=False)


                                                                                

+--------------------------------+-------+-------------------------------------------------------------------+----------------------------------------------------+--------------------------------------------------------+---------+
|original_user_id                |book_id|book_name                                                          |url                                                 |image_url                                               |rating   |
+--------------------------------+-------+-------------------------------------------------------------------+----------------------------------------------------+--------------------------------------------------------+---------+
|faa322d2624b0e7eb3064e39dac4af9c|12067  |Good Omens: The Nice and Accurate Prophecies of Agnes Nutter, Witch|https://www.goodreads.com/book/show/12067.Good_Omens|https://images.gr-assets.com/books/1392528568m/12067.jpg|6.1418085|
+--------------------------------+-------+----------------------------------

In [47]:
final_output.orderBy('Rating').show(truncate=False)



+--------------------------------+--------+--------------------------------------------------+----------------------------------------------------------------+---------+
|original_user_id                |book_id |book_name                                         |url                                                             |rating   |
+--------------------------------+--------+--------------------------------------------------+----------------------------------------------------------------+---------+
|72fb0d0087d28c832f15776b0d936598|160495  |Never Buried (Leigh Koslow Mystery #1)            |https://www.goodreads.com/book/show/160495.Never_Buried         |5.0370326|
|72fb0d0087d28c832f15776b0d936598|18512   |The Return of the King (The Lord of the Rings, #3)|https://www.goodreads.com/book/show/18512.The_Return_of_the_King|5.0370326|
|72fb0d0087d28c832f15776b0d936598|2088385 |The Book of Negroes                               |https://www.goodreads.com/book/show/2088385.The_Book_of_

                                                                                

In [51]:
final_output.select('original_user_id').distinct().show(20,truncate = False)

[Stage 664:>                                                        (0 + 2) / 2]

+--------------------------------+
|original_user_id                |
+--------------------------------+
|72fb0d0087d28c832f15776b0d936598|
|8842281e1d1347389f2ab93d60773d4d|
+--------------------------------+



                                                                                

In [82]:
output_2 = final_output.toPandas()

                                                                                

In [83]:
output_2.to_csv('output_final.csv')  