In [1]:
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# stop any existing Spark session, if Spark is already running, creating a new session might fail ***
try:
    spark.stop()
except Exception:
    pass

# create session with adjusted memory settings based on your cluster
# .config("spark.local.dir", r"E:\Apache Spark\spark-temp"): change the spark local dir, as the c disk memory is not enough, may cause Py4JJavaError exception ***
spark = SparkSession.builder.appName("BookRecommendCSV") \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.local.dir", r"E:\Apache Spark\spark-temp") \
    .getOrCreate() 

In [2]:
import pyspark
print(pyspark.__file__)
print(pyspark.__version__) # ***


E:\Apache Spark\spark-3.5.3-bin-hadoop3\python\pyspark\__init__.py
3.5.3


In [3]:
import sys
print(sys.path) # ***

['E:\\Apache Spark\\spark-3.5.3-bin-hadoop3\\python', 'E:\\Apache Spark\\spark-temp\\spark-a3c396c4-3a4e-411c-ac0e-9d419e391043\\userFiles-f7b3aeca-373d-4f88-ab78-490bcc3ea26d', 'E:\\Apache Spark\\spark-3.5.3-bin-hadoop3\\python\\lib\\py4j-0.10.9.7-src.zip', 'e:\\GithubRepository(E)\\coding-learning\\books-recommender-system-using-spark', 'e:\\Python\\Python311\\python311.zip', 'e:\\Python\\Python311\\DLLs', 'e:\\Python\\Python311\\Lib', 'e:\\Python\\Python311', '', 'C:\\Users\\MECHREVO\\AppData\\Roaming\\Python\\Python311\\site-packages', 'C:\\Users\\MECHREVO\\AppData\\Roaming\\Python\\Python311\\site-packages\\win32', 'C:\\Users\\MECHREVO\\AppData\\Roaming\\Python\\Python311\\site-packages\\win32\\lib', 'C:\\Users\\MECHREVO\\AppData\\Roaming\\Python\\Python311\\site-packages\\Pythonwin', 'e:\\Python\\Python311\\Lib\\site-packages']


In [4]:
import os
print(os.environ.get("SPARK_HOME"))
print(os.environ.get("PYTHONPATH")) # ***

E:\Apache Spark\spark-3.5.3-bin-hadoop3
E:\Apache Spark\spark-3.5.3-bin-hadoop3\python;E:\Apache Spark\spark-3.5.3-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip;;E:\Apache Spark\spark-3.5.3-bin-hadoop3\python;E:\Apache Spark\spark-3.5.3-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip;


In [5]:
spark

In [6]:
# Read the CSV file with specified options
books = spark.read \
    .format("csv") \
    .option("sep", ";") \
    .option("header", "true") \
    .option("encoding", "latin1") \
    .option("mode", "DROPMALFORMED") \
    .load("data/BX-Books.csv")

In [7]:
type(books)

pyspark.sql.dataframe.DataFrame

In [8]:
books.show()

+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      ISBN|          Book-Title|         Book-Author|Year-Of-Publication|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|
+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|0195153448| Classical Mythology|  Mark P. O. Morford|               2002|Oxford University...|http://images.ama...|http://images.ama...|http://images.ama...|
|0002005018|        Clara Callan|Richard Bruce Wright|               2001|HarperFlamingo Ca...|http://images.ama...|http://images.ama...|http://images.ama...|
|0060973129|Decision in Normandy|        Carlo D'Este|               1991|     HarperPerennial|http://images.ama...|http://images.ama...|http://images.ama...|
|0374157065|Flu: The Story of...|    Gina Bari

In [9]:
books.columns

['ISBN',
 'Book-Title',
 'Book-Author',
 'Year-Of-Publication',
 'Publisher',
 'Image-URL-S',
 'Image-URL-M',
 'Image-URL-L']

In [10]:
books.show()

+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      ISBN|          Book-Title|         Book-Author|Year-Of-Publication|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|
+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|0195153448| Classical Mythology|  Mark P. O. Morford|               2002|Oxford University...|http://images.ama...|http://images.ama...|http://images.ama...|
|0002005018|        Clara Callan|Richard Bruce Wright|               2001|HarperFlamingo Ca...|http://images.ama...|http://images.ama...|http://images.ama...|
|0060973129|Decision in Normandy|        Carlo D'Este|               1991|     HarperPerennial|http://images.ama...|http://images.ama...|http://images.ama...|
|0374157065|Flu: The Story of...|    Gina Bari

In [11]:
rename_dict = {
    "Book-Title": "title",
    "Book-Author": "author",
    "Year-Of-Publication": "year",
    "Publisher": "publisher",
    "Image-URL-L": "image_url"
}

for old_name, new_name in rename_dict.items():
    books = books.withColumnRenamed(old_name, new_name)

In [12]:
books = books.select('ISBN', 'title', 'author', 'year', 'publisher', 'image_url')

In [13]:
books.show()

+----------+--------------------+--------------------+----+--------------------+--------------------+
|      ISBN|               title|              author|year|           publisher|           image_url|
+----------+--------------------+--------------------+----+--------------------+--------------------+
|0195153448| Classical Mythology|  Mark P. O. Morford|2002|Oxford University...|http://images.ama...|
|0002005018|        Clara Callan|Richard Bruce Wright|2001|HarperFlamingo Ca...|http://images.ama...|
|0060973129|Decision in Normandy|        Carlo D'Este|1991|     HarperPerennial|http://images.ama...|
|0374157065|Flu: The Story of...|    Gina Bari Kolata|1999|Farrar Straus Giroux|http://images.ama...|
|0393045218|The Mummies of Ur...|     E. J. W. Barber|1999|W. W. Norton &amp...|http://images.ama...|
|0399135782|The Kitchen God's...|             Amy Tan|1991|    Putnam Pub Group|http://images.ama...|
|0425176428|What If?: The Wor...|       Robert Cowley|2000|Berkley Publishin...|ht

In [14]:
num_rows = books.count()
num_cols = len(books.columns)
print(num_rows, num_cols)

271379 6


In [15]:
# read the User csv file
users = spark.read \
    .format("csv") \
    .option("sep", ";") \
    .option("header", "true") \
    .option("encoding", "latin1") \
    .option("mode", "DROPMALFORMED") \
    .load("data/BX-Users.csv")

In [16]:
users.show()

+-------+--------------------+----+
|User-ID|            Location| Age|
+-------+--------------------+----+
|      1|  nyc, new york, usa|NULL|
|      2|stockton, califor...|  18|
|      3|moscow, yukon ter...|NULL|
|      4|porto, v.n.gaia, ...|  17|
|      5|farnborough, hant...|NULL|
|      6|santa monica, cal...|  61|
|      7| washington, dc, usa|NULL|
|      8|timmins, ontario,...|NULL|
|      9|germantown, tenne...|NULL|
|     10|albacete, wiscons...|  26|
|     11|melbourne, victor...|  14|
|     12|fort bragg, calif...|NULL|
|     13|barcelona, barcel...|  26|
|     14|mediapolis, iowa,...|NULL|
|     15|calgary, alberta,...|NULL|
|     16|albuquerque, new ...|NULL|
|     17|chesapeake, virgi...|NULL|
|     18|rio de janeiro, r...|  25|
|     19|           weston, ,|  14|
|     20|langhorne, pennsy...|  19|
+-------+--------------------+----+
only showing top 20 rows



In [17]:
rename_dict = {
    "User-ID": "user_id",
    "Location": "location",
    "Age": "age",
}


for old_name, new_name in rename_dict.items():
    users = users.withColumnRenamed(old_name, new_name)

In [18]:
users.show(2)

+-------+--------------------+----+
|user_id|            location| age|
+-------+--------------------+----+
|      1|  nyc, new york, usa|NULL|
|      2|stockton, califor...|  18|
+-------+--------------------+----+
only showing top 2 rows



In [19]:
# Read the Ratings CSV file
ratings = spark.read \
    .format("csv") \
    .option("sep", ";") \
    .option("header", "true") \
    .option("encoding", "latin1") \
    .option("mode", "DROPMALFORMED") \
    .load("data/BX-Book-Ratings.csv")

In [20]:
ratings.show(5)

+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 276725|034545104X|          0|
| 276726|0155061224|          5|
| 276727|0446520802|          0|
| 276729|052165615X|          3|
| 276729|0521795028|          6|
+-------+----------+-----------+
only showing top 5 rows



In [21]:
num_rows = ratings.count()
num_cols = len(ratings.columns)
print(num_rows, num_cols)  # the same as ratings.shape

1149780 3


In [22]:
rename_dict = {
    "User-ID": "user_id",
    "Book-Rating": "rating"
}


for old_name, new_name in rename_dict.items():
    ratings = ratings.withColumnRenamed(old_name, new_name)

In [23]:
ratings.show(5)

+-------+----------+------+
|user_id|      ISBN|rating|
+-------+----------+------+
| 276725|034545104X|     0|
| 276726|0155061224|     5|
| 276727|0446520802|     0|
| 276729|052165615X|     3|
| 276729|0521795028|     6|
+-------+----------+------+
only showing top 5 rows



In [24]:
from pyspark.sql.functions import col

ratings.groupBy("user_id") \
       .count() \
       .withColumnRenamed("count", "rating_count") \
       .orderBy(col("rating_count"), ascending=False) \
       .show()


+-------+------------+
|user_id|rating_count|
+-------+------------+
|  11676|       13602|
| 198711|        7550|
| 153662|        6109|
|  98391|        5891|
|  35859|        5850|
| 212898|        4785|
| 278418|        4533|
|  76352|        3367|
| 110973|        3100|
| 235105|        3067|
| 230522|        2991|
|  16795|        2948|
| 234623|        2674|
|  36836|        2529|
|  52584|        2512|
| 245963|        2507|
| 204864|        2504|
|  55492|        2459|
| 185233|        2448|
| 171118|        2421|
+-------+------------+
only showing top 20 rows



In [25]:
ratings.groupBy("user_id").count().orderBy("count", ascending=False).show() # get every user's ratings for the books, sorted in descending order

+-------+-----+
|user_id|count|
+-------+-----+
|  11676|13602|
| 198711| 7550|
| 153662| 6109|
|  98391| 5891|
|  35859| 5850|
| 212898| 4785|
| 278418| 4533|
|  76352| 3367|
| 110973| 3100|
| 235105| 3067|
| 230522| 2991|
|  16795| 2948|
| 234623| 2674|
|  36836| 2529|
|  52584| 2512|
| 245963| 2507|
| 204864| 2504|
|  55492| 2459|
| 185233| 2448|
| 171118| 2421|
+-------+-----+
only showing top 20 rows



In [26]:
ratings_counts = ratings.groupBy("user_id").count()

In [27]:
ratings_counts = ratings_counts.filter(ratings_counts["count"] > 200)

In [28]:
ratings_counts.show(5)

+-------+-----+
|user_id|count|
+-------+-----+
|  32773|  745|
|  31391|  208|
|  26593|  272|
|  28360|  247|
|  19085|  240|
+-------+-----+
only showing top 5 rows



In [29]:
user_ids = ratings_counts.select("user_id")

user_id_list = [row['user_id'] for row in user_ids.collect()]

In [30]:
user_id_list

['32773',
 '31391',
 '26593',
 '28360',
 '19085',
 '29855',
 '28591',
 '31826',
 '16966',
 '23872',
 '30511',
 '8681',
 '26544',
 '25981',
 '10447',
 '25601',
 '23902',
 '13552',
 '30509',
 '2977',
 '6251',
 '21576',
 '7915',
 '8936',
 '31846',
 '18067',
 '19664',
 '11601',
 '14422',
 '30711',
 '2766',
 '35836',
 '16916',
 '11993',
 '32721',
 '8245',
 '6575',
 '8890',
 '16106',
 '16795',
 '21252',
 '24194',
 '13850',
 '32195',
 '30533',
 '3757',
 '15408',
 '20201',
 '35050',
 '10819',
 '35433',
 '23768',
 '4385',
 '30735',
 '12982',
 '16634',
 '31315',
 '11676',
 '277639',
 '20115',
 '12538',
 '23288',
 '21659',
 '28204',
 '25409',
 '7346',
 '8067',
 '6543',
 '32440',
 '277427',
 '4017',
 '13273',
 '17950',
 '20859',
 '24921',
 '7158',
 '277478',
 '6242',
 '26516',
 '33974',
 '27647',
 '14521',
 '6323',
 '35859',
 '15957',
 '30972',
 '18401',
 '22625',
 '26583',
 '28634',
 '30276',
 '21014',
 '35857',
 '3363',
 '33145',
 '6563',
 '28523',
 '278418',
 '31556',
 '26535',
 '254',
 '7286',

In [31]:
# filters the ratings DataFrame to include only rows with user IDs that are in y—that is, 
# only ratings by users with more than 200 ratings.
ratings = ratings.filter(ratings.user_id.isin(user_id_list))

In [32]:
ratings.show(5)
print(ratings.count(), len(ratings.columns))

+-------+----------+------+
|user_id|      ISBN|rating|
+-------+----------+------+
| 277427|002542730X|    10|
| 277427|0026217457|     0|
| 277427|003008685X|     8|
| 277427|0030615321|     0|
| 277427|0060002050|     0|
+-------+----------+------+
only showing top 5 rows

526356 3


In [33]:
# join the ratings with books
ratings_with_books = ratings.join(books, on="ISBN", how="inner")

In [34]:
ratings_with_books.show(5)

+----------+-------+------+--------------------+----------------+----+--------------------+--------------------+
|      ISBN|user_id|rating|               title|          author|year|           publisher|           image_url|
+----------+-------+------+--------------------+----------------+----+--------------------+--------------------+
|0001048473|  23902|     0|Nothing Can Be Be...|Barns &amp; Budd|1996|            Atlantic|http://images.ama...|
|0001382381|  26583|     0|Huck Scarry's Ste...|     Huck Scarry|1979|HarperCollins Pub...|http://images.ama...|
|0001848445|  11676|     0| THE COAL HOUSE T/PB|   Andrew Taylor|1986|HarperCollins Pub...|http://images.ama...|
|0001848445| 131402|     0| THE COAL HOUSE T/PB|   Andrew Taylor|1986|HarperCollins Pub...|http://images.ama...|
|0001900277|  11676|     0|Glue (First Facts...|   Harriet Hains|1989|HarperCollins Pub...|http://images.ama...|
+----------+-------+------+--------------------+----------------+----+--------------------+-----

In [35]:
print(ratings_with_books.count(), len(ratings_with_books.columns))

487685 8


In [36]:
from pyspark.sql.functions import count

# Group by 'title' and count the number of ratings for each title.
# In Spark, the result is already a DataFrame with columns (no need for a separate reset_index() as in Pandas).
number_rating = ratings_with_books.groupBy("title").agg(count("rating").alias("rating_num"))

In [37]:
number_rating.show(5)

+--------------------+----------+
|               title|rating_num|
+--------------------+----------+
|Survival Guide to...|         1|
|     In the Clearing|         1|
|Too Many Men : A ...|         4|
|Emily Post's Wedd...|         1|
|Speculations: The...|         1|
+--------------------+----------+
only showing top 5 rows



In [38]:
print(number_rating.count(), len(number_rating.columns))

160280 2


In [39]:
ratings = ratings_with_books.join(number_rating, on="title", how="inner")

In [40]:
ratings.show(5)

+--------------------+----------+-------+------+-----------+----+--------------------+--------------------+----------+
|               title|      ISBN|user_id|rating|     author|year|           publisher|           image_url|rating_num|
+--------------------+----------+-------+------+-----------+----+--------------------+--------------------+----------+
| A Light in the S...|0590567330|  35859|     0|Karen Hesse|1999|Hyperion Books fo...|http://images.ama...|         2|
| A Light in the S...|0590567330|  96448|     9|Karen Hesse|1999|Hyperion Books fo...|http://images.ama...|         2|
| Q-Space (Star Tr...|0671019155|  12538|     0|   Greg Cox|1998|           Star Trek|http://images.ama...|        12|
| Q-Space (Star Tr...|0671019155|  30276|     0|   Greg Cox|1998|           Star Trek|http://images.ama...|        12|
| Q-Space (Star Tr...|0671019155|  35859|     0|   Greg Cox|1998|           Star Trek|http://images.ama...|        12|
+--------------------+----------+-------+------+

In [41]:
print(ratings.count(), len(ratings.columns))

487685 9


In [42]:
ratings = ratings.filter(ratings["rating_num"] >= 50)

In [43]:
ratings.show(5)

+--------------------+----------+-------+------+---------+----+---------+--------------------+----------+
|               title|      ISBN|user_id|rating|   author|year|publisher|           image_url|rating_num|
+--------------------+----------+-------+------+---------+----+---------+--------------------+----------+
|A Thin Dark Line ...|0553571885|   6323|     0|TAMI HOAG|1998|   Bantam|http://images.ama...|        61|
|A Thin Dark Line ...|0553571885|   7915|     0|TAMI HOAG|1998|   Bantam|http://images.ama...|        61|
|A Thin Dark Line ...|0553571885|  11601|     0|TAMI HOAG|1998|   Bantam|http://images.ama...|        61|
|A Thin Dark Line ...|0553571885|  28204|     0|TAMI HOAG|1998|   Bantam|http://images.ama...|        61|
|A Thin Dark Line ...|0553571885|  29259|     0|TAMI HOAG|1998|   Bantam|http://images.ama...|        61|
+--------------------+----------+-------+------+---------+----+---------+--------------------+----------+
only showing top 5 rows



In [44]:
print(ratings.count(), len(ratings.columns))

61853 9


In [45]:
ratings = ratings.dropDuplicates(['user_id', 'title']) # drop the duplicates with the same user_id / title

In [46]:
print(ratings.count(), len(ratings.columns))

59850 9


In [47]:
from pyspark.sql.functions import avg

# Pivot the table with user_id as columns, title as rows, and average rating as values
book_pivot = ratings.groupBy("title") \
                         .pivot("user_id") \
                         .agg(avg("rating")) \
                         .fillna(0)  # Optional: Replace nulls with 0

In [48]:
book_pivot.show(5)

+--------------------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+-----+------+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+-----+------+------+------+------+-----+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+--

In [86]:
from pyspark.sql.functions import monotonically_increasing_id

book_pivot = book_pivot.select("index", *[col for col in book_pivot.columns if col != "index"]) # add an index column to the first col of DataFrame

book_pivot_with_index_collected = book_pivot.collect()

In [87]:
book_pivot.show(5)

+-----+--------------------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+-----+------+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+-----+------+------+------+------+-----+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+------+------+------+------+-----+------+------+------+------+------+------+---

In [49]:
pivot_row = book_pivot.count()
pivot_col = len(book_pivot.columns)
print(pivot_row, pivot_col)

742 889


In [88]:
# why use Spark and its pyspark.sql.DataFrame instead of directly using Pandas? That way, we wouldn't need to convert it to a Pandas DataFrame in the code below ***
import numpy as np
from scipy.sparse import csr_matrix

# convert the Spark DataFrame to a Pandas DataFrame
book_pivot_pd = book_pivot.toPandas()

# remove the non-numeric 'title' column
# (Alternatively, if 'title' is the index, you could reset the index or extract it separately)
numeric_matrix = book_pivot_pd.drop(columns=["index", "title"]).values.astype(np.float64)

# now create the sparse matrix
book_sparse = csr_matrix(numeric_matrix)


In [89]:
type(book_sparse)

scipy.sparse._csr.csr_matrix

In [90]:
# Now import our clustering algoritm which is Nearest Neighbors this is an unsupervised ml algo
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm= 'brute')

In [91]:
model.fit(book_sparse)

In [98]:
suggested_book_index = 237
book_pivot_with_index_collected[suggested_book_index]["title"]

'All Creatures Great and Small'

In [99]:
# reshape(1, -1) ensures that the row is treated as a single data point (as a 2D array with one row and multiple columns). 
# reshape(1, -1) function is used to convert this 1D array into a 2D array with one row and multiple columns (features).
# for example: [1, 2, 3, 4, 5] --> [[1, 2, 3, 4, 5]]

row = book_pivot.collect()[suggested_book_index] # this gets the 237th row from the DataFrame

row_values = np.array(row[2:]) # the first col is index, the second row is title, we don't need it

distance, suggestion = model.kneighbors(row_values.reshape(1,-1), n_neighbors=6 )


In [100]:
distance

array([[ 0.        , 30.98386677, 32.52691193, 32.77193922, 33.07567082,
        33.76388603]])

In [101]:
suggestion

array([[237, 583, 353, 466, 130, 568]])

In [102]:
for i in range(len(suggestion[0])):
    index = int(suggestion[0][i]) # convert NumPy array value to integer
    print(book_pivot_with_index_collected[index]['title'])

All Creatures Great and Small
Exclusive
Jacob Have I Loved
It Was on Fire When I Lay Down on It
No Safe Place
Hearts in Atlantis
