In this notebook, we have successfully developed a robust recommendation system ,using **Alternating Least Squares (ALS)** method, tailored specifically for Amazon users. By analyzing their activity and preferences, our system effectively suggests five highly relevant products for each individual customer.

# Imports and installations

In [32]:
!pip install pyspark



In [55]:
from google.colab import drive
drive.mount('/content/drive')
METADATA_PATH = '/content/drive/Shareddrives/CIE/BigData_Final_Project/Data/meta/meta_Cell_Phones_and_Accessories.json'
REVIEWS_PATH = '/content/drive/Shareddrives/CIE/BigData_Final_Project/Data/reviews/Cell_Phones_and_Accessories.json'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [35]:
spark = SparkSession.builder.appName('Suggestion').getOrCreate() # creating spark session for the application

# Data loading and preprocessing

In [56]:
# Read the input data as DataFrame
meta = spark.read.json(METADATA_PATH).drop("fit", "imageURL", "imageURLHighRes", )
reviews = spark.read.json(REVIEWS_PATH).drop("image")
ids = ['reviewerID', 'asin']
# Get a dataframe consisting only of reviews that are duplicated
print('duplicate reviews dataframe')
reviews \
    .groupby(ids) \
    .count() \
    .where('count > 1') \
    .sort('count', ascending=False) \
    .show()
# Drop the duplicate reviews
reviews = reviews.dropDuplicates(subset=ids)
print("metadata dataframe")
meta.show()
print("reviews dataframe")
reviews.show()

duplicate reviews dataframe
+--------------+----------+-----+
|    reviewerID|      asin|count|
+--------------+----------+-----+
|A2RDOT92PMAX45|B017XC69FM|   49|
|A2KRYQY306EGN4|B00SVY5872|   20|
| A8Z3WD5ZWBOO7|B0085R2F3K|   18|
|A1CCZ468PP35LT|B009AN211O|   15|
|A150I4QWXT8JPO|B004MF3IA8|   11|
|A22ICJNN4QESPR|B000VUGZIK|   10|
|A33775AIB1A664|B008U0XSY2|   10|
|A1YNGJAT1TMFZO|B009PYUOTO|   10|
| AI380SRCQDHIJ|B001HDDMC2|    9|
|A2BXP3G6NI6NXQ|B00SVY5872|    8|
|A33TED651NAXWU|B0006TIA8Y|    8|
|A1E2L0XMC1BYQO|B00O3X6BIA|    8|
|A33BZ5LG42U685|B01FGIIOYE|    8|
|A365BJAGU86XY8|B00IZ1XJ3Q|    7|
|A1LA4K5JF78BER|B00Z7S3804|    7|
|A20IMBRGCWEV9R|B005F9XNN2|    7|
|A33BZ5LG42U685|B01GF5D0TQ|    7|
| A12LH2100CKQO|B004MF3IA8|    7|
|A31P3YO3OT6XKT|B00IZ1XJ3Q|    6|
|A13LXFPP36COEQ|B004MF3IA8|    6|
+--------------+----------+-----+
only showing top 20 rows

metadata dataframe
+--------+---------+----------+--------------------+--------------------+------------------+-------------------

In [47]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, explode


In [57]:
reviews = reviews.select("reviewerID", "asin", "overall")                         # select only the needed columns
reviewers = reviews.select('reviewerID').distinct()                               # create a dataframe of the reviewers
#reviewers = reviewers.withColumn("reviewerIDint",monotonically_increasing_id())   # add column with int ids for each reviewer
reviewers = reviewers.withColumn("reviewerIDint", F.dense_rank().over(Window.orderBy("reviewerID")))
products = meta.select('asin').distinct()                                         # create dataframe with all products
#products = products.withColumn("asin_int",monotonically_increasing_id())          # add column with int ids for each product
products = products.withColumn("asin_int", F.dense_rank().over(Window.orderBy("asin")))
print('reviewers dataframe')
reviewers.show()
print('products dataframe')
products.show()


# reviews = reviews.withColumn("reviewerID", F.dense_rank().over(Window.orderBy("reviewerID")))
# reviews = reviews.withColumn("asin", F.dense_rank().over(Window.orderBy("asin")))
# reviews = reviews.\
#      withColumn('asin', col('asin').cast('integer')).\
#      withColumn('overall', col('overall').cast('float'))

reviewers dataframe
+--------------------+-------------+
|          reviewerID|reviewerIDint|
+--------------------+-------------+
|A0000148KSJ81F2E3O7V|            1|
|A000033826RVJH496D4A|            2|
|A0000378ZNUHTQUDNNHR|            3|
|A00007664HEMMTK5IAWX|            4|
|A00015228CUPGPF957DS|            5|
|A0001528BGUBOEVR6T5U|            6|
|A0001598OL7FAN6XNMK9|            7|
|A0001624SO9RHBP4B4SE|            8|
|A0001624UKLQG4OFIM8X|            9|
|A0001810QB3WX1VF3K6H|           10|
|A0002246TPEHO0ED1SKA|           11|
|A0002686ZNGLCW7JVER2|           12|
|A00026909S42JM8RNF9T|           13|
|A0002708WFPIPQT73GK8|           14|
|A0002730WOKVUCGRLYJU|           15|
|A000310449IR1VF4LABL|           16|
|A0003196YPXUL31F0EFS|           17|
|A0003214FKMKJE0PCW3D|           18|
|A00032921HLX2KJJVXRS|           19|
|A0003396AMQHRAN91T1G|           20|
+--------------------+-------------+
only showing top 20 rows

products dataframe
+----------+--------+
|      asin|asin_int|
+--

In [58]:
meta = meta.join(products, on='asin').drop('asin')                                      # replace the string asin with int asin in metadata
reviews = reviews.join(reviewers, reviews.reviewerID ==  reviewers.reviewerID,"inner")  # replace the string reviewerID with int one in reviews
reviews = reviews.join(products, reviews.asin ==  products.asin,"inner")                # replace the string asin with int asin in reviews
reviews = reviews.select("reviewerIDint", "asin_int", "overall")                        # select the needed columns only
reviews.show()                                                                          # the final reviews data

+-------------+--------+-------+
|reviewerIDint|asin_int|overall|
+-------------+--------+-------+
|      5749553|       1|    5.0|
|      1212430|       2|    5.0|
|      2794402|       3|    1.0|
|      5841992|       5|    5.0|
|      3769313|       9|    3.0|
|      3159937|       9|    5.0|
|       976430|      13|    5.0|
|      3228516|      13|    3.0|
|       738641|      13|    5.0|
|      5088176|      14|    4.0|
|      5664702|      14|    2.0|
|       198649|      14|    4.0|
|      1475692|      21|    3.0|
|      4895155|      21|    1.0|
|      5022333|      21|    5.0|
|      5310714|      21|    4.0|
|       144857|      21|    3.0|
|      1722654|      21|    5.0|
|      1751116|      21|    5.0|
|      3350977|      21|    5.0|
+-------------+--------+-------+
only showing top 20 rows



In [61]:
# count the number of reviews of each customer
userId_ratings = reviews.groupBy("reviewerIDint").count().orderBy('count', ascending=False)
print('number of reviews per customer')
userId_ratings.show()
# count the number of reviews on each product
product_ratings = reviews.groupBy("asin_int").count().orderBy('count', ascending=False)
print('number of reviews per product')
product_ratings.show()

number of reviews per customer
+-------------+-----+
|reviewerIDint|count|
+-------------+-----+
|      4078474|  445|
|       713753|  321|
|      2637651|  273|
|      4306451|  214|
|      3188055|  205|
|      4531190|  201|
|      3783589|  192|
|      1350975|  173|
|      1614049|  164|
|      2734786|  150|
|      5089781|  149|
|      5937255|  148|
|      3052200|  147|
|      1750438|  141|
|      1508309|  138|
|        64633|  138|
|      3418971|  135|
|      3385526|  133|
|       405501|  132|
|      4257604|  130|
+-------------+-----+
only showing top 20 rows

number of reviews per product
+--------+-----+
|asin_int|count|
+--------+-----+
|  291010|13543|
|   63142|13236|
|  401980|13031|
|  343536|11227|
|  453402|10728|
|  121580|10132|
|  293405|10100|
|  202643| 9282|
|  385484| 9038|
|  497913| 8890|
|  326946| 8829|
|  142966| 8760|
|  103314| 8525|
|  503107| 8307|
|  299140| 8263|
|  162471| 7992|
|  247542| 7827|
|   94377| 7417|
|   65648| 7091|
|  530657| 

# Model training

In [62]:
# Split the data into train and test with ratios 80:20%
(train, test) = reviews.randomSplit([0.8, 0.2], seed = 2022)

# Create ALS model
als = ALS(maxIter=5,
          regParam=0.01,
          userCol="reviewerIDint", 
          itemCol="asin_int", 
          ratingCol="overall", 
          nonnegative = True, 
          implicitPrefs = False, 
          coldStartStrategy="drop")
model = als.fit(train)                                                                              # training the model on train data
predictions = model.transform(test)                                                                 # predict on the test data
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall", predictionCol="prediction")  # create evaluator instance
rmse = evaluator.evaluate(predictions)                                                              # evaluate the model
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 5.914998678702433


In [63]:
# Show the predictions
predictions.show()

+-------------+--------+-------+----------+
|reviewerIDint|asin_int|overall|prediction|
+-------------+--------+-------+----------+
|       903127|      15|    5.0| 1.0319918|
|      1342734|      15|    5.0|  4.311965|
|      2565362|      15|    5.0| 1.1018164|
|      2663381|      15|    3.0| 11.158386|
|      3290077|      15|    5.0| 0.9008183|
|      3658486|      15|    2.0| 0.7995701|
|      3989808|      15|    4.0| 1.2987003|
|      4109756|      15|    4.0|  4.844903|
|      4250327|      15|    3.0| 1.2805924|
|      4591625|      15|    4.0| 0.5180634|
|      4819754|      15|    5.0| 0.7426771|
|      4964271|      15|    3.0| 1.0034156|
|      5118714|      15|    3.0|  4.332898|
|      5378424|      15|    5.0| 1.6532984|
|      6026523|      15|    3.0| 0.6760611|
|      6093974|      15|    5.0| 2.6550043|
|       938025|      15|    1.0| 2.3123302|
|      1007359|      15|    2.0|  10.51034|
|      1226268|      15|    5.0|0.60709566|
|      1696490|      15|    3.0|

# Using the trained model to recommend prducts for customers

In [None]:
# Recommend 5 products for each customer
nrecommendations = model.recommendForAllUsers(5)
nrecommendations.limit(5).show()



In [None]:
# Re-organize the recommendations for visualization
nrecommendations = nrecommendations.withColumn("rec_exp", explode("recommendations")).select("reviewerIDint","rec_exp.asin_int","rec_exp.rating")
nrecommendations.limit(20).show()

In [None]:
# Show a smaple of the recommended products for a random customer
nrecommendations.join(meta, on='asin_int').filter('reviewerIDint = 6').show()

In [None]:
# showing the reviews of that customer
reviews.join(meta, on='asin_int').filter('reviewerIDint = 6').sort('overall', ascending=False).limit(10).show()