In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"
import findspark
findspark.init()

In [2]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7


In [3]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pandas as pd

%matplotlib inline

In [4]:
sc = SparkContext(master="local", appName="New Spark Context")
spark = SparkSession(sc)

In [5]:
# df = spark.read.csv("ratings_Musical_Instruments.csv", header=False, inferSchema=True)
df = spark.read.json("Musical_Instruments_5.json")

df = df.select('asin', 'overall', 'reviewerID')
df = df.withColumnRenamed('asin', 'product_id')
df = df.withColumnRenamed('reviewerID', 'user_id')
df = df.withColumnRenamed('overall', 'label')

df.show(5)

+----------+-----+--------------+
|product_id|label|       user_id|
+----------+-----+--------------+
|1384719342|  5.0|A2IBPI20UZIR0U|
|1384719342|  5.0|A14VAT5EAX3D9S|
|1384719342|  5.0|A195EZSQDW3E21|
|1384719342|  5.0|A2C00NNG1ZQQG2|
|1384719342|  5.0| A94QU4C90B1AX|
+----------+-----+--------------+
only showing top 5 rows



In [6]:
df.count()

10261

In [7]:
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- label: double (nullable = true)
 |-- user_id: string (nullable = true)



In [8]:
n_product = df.select('product_id').distinct().count()
n_user = df.select('user_id').distinct().count()
print(n_product, n_user)

900 1429


In [9]:
# How big the matrix
n_product * n_user

1286100

In [10]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.pipeline import Pipeline

indexer_product = StringIndexer(inputCol='product_id', outputCol='product_idx')
indexer_user = StringIndexer(inputCol='user_id', outputCol='user_idx')

pre_pipeline = Pipeline(stages=[indexer_product, indexer_user])
pre_pipeline_fitted = pre_pipeline.fit(df)
final_df = pre_pipeline_fitted.transform(df)

final_df.show(5)

+----------+-----+--------------+-----------+--------+
|product_id|label|       user_id|product_idx|user_idx|
+----------+-----+--------------+-----------+--------+
|1384719342|  5.0|A2IBPI20UZIR0U|      703.0|    66.0|
|1384719342|  5.0|A14VAT5EAX3D9S|      703.0|   266.0|
|1384719342|  5.0|A195EZSQDW3E21|      703.0|   395.0|
|1384719342|  5.0|A2C00NNG1ZQQG2|      703.0|  1048.0|
|1384719342|  5.0| A94QU4C90B1AX|      703.0|  1311.0|
+----------+-----+--------------+-----------+--------+
only showing top 5 rows



In [11]:
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

als = ALS(maxIter=20,           # Number of iterations
          regParam=0.12,        # Regularization parameter beta
          rank=20,              # Number of features
          numItemBlocks=10,     # Number partitioned to parallelize computation
          alpha=0.001,            # Learning rate
          userCol='user_idx',     
          itemCol='product_idx',
          ratingCol='label')
model = als.fit(train_df)

import time
tic = time.time()
predictions = model.transform(test_df)
predictions.show(5)
evaluator = RegressionEvaluator(metricName='rmse')
rmse = evaluator.evaluate(predictions)
print('RMSE: {:.4f}'.format(rmse))

# evaluator = RegressionEvaluator(metricName='r2')
# r2 = evaluator.evaluate(predictions)
# print('R2: {:.4f}'.format(r2))
toc = time.time()
print('Total time: {:.2f} seconds'.format(toc-tic))

+----------+-----+--------------------+-----------+--------+----------+
|product_id|label|             user_id|product_idx|user_idx|prediction|
+----------+-----+--------------------+-----------+--------+----------+
|B000MVYOZY|  5.0|A00625243BI8W1SSZ...|      148.0|   259.0|  4.898069|
|B000MVYOZY|  4.0|       AU3GYRAKBUAEU|      148.0|  1403.0| 1.8784928|
|B0002D0CGM|  5.0|      A1EFMEZJESPF76|      463.0|   616.0|  4.423532|
|B0002E5266|  5.0|      A1OGTL9EBDTK3N|      471.0|   154.0|  3.222876|
|B0002E5266|  5.0|       AFS3FQR5JSDVJ|      471.0|    72.0| 4.4247346|
+----------+-----+--------------------+-----------+--------+----------+
only showing top 5 rows

RMSE: 1.0825
Total time: 24.21 seconds


In [31]:
user_recom = model.recommendForAllUsers(5)
for user in user_recommendation.head(5):
    print(user)
    print('')

Row(user_idx=471, recommendations=[Row(product_idx=142, rating=4.778502464294434), Row(product_idx=481, rating=4.7573323249816895), Row(product_idx=275, rating=4.756866931915283), Row(product_idx=25, rating=4.7491631507873535), Row(product_idx=117, rating=4.705659866333008)], recommendation_product=[['B0002E1O5E', '4.778502464294434'], ['B0002KZFYE', '4.7573323249816895'], ['B0002E1O7W', '4.756866931915283'], ['B0002E2XCW', '4.7491631507873535'], ['B003LTJ404', '4.705659866333008']], user_id='A2QPZKGUDE7RJ4')

Row(user_idx=1342, recommendations=[Row(product_idx=460, rating=5.135222434997559), Row(product_idx=794, rating=5.085332870483398), Row(product_idx=579, rating=5.07351541519165), Row(product_idx=811, rating=5.052153587341309), Row(product_idx=802, rating=5.050059795379639)], recommendation_product=[['B0002CZVHI', '5.135222434997559'], ['B00135HFK4', '5.085332870483398'], ['B0002E4Z8M', '5.07351541519165'], ['B001L8KE06', '5.052153587341309'], ['B001CJ2QZU', '5.050059795379639']],

In [16]:
product_label = pre_pipeline_fitted.stages[0].labels
user_label = pre_pipeline_fitted.stages[1].labels

In [33]:
def convert_product(list_tuple, product_label):
    converted = []
    for product_idx, rating in list_tuple:
        converted.append((product_label[product_idx], rating))
    return converted

convert_recom = udf(lambda x: convert_product(x, product_label), ArrayType(ArrayType(StringType())))
user_recom = user_recom.withColumn('recommendation_product', convert_recom('recommendations'))

convert_user = udf(lambda x: user_label[x], StringType())
user_recom = user_recom.withColumn('user_id', convert_user('user_idx'))

user_recom_by_id = user_recom.select('user_id', 'recommendation_product')
user_recom_by_id.toPandas()

Unnamed: 0,user_id,recommendation_product
0,A2QPZKGUDE7RJ4,"[[B0002E1O5E, 4.778502464294434], [B0002KZFYE,..."
1,AG8FGAIPRBKIR,"[[B0002CZVHI, 5.135222434997559], [B00135HFK4,..."
2,A2KI91IR3RA7D0,"[[B0000AQRST, 4.859004020690918], [B0033P1O6S,..."
3,AJ03X8Z6RZZ27,"[[B002HFC6P8, 4.6573486328125], [B000PR3JEM, 4..."
4,A3AGHEETVTCBQI,"[[B000SZVYLQ, 5.173684120178223], [B00H02C9TG,..."
...,...,...
1424,A1Q94IXHEGQDV7,"[[B0002KZFYE, 5.246941566467285], [B0002E4Z8M,..."
1425,AWBZIK5JYWB5J,"[[B009E3EWPI, 4.632481098175049], [B0015RIN6U,..."
1426,A3R8XCQYDNQIAA,"[[B000VTPR08, 5.121089935302734], [B000EEHCUS,..."
1427,A2ZBSEZ211C4QW,"[[B0046A04LU, 5.138688564300537], [B0002KZFYE,..."
