In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [None]:
# Mount Google Drive inside the Colab VM (this import only exists on Colab).
from google.colab import drive
drive.mount("/content/gdrive")
# Work from the project folder so the relative 'data/...' path used when reading the CSV resolves.
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/LDS9_K265_TranHoangBach_Cuoi_ky'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/LDS9_K265_TranHoangBach_Cuoi_ky


In [None]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pandas as pd

%matplotlib inline

In [None]:
# Create (or reuse) the Spark session for this notebook, running in local mode
# on all available cores.
# NOTE(review): in local mode the executor-specific settings below may have no
# effect, and 10g heap + 10g off-heap may exceed the RAM of the VM - confirm.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("New-Spark")
    .config("spark.memory.fraction", 0.8)
    .config("spark.executor.memory", "10g")
    .config("spark.driver.memory", "10g")
    .config("spark.sql.shuffle.partitions", "800")
    .config("spark.memory.offHeap.enabled", 'true')
    .config("spark.memory.offHeap.size", "10g")
    .config("spark.executor.cores", "4")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.default.parallelism", "4")
    .getOrCreate()
)
# Display the session summary as the cell output.
spark

Đọc data, đổi tên columns mặc định

In [None]:
# Load the header-less ratings CSV; Spark names the columns _c0, _c1, _c2, ...
ratings_raw = spark.read.csv('data/ratings_Office_Products.csv', inferSchema=True, header=False)

# Keep only the three columns needed and give them meaningful names in one pass:
# _c1 -> product_id, _c0 -> user_id, _c2 -> label (the rating).
df = ratings_raw.select(
    ratings_raw['_c1'].alias('product_id'),
    ratings_raw['_c0'].alias('user_id'),
    ratings_raw['_c2'].alias('label'),
)
df.show(5)

+----------+--------------+-----+
|product_id|       user_id|label|
+----------+--------------+-----+
|0078800242|A2UESEUCI73CBO|  5.0|
|0113000316|A3BBNK2R5TUYGV|  5.0|
|0113000316| A5J78T14FJ5DU|  3.0|
|043928631X|A2P462UH5L6T57|  5.0|
|0439340039|A2E0X1MWNRTQF4|  1.0|
+----------+--------------+-----+
only showing top 5 rows



In [None]:
# Number of rating rows in the DataFrame.
df.count()

1243186

In [None]:
# Check the column types produced by schema inference.
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- label: double (nullable = true)



In [None]:
# Count the distinct products and distinct users (one Spark job per column).
n_product, n_user = (
    df.select(id_col).distinct().count()
    for id_col in ('product_id', 'user_id')
)
print(n_product, n_user)

130006 909314


In [None]:
# Number of cells a dense product x user rating matrix would have.
n_product * n_user

118216275884

Pre-processing convert user và product sang index

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.pipeline import Pipeline

# One StringIndexer per id column: '<entity>_id' (string) -> '<entity>_idx' (numeric index).
indexer_product, indexer_user = (
    StringIndexer(inputCol=f'{entity}_id', outputCol=f'{entity}_idx')
    for entity in ('product', 'user')
)

# Fit both indexers on the full data set so train and test share the same index mapping.
pre_pipeline = Pipeline(stages=[indexer_product, indexer_user])
pre_pipeline_fitted = pre_pipeline.fit(df)
final_df = pre_pipeline_fitted.transform(df)

# 90% / 10% train-test split with a fixed seed.
split_weights = [0.9, 0.1]
train_df, test_df = final_df.randomSplit(split_weights, seed=42)
final_df.show(5)

+----------+--------------+-----+-----------+--------+
|product_id|       user_id|label|product_idx|user_idx|
+----------+--------------+-----+-----------+--------+
|0078800242|A2UESEUCI73CBO|  5.0|    71702.0|112059.0|
|0113000316|A3BBNK2R5TUYGV|  5.0|    51103.0|621354.0|
|0113000316| A5J78T14FJ5DU|  3.0|    51103.0| 26236.0|
|043928631X|A2P462UH5L6T57|  5.0|    71703.0|500720.0|
|0439340039|A2E0X1MWNRTQF4|  1.0|    71704.0|440389.0|
+----------+--------------+-----+-----------+--------+
only showing top 5 rows



Build Recommendation model

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from sklearn.metrics import mean_squared_error
import time

# Wall-clock timer covering both training and evaluation.
tic = time.time()

# Explicit-feedback ALS (implicitPrefs is left at its default, False).
als = ALS(maxIter=15,                # number of ALS iterations
          regParam=0.1,              # regularization strength
          rank=25,                   # number of latent factors per user / item
          numItemBlocks=10,          # number of blocks the items are partitioned into
          alpha=0.05,                # implicit-feedback confidence; NOT a learning rate and
                                     # only relevant when implicitPrefs=True
          coldStartStrategy='drop',  # drop test rows whose user/item has no learned factors
          nonnegative=True,          # constrain the factors to be non-negative
          seed=42,                   # fixed seed so factor initialisation is reproducible
          userCol='user_idx',
          itemCol='product_idx',
          ratingCol='label')
model = als.fit(train_df)

# Score the hold-out set; cache because the predictions are an expensive result
# that may be inspected again after the evaluation below.
predictions = model.transform(test_df)
predictions.cache()

# Column names are spelled out so the metric does not silently depend on defaults.
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='label',
                                predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('RMSE: {:.4f}'.format(rmse))

toc = time.time()
print('Total time: {:.2f} seconds'.format(toc-tic))

RMSE: 1.7812
Total time: 1246.19 seconds


Tạo Recommendations cho 3 users

In [None]:
# Raw ids of the users we want recommendations for.
user_list = ['A00473363TJ8YSZ3YAGG9', 'A335QXPTV1RIV1', 'ATIMW8SYGAASW']

# Build the single-column Spark DataFrame directly (no pandas round-trip needed).
user_id_df = spark.createDataFrame([(user_id,) for user_id in user_list], ['user_id'])

# Map raw ids to the numeric index the model was trained on, using the fitted
# user StringIndexer (second stage of the preprocessing pipeline).
user_df = pre_pipeline_fitted.stages[1].transform(user_id_df).select('user_idx')

# Top-5 products per requested user. DataFrames are lazy, so without cache()
# every later action on user_recom would re-run the whole recommendation job.
user_recom = model.recommendForUserSubset(user_df, 5).cache()
user_recom.show()

+--------+--------------------+
|user_idx|     recommendations|
+--------+--------------------+
|    2602|[{87758, 7.368137...|
|    2132|[{94647, 6.631931...|
|   10981|[{87758, 6.271669...|
+--------+--------------------+



Viết functions convert user và recommendation_product

In [13]:
# The fitted pipeline holds the product indexer first and the user indexer second;
# each model's `labels` list maps a numeric index back to the original string id.
product_indexer_model, user_indexer_model = pre_pipeline_fitted.stages
product_label = product_indexer_model.labels
user_label = user_indexer_model.labels

def convert_product(list_tuple, product_label):
    """Translate (product index, rating) pairs into (product id, rating) pairs.

    Args:
        list_tuple: iterable of 2-item sequences ``(product_idx, rating)``, or
            ``None`` (e.g. a null array column value).
        product_label: sequence in which position ``i`` holds the original
            product id for index ``i``.

    Returns:
        A list of ``(product_id, rating)`` tuples in the input order, or
        ``None`` when ``list_tuple`` is ``None``.

    Raises:
        IndexError: if an index is outside ``product_label``.
    """
    # Null in -> null out, instead of failing with "NoneType is not iterable".
    if list_tuple is None:
        return None
    # int() lets indices that arrive as floats (e.g. 71702.0) be used for lookup.
    return [(product_label[int(product_idx)], rating)
            for product_idx, rating in list_tuple]

# Share the (large) label lists with the Python workers through explicit broadcast
# variables rather than relying on capturing the lists in the UDF closures.
product_label_bc = spark.sparkContext.broadcast(product_label)
user_label_bc = spark.sparkContext.broadcast(user_label)

# Each recommendation is a (product_id, rating) struct. Declaring the element as an
# array of strings would turn the numeric rating into a string.
recommendation_type = ArrayType(StructType([
    StructField('product_id', StringType()),
    StructField('rating', DoubleType()),
]))

# UDFs mapping numeric indices back to the original string ids.
convert_recom = udf(lambda recs: convert_product(recs, product_label_bc.value),
                    recommendation_type)
convert_user = udf(lambda user_idx: user_label_bc.value[user_idx], StringType())

user_recom = (
    user_recom
    .withColumn('recommendation_product', convert_recom('recommendations'))
    .withColumn('user_id', convert_user('user_idx'))
)

# Final, human-readable view: original user id plus its recommended products.
user_recom_by_id = user_recom.select('user_id', 'recommendation_product')
user_recom_by_id.toPandas()

Unnamed: 0,user_id,recommendation_product
0,ATIMW8SYGAASW,"[[B002N93HB0, 7.368137359619141], [B001S4HW44,..."
1,A335QXPTV1RIV1,"[[B0040QV26Q, 6.631930828094482], [B007X9SIP0,..."
2,A00473363TJ8YSZ3YAGG9,"[[B002N93HB0, 6.271668910980225], [B001GS0FZK,..."


Nhận xét:
- Model cho test RMSE ≈ 1.78, kết quả này chưa được tối ưu (tốt nhất là < 1). Nếu có nhiều thời gian hơn thì có thể lấy sample 10% và tuning parameter cho ALS, tuy nhiên thời gian chạy vẫn sẽ rất lâu.
- Bên cạnh đó, thời gian chạy recommendation rất lâu, khoảng tầm 1 tiếng, điều này sẽ không thể sử dụng được trong thực tế do cần yêu cầu recommendation dưới 1-2 seconds
- Bên cạnh đó thời gian convert sang tên cũng rất lâu, hơn 1h30 phút.
- Code convert đã được viết lại sao cho dễ nhìn và đồng thời tận dụng hết được parallel computing, tuy nhiên tốc độ vẫn rất chậm.
- Trong thực tế có thể cần sử dụng technique khác để tăng tốc độ recommendation tốt hơn cho bài toán, cũng như convert ra kết quả cuối cùng cho người sử dụng.
- Một điểm cần lưu ý thêm: rating dự đoán bởi model recommendation cho một vài product lớn hơn 5.