In [1]:
# !apt-get install openjdk-8-jdk-headless -qq > /dev/null
# !wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
# !tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
# !pip install -q findspark
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
# import findspark
# findspark.init()

In [2]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

In [5]:
spark = SparkSession.builder.appName('Recommendation_Tiki_new').getOrCreate()

In [6]:
# Load data
reviews = spark.read.csv('Reviews.csv', inferSchema= True ,header= True)
reviews.show(5)

+-----------+----------+-----------------+------+------------------+
|customer_id|product_id|             name|rating|             title|
+-----------+----------+-----------------+------+------------------+
|     709310|  10001012| Lân Nguyễn Hoàng|     3|Ko dùng đc thẻ nhớ|
|   10701688|  10001012| Nguyễn Khánh Hòa|     5|   Cực kì hài lòng|
|   11763074|  10001012|  Toàn Phạm Khánh|     5|   Cực kì hài lòng|
|    9909549|  10001012|Nguyen Quang Minh|     5|      Rất hài lòng|
|    1827148|  10001012|      Phạm Bá Đức|     5|   Cực kì hài lòng|
+-----------+----------+-----------------+------+------------------+
only showing top 5 rows



In [7]:
reviews.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- title: string (nullable = true)



In [8]:
reviews.count()

293153

In [9]:
reviews_sub = reviews.select(['customer_id', 'product_id', 'rating'])
reviews_sub.show(5)

+-----------+----------+------+
|customer_id|product_id|rating|
+-----------+----------+------+
|     709310|  10001012|     3|
|   10701688|  10001012|     5|
|   11763074|  10001012|     5|
|    9909549|  10001012|     5|
|    1827148|  10001012|     5|
+-----------+----------+------+
only showing top 5 rows



In [10]:
reviews_sub.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- rating: integer (nullable = true)



In [11]:
# chuyển kiểu dữ liệu của customer_id và product_id về kiểu số
reviews_sub = reviews_sub.withColumn("customer_id", reviews_sub["customer_id"].cast(DoubleType()))
reviews_sub = reviews_sub.withColumn("product_id", reviews_sub["product_id"].cast(DoubleType()))
reviews_sub.printSchema()

root
 |-- customer_id: double (nullable = true)
 |-- product_id: double (nullable = true)
 |-- rating: integer (nullable = true)



In [12]:
# Check NaN Values
reviews_sub.select([count(when(isnan(c), c)).alias(c) 
                 for c in reviews_sub.columns]).toPandas().T
# dữ liệu không có giá trị NaN

Unnamed: 0,0
customer_id,0
product_id,0
rating,0


In [13]:
# Check Null Values
reviews_sub.select([count(when(col(c).isNull(), c)).alias(c) for c in
 reviews_sub.columns]).toPandas().T

Unnamed: 0,0
customer_id,32
product_id,32
rating,62


In [14]:
# loại bỏ Null
reviews_sub = reviews_sub.dropna()

In [15]:
# Check Null Values
reviews_sub.select([count(when(col(c).isNull(), c)).alias(c) for c in
 reviews_sub.columns]).toPandas().T
# không còn giá trị null

Unnamed: 0,0
customer_id,0
product_id,0
rating,0


In [16]:
reviews.count()

293153

In [17]:
# Distinct
customer_id = reviews_sub.select('customer_id').distinct().count()
product_id = reviews_sub.select('product_id').distinct().count()
numerator = reviews_sub.count()

In [18]:
display(numerator, customer_id, product_id)

293091

212644

4171

In [19]:
# Number of rating matrix could contain if no empty cells
denominator = customer_id * product_id
denominator

886938124

In [20]:
# calculating sparsity
sparsity = 1 - (numerator*1.0/denominator)
print('Sparsity:'), sparsity

Sparsity:


(None, 0.9996695474102768)

In [21]:
# Split the data to training and testing sets
(training, test) = reviews_sub.randomSplit([0.8, 0.2])

## Build model & Evaluation

In [22]:
als = ALS(maxIter=18, # Số lần lặp, defaults = 10
          regParam=0.4, # Mong muốn độ lỗi ít nhất có thể
          rank= 25,
          userCol='customer_id',
          itemCol='product_id',
          ratingCol='rating',
          coldStartStrategy='drop', # Không có giá trị NAN
          nonnegative=True) # Không chấp nhận đề xuất là giá trị âm
model = als.fit(training)

In [23]:
# evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [24]:
predictions.show(5)

+-----------+-----------+------+----------+
|customer_id| product_id|rating|prediction|
+-----------+-----------+------+----------+
|  1236658.0|7.1896003E7|     4| 3.0756898|
|  8057326.0|  2774881.0|     5|  4.159753|
|1.3676965E7|  2069769.0|     4|   2.55822|
|1.5262963E7|  2774881.0|     4| 3.7336826|
|1.7728182E7|  2774881.0|     5|  4.163211|
+-----------+-----------+------+----------+
only showing top 5 rows



In [25]:
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='rating',
                                predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('Root-mean-square error = ' + str(rmse))

Root-mean-square error = 1.1519259907016253


In [26]:
# # save model 
# model.save('Tiki_Recommendater_System')

## Providing Recommendations: for all customers

In [27]:
# get 20 recommendations with have highest rating
cus_recs = model.recommendForAllUsers(5)

In [28]:
cus_recs.show(10, False)

+-----------+-------------------------------------------------------------------------------------------------------------------+
|customer_id|recommendations                                                                                                    |
+-----------+-------------------------------------------------------------------------------------------------------------------+
|471        |[{76732229, 5.126761}, {71293311, 5.1080756}, {67216147, 4.924155}, {73844240, 4.851091}, {32198166, 4.8349047}]   |
|588        |[{76732229, 4.141814}, {71293311, 4.0359387}, {67216147, 3.9531202}, {73788531, 3.914941}, {69196844, 3.902448}]   |
|808        |[{76732229, 4.008161}, {71293311, 3.991675}, {46134868, 3.8947685}, {67216147, 3.8520577}, {73830099, 3.7972593}]  |
|833        |[{76732229, 5.060788}, {71293311, 5.0395975}, {67216147, 4.870337}, {65948026, 4.791558}, {73844240, 4.7876053}]   |
|1468       |[{76732229, 2.0203059}, {71293311, 1.9951096}, {67216147, 1.9210422}, {523417

In [29]:
cus_recs.count()

177805

In [30]:
for cus in cus_recs.head(5):
    print(cus)
    print('\n')

Row(customer_id=471, recommendations=[Row(product_id=76732229, rating=5.126760959625244), Row(product_id=71293311, rating=5.1080756187438965), Row(product_id=67216147, rating=4.924155235290527), Row(product_id=73844240, rating=4.851090908050537), Row(product_id=32198166, rating=4.834904670715332)])


Row(customer_id=588, recommendations=[Row(product_id=76732229, rating=4.141814231872559), Row(product_id=71293311, rating=4.035938739776611), Row(product_id=67216147, rating=3.953120231628418), Row(product_id=73788531, rating=3.9149410724639893), Row(product_id=69196844, rating=3.9024479389190674)])


Row(customer_id=808, recommendations=[Row(product_id=76732229, rating=4.0081610679626465), Row(product_id=71293311, rating=3.9916749000549316), Row(product_id=46134868, rating=3.894768476486206), Row(product_id=67216147, rating=3.852057695388794), Row(product_id=73830099, rating=3.7972593307495117)])


Row(customer_id=833, recommendations=[Row(product_id=76732229, rating=5.060788154602051), R

In [31]:
recommendation = cus_recs.select(cus_recs.customer_id, explode(cus_recs.recommendations))
recommendation = recommendation.withColumn('product_id_rec', recommendation.col.getField("product_id"))\
                  .withColumn('rating_rec', recommendation.col.getField("rating"))

recommendation.show()

+-----------+--------------------+--------------+----------+
|customer_id|                 col|product_id_rec|rating_rec|
+-----------+--------------------+--------------+----------+
|        471|{76732229, 5.126761}|      76732229|  5.126761|
|        471|{71293311, 5.1080...|      71293311| 5.1080756|
|        471|{67216147, 4.924155}|      67216147|  4.924155|
|        471|{73844240, 4.851091}|      73844240|  4.851091|
|        471|{32198166, 4.8349...|      32198166| 4.8349047|
|        588|{76732229, 4.141814}|      76732229|  4.141814|
|        588|{71293311, 4.0359...|      71293311| 4.0359387|
|        588|{67216147, 3.9531...|      67216147| 3.9531202|
|        588|{73788531, 3.914941}|      73788531|  3.914941|
|        588|{69196844, 3.902448}|      69196844|  3.902448|
|        808|{76732229, 4.008161}|      76732229|  4.008161|
|        808|{71293311, 3.991675}|      71293311|  3.991675|
|        808|{46134868, 3.8947...|      46134868| 3.8947685|
|        808|{67216147, 

In [32]:
recommendation = recommendation.toPandas()
recommendation.head()

Unnamed: 0,customer_id,col,product_id_rec,rating_rec
0,471,"(76732229, 5.126760959625244)",76732229,5.126761
1,471,"(71293311, 5.1080756187438965)",71293311,5.108076
2,471,"(67216147, 4.924155235290527)",67216147,4.924155
3,471,"(73844240, 4.851090908050537)",73844240,4.851091
4,471,"(32198166, 4.834904670715332)",32198166,4.834905


### Kết hợp product_id với item_id của product data

In [33]:
products = pd.read_csv('Products.csv')
products.head()

Unnamed: 0,item_id,name,description,rating,price,list_price,brand,group,url,image
0,48102821,tai bluetooth inpods cảm biến vân chống màu sắ...,dung pin thời gian pin thời gian nhạc liên tục...,4.0,77000,300000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-inpods-12-cam-bien-...,https://salt.tikicdn.com/cache/280x280/ts/prod...
1,52333193,tai bluetooth không dây true wireless dock sạc...,dung pin thời gian pin thời gian nhạc thời gia...,4.5,132000,750000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-khong-day-f9-true-w...,https://salt.tikicdn.com/cache/280x280/ts/prod...
2,299461,chuột không dây logitech silent hàng hãng,logitech độ phân giải quang học model lưu kéo ...,4.8,299000,399000,Logitech,Thiết Bị Số - Phụ Kiện Số/Phụ kiện máy tính và...,https://chuot-khong-day-logitech-m331-silent-p...,https://salt.tikicdn.com/cache/280x280/media/c...
3,57440329,loa bluetooth kiêm đồng hồ báo thức robot hàng...,acome indonesia trung quốc sku indonesia tình ...,4.7,149000,350000,Acome,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://loa-bluetooth-5-0-kiem-dong-ho-bao-thu...,https://salt.tikicdn.com/cache/280x280/ts/prod...
4,38458616,tai bluetooth apple airpods pro true wireless ...,apple mỹ tai mm hộp sạc mm model tai g hộp sạc...,4.8,5090000,8500000,Apple,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-apple-airpods-pro-t...,https://salt.tikicdn.com/cache/280x280/ts/prod...


In [35]:
data_recommendation = pd.merge(recommendation, products, how='left', left_on='product_id_rec', right_on='item_id')
data_recommendation.head()

Unnamed: 0,customer_id,col,product_id_rec,rating_rec,item_id,name,description,rating,price,list_price,brand,group,url,image
0,471,"(76732229, 5.126760959625244)",76732229,5.126761,76732229,máy nóng ariston sbs vn,bơm trợ lực ariston công suất điện áp cxnxs x ...,5.0,2739000,4590000,Ariston,Điện Tử - Điện Lạnh/Máy nước nóng,https://may-nuoc-nong-ariston-smc45pe-sbs-vn-4...,https://salt.tikicdn.com/cache/280x280/ts/prod...
1,471,"(71293311, 5.1080756187438965)",71293311,5.108076,71293311,trọn camera ip wifi ezviz cube full hd thẻ hik...,ezviz trung quốc trung quốc sku ca m biê n pro...,5.0,599000,899000,EZVIZ,Máy Ảnh - Máy Quay Phim/Camera Giám Sát/Camera IP,https://tron-bo-camera-ip-wifi-ezviz-cube-c1c-...,https://salt.tikicdn.com/cache/280x280/ts/prod...
2,471,"(67216147, 4.924155235290527)",67216147,4.924155,67216147,máy đọc sách kindle paperwhite refurbished hàn...,thời gian pin tuần amazon mỹ thời gian sạc hỗ ...,0.0,2379000,2990000,Amazon,Điện Thoại - Máy Tính Bảng/Máy đọc sách,https://may-doc-sach-kindle-paperwhite-2018-7t...,https://salt.tikicdn.com/cache/280x280/ts/prod...
3,471,"(73844240, 4.851090908050537)",73844240,4.851091,73844240,case máy mik nexus m hàng hãng,nexus đài loan chất liệu nhựa thép l w h model...,5.0,590000,690000,Nexus,Laptop - Máy Vi Tính - Linh kiện/Linh Kiện Máy...,https://case-may-tinh-mik-nexus-m-hang-chinh-h...,https://salt.tikicdn.com/cache/280x280/ts/prod...
4,471,"(32198166, 4.834904670715332)",32198166,4.834905,32198166,ổ cứng di động wd my book usb hàng nhập khẩu,wd dung ổ cứng sku sản phẩm wd my book inch gi...,4.3,3000000,4320000,WD,Laptop - Máy Vi Tính - Linh kiện/Thiết Bị Lưu ...,https://o-cung-di-dong-wd-my-book-4tb-3-5-usb-...,https://salt.tikicdn.com/cache/280x280/ts/prod...


## Give recommendation for any customer

In [36]:
# Give recommender for a user
customer_id = 709310

results = data_recommendation[data_recommendation['customer_id'] == customer_id]
results

Unnamed: 0,customer_id,col,product_id_rec,rating_rec,item_id,name,description,rating,price,list_price,brand,group,url,image
56975,709310,"(76732229, 2.918865203857422)",76732229,2.918865,76732229,máy nóng ariston sbs vn,bơm trợ lực ariston công suất điện áp cxnxs x ...,5.0,2739000,4590000,Ariston,Điện Tử - Điện Lạnh/Máy nước nóng,https://may-nuoc-nong-ariston-smc45pe-sbs-vn-4...,https://salt.tikicdn.com/cache/280x280/ts/prod...
56976,709310,"(71293311, 2.8983428478240967)",71293311,2.898343,71293311,trọn camera ip wifi ezviz cube full hd thẻ hik...,ezviz trung quốc trung quốc sku ca m biê n pro...,5.0,599000,899000,EZVIZ,Máy Ảnh - Máy Quay Phim/Camera Giám Sát/Camera IP,https://tron-bo-camera-ip-wifi-ezviz-cube-c1c-...,https://salt.tikicdn.com/cache/280x280/ts/prod...
56977,709310,"(67216147, 2.839005470275879)",67216147,2.839005,67216147,máy đọc sách kindle paperwhite refurbished hàn...,thời gian pin tuần amazon mỹ thời gian sạc hỗ ...,0.0,2379000,2990000,Amazon,Điện Thoại - Máy Tính Bảng/Máy đọc sách,https://may-doc-sach-kindle-paperwhite-2018-7t...,https://salt.tikicdn.com/cache/280x280/ts/prod...
56978,709310,"(73844240, 2.8042171001434326)",73844240,2.804217,73844240,case máy mik nexus m hàng hãng,nexus đài loan chất liệu nhựa thép l w h model...,5.0,590000,690000,Nexus,Laptop - Máy Vi Tính - Linh kiện/Linh Kiện Máy...,https://case-may-tinh-mik-nexus-m-hang-chinh-h...,https://salt.tikicdn.com/cache/280x280/ts/prod...
56979,709310,"(10001012, 2.7574002742767334)",10001012,2.7574,10001012,camera ip wifi trời ezviz cs hàng hãng,ezviz khối gói hàng kg đóng gói cm x x model e...,5.0,789000,1746000,EZVIZ,Máy Ảnh - Máy Quay Phim/Camera Giám Sát/Camera IP,https://camera-ip-wifi-ngoai-troi-ezviz-cs-cv3...,https://salt.tikicdn.com/cache/280x280/ts/prod...


In [None]:
# # Give recommender for a user
# customer_id = 709310

# results = cus_recs.filter(cus_recs['customer_id'] == customer_id)
# results.show(truncate = False)

In [None]:
# result = []
# for user in results.collect():
#     lst = []
#     for row in user['recommendations']:
#         print(row)
#         row_f = reviews_sub.filter(reviews_sub.product_id == row['product_id'])  
#         lst.append((row['product_id'], row['rating']))
#     dic_user_rec = {'customer_id' : user.customer_id, 'recommendations' :lst}
#     result = dic_user_rec

In [None]:
# results.printSchema()

In [None]:
# results.show(2)

In [None]:
# # Chuẩn hóa dữ liệu cho user
# from pyspark.sql.functions import col, explode

# results = results.select(results.customer_id, explode(results.recommendations))
# results = results.withColumn('product_id', results.col.getField("product_id"))\
#                   .withColumn('rating', results.col.getField("rating"))

# results.show()

In [None]:
# Lọc đề xuất trên ngưỡng
# results.filter(results.rating >= 3.0).show()

In [None]:
# Save
# cus_recs.write.parquet('user_recs.parquet', mode='ovewrite') # Có product_id và rating tương ứng với user_id bất kì