In [2]:
import findspark
findspark.init()

In [3]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
from pyspark.sql.types import *

In [4]:
sc = SparkContext()
spark = SparkSession(sc)

In [9]:
data = spark.read.csv('Reviews.csv', header = True)

In [10]:
data.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- title: string (nullable = true)
 |-- content: string (nullable = true)



In [11]:
data.show(5)

+---+-----------+----------+-----------------+------+------------------+--------------------+
|_c0|customer_id|product_id|             name|rating|             title|             content|
+---+-----------+----------+-----------------+------+------------------+--------------------+
|  0|     709310|  10001012| Lân Nguyễn Hoàng|     3|Ko dùng đc thẻ nhớ|Lúcđầu quên thông...|
|  1|   10701688|  10001012| Nguyễn Khánh Hòa|     5|   Cực kì hài lòng|Tiki giao hàng nh...|
|  2|   11763074|  10001012|  Toàn Phạm Khánh|     5|   Cực kì hài lòng|chất lượng camera...|
|  3|    9909549|  10001012|Nguyen Quang Minh|     5|      Rất hài lòng|Hàng được đóng gó...|
|  4|    1827148|  10001012|      Phạm Bá Đức|     5|   Cực kì hài lòng|dễ cài đặt, chất ...|
+---+-----------+----------+-----------------+------+------------------+--------------------+
only showing top 5 rows



In [12]:
data_sub = data[['customer_id', 'product_id', 'rating']]
data_sub.show(5)

+-----------+----------+------+
|customer_id|product_id|rating|
+-----------+----------+------+
|     709310|  10001012|     3|
|   10701688|  10001012|     5|
|   11763074|  10001012|     5|
|    9909549|  10001012|     5|
|    1827148|  10001012|     5|
+-----------+----------+------+
only showing top 5 rows



In [14]:
data_sub = data_sub.withColumn('rating', data_sub['rating'].cast(DoubleType()))

In [15]:
data_sub.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- rating: double (nullable = true)



In [16]:
data_sub.show(5)

+-----------+----------+------+
|customer_id|product_id|rating|
+-----------+----------+------+
|     709310|  10001012|   3.0|
|   10701688|  10001012|   5.0|
|   11763074|  10001012|   5.0|
|    9909549|  10001012|   5.0|
|    1827148|  10001012|   5.0|
+-----------+----------+------+
only showing top 5 rows



In [17]:
users = data_sub.select('customer_id').distinct().count()
products = data_sub.select('product_id').distinct().count()
numerator = data_sub.count()

In [18]:
denominator = users * products
denominator

1167087984

In [19]:
sparsity = 1 - (numerator * 1.0 / denominator)

print('Sparsity: {}'.format(sparsity))

Sparsity: 0.9996891596820691


In [27]:
# from pyspark.ml.feature import StringIndexer
# from pyspark.ml import Pipeline
# from pyspark.ml.recommendation import ALS
# from pyspark.ml.evaluation import RegressionEvaluator

# indexer = StringIndexer(inputCol = 'product_id', outputCol = 'product_idx')
# indexer_model = indexer.fit(data_sub)
# data_indexed = indexer_model.transform(data_sub)

# indexer1 = StringIndexer(inputCol = 'customer_id', outputCol = 'customer_idx')
# indexer1_model = indexer1.fit(data_indexed)
# data_indexed = indexer1_model.transform(data_indexed)

In [22]:
data_indexed.show(5)

+-----------+----------+------+-----------+
|customer_id|product_id|rating|product_idx|
+-----------+----------+------+-----------+
|     709310|  10001012|   3.0|     2456.0|
|   10701688|  10001012|   5.0|     2456.0|
|   11763074|  10001012|   5.0|     2456.0|
|    9909549|  10001012|   5.0|     2456.0|
|    1827148|  10001012|   5.0|     2456.0|
+-----------+----------+------+-----------+
only showing top 5 rows



In [23]:
data_indexed.count()

362778

In [24]:
train_data, test_data = data_indexed.randomSplit([0.8, 0.2], seed = 42)