## Import libraries

In [1]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

## Create Spark Session

In [2]:
def create_spark():
    """ Create a SparkSession object. """
    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("TestSuite") \
        .config(key='spark.sql.shuffle.partitions', value='4') \
        .config(key='spark.default.parallelism', value='4') \
        .config(key='spark.sql.session.timeZone', value='UTC') \
        .config(key='spark.ui.enabled', value='false') \
        .config(key='spark.app.id', value='Test') \
        .config(key='spark.driver.host', value='localhost') \
        .getOrCreate()

    return spark

In [3]:
spark = create_spark()

## Load Dataset

In [9]:
path_to_dataset = '../../dataset/amazon_electronics.csv'
amazon_spark_df = spark.read.csv(path_to_dataset, header=True, inferSchema=True)

In [10]:
amazon_spark_df.show(5)

+--------------+----------+------+----------+
|       user_id|product_id|rating|timestamp |
+--------------+----------+------+----------+
| AKM1MP6P0OYPR| 132793040|     5|1365811200|
|A2CX7LUOHB2NDG| 321732944|     5|1341100800|
|A2NWSAGRHCP8N5| 439886341|     1|1367193600|
|A2WNBOD3WNDNKT| 439886341|     3|1374451200|
|A1GI0U4ZRJA8WN| 439886341|     1|1334707200|
+--------------+----------+------+----------+
only showing top 5 rows


## Data Preprocessing

In [11]:
user_indexer = StringIndexer(inputCol='user_id', outputCol='user_index')
product_indexer = StringIndexer(inputCol='product_id', outputCol='product_index')

indexed_data = user_indexer.fit(amazon_spark_df).transform(amazon_spark_df)
indexed_data = product_indexer.fit(indexed_data).transform(indexed_data)

## Split Dataset

In [12]:
train, test = indexed_data.randomSplit([0.8, 0.2], seed=42)

## Build ALS model

In [None]:
als = ALS(userCol='user_index',
          itemCol='product_index',
          ratingCol='rating',
          coldStartStrategy='drop',
          nonnegative=True)

model = als.fit(train)

## Evaluation

In [None]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(labelCol='rating', metricName='rmse')
rmse_score = evaluator.evaluate(predictions)
print('RMSE: ', rmse_score)

In [None]:
# 26/01/11 13:09:23 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
# 26/01/11 13:09:26 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
# 26/01/11 13:09:28 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
# 26/01/11 13:09:40 WARN DAGScheduler: Broadcasting large task binary with size 36.1 MiB
# 26/01/11 13:09:43 WARN DAGScheduler: Broadcasting large task binary with size 13.4 MiB
# 26/01/11 13:09:45 WARN DAGScheduler: Broadcasting large task binary with size 13.4 MiB
# 26/01/11 13:09:46 WARN DAGScheduler: Broadcasting large task binary with size 13.4 MiB
# [Stage 224:>                                                        (0 + 4) / 4]
# RMSE:  1.960614091465835

## Show first 5 predictions

In [None]:
predictions.select('user_id', 'product_id', 'rating', 'prediction').show(5)

In [None]:
# +--------------+----------+------+----------+
# |       user_id|product_id|rating|prediction|
# +--------------+----------+------+----------+
# | A680RUE1FDO8B|B0006ZKX7Y|     5|  4.024588|
# | A680RUE1FDO8B|B000F9WYKA|     5|  4.008407|
# | A680RUE1FDO8B|B000G0JD0C|     5| 3.6363602|
# | A680RUE1FDO8B|B000I4PS3W|     5|   4.18065|
# |A1F9Z42CFF9IAY|B00009KAPW|     1| 3.6061687|
# +--------------+----------+------+----------+
# only showing top 5 rows