### Invoke SparkContext

In [1]:
import findspark
findspark.init("C:/Users/Jonas/spark")
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName('Spark Lab1')
sc = SparkContext(conf=conf)
sc

### SparkSession

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Spark Project") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [3]:
spark.sparkContext

### Load data

In [4]:
from pyspark import SQLContext
reviews = spark.read.options(header=True).csv("amazon_reviews_us_Mobile_Electronics_v1_00.tsv", sep="\t")

### See structure of data

In [5]:
reviews.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



### Display data and show significant attributes

In [6]:
from pyspark.sql.functions import *

reviews.show()

reviews.groupBy('star_rating').count().orderBy(desc('count')).show()

reviews.groupBy('customer_id').count().orderBy(desc('count')).show()

#reviews.select('product_id').distinct().count()

+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|  product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   20422322| R8MEA6IGAHO0B|B00MC4CED8|     217304173|BlackVue DR600GW-PMP|Mobile_Electronics|          5|            0|          0|   N|                Y|         Very Happy!|As advertised. Ev...| 2015-08-31|
|         US|   40835037|R31LOQ8JGLPRLK|B00OQMFG1Q|     137313254|GENSSI GSM / GPS ...|Mobile_Electronics|      

### See that there are more product id's than product titles

In [7]:
reviews.select('product_title').distinct().count()

24770

In [8]:
reviews.select('product_id').distinct().count()

25801

### Drop attributes of no importance

In [9]:
reviews = reviews.drop('vine', 'marketplace', 'helpful_votes', 'review_date', 'product_category')
reviews.show()

+-----------+--------------+----------+--------------+--------------------+-----------+-----------+-----------------+--------------------+--------------------+
|customer_id|     review_id|product_id|product_parent|       product_title|star_rating|total_votes|verified_purchase|     review_headline|         review_body|
+-----------+--------------+----------+--------------+--------------------+-----------+-----------+-----------------+--------------------+--------------------+
|   20422322| R8MEA6IGAHO0B|B00MC4CED8|     217304173|BlackVue DR600GW-PMP|          5|          0|                Y|         Very Happy!|As advertised. Ev...|
|   40835037|R31LOQ8JGLPRLK|B00OQMFG1Q|     137313254|GENSSI GSM / GPS ...|          5|          1|                Y|           five star|          it's great|
|   51469641|R2Y0MM9YE6OP3P|B00QERR5CY|      82850235|iXCC Multi pack L...|          5|          0|                Y|        great cables|These work great ...|
|    4332923| RRB9C05HDOD4O|B00QUFTPV4| 

### Drop purchases that are not verified

In [10]:
reviews = reviews.filter(reviews['verified_purchase'] == 'Y')

### See if there are any missing values

In [11]:
reviews.select([count(when(col(column).isNull(), 1)).alias(column) for column in reviews.columns]).show()

+-----------+---------+----------+--------------+-------------+-----------+-----------+-----------------+---------------+-----------+
|customer_id|review_id|product_id|product_parent|product_title|star_rating|total_votes|verified_purchase|review_headline|review_body|
+-----------+---------+----------+--------------+-------------+-----------+-----------+-----------------+---------------+-----------+
|          0|        0|         0|             0|            0|          0|          0|                0|              0|          2|
+-----------+---------+----------+--------------+-------------+-----------+-----------+-----------------+---------------+-----------+



### See how many products a customer has reviewed as well as how many customers have reviewed a product

In [12]:
products_per_reviewer = reviews.select('customer_id', 'product_id').rdd.map(lambda line: (line[0], line[1])).groupByKey().mapValues(list)
products_per_reviewer.take(20)

reviewers_per_product = reviews.select('product_id', 'customer_id').rdd.map(lambda line: (line[0], line[1])).groupByKey().mapValues(list)
reviewers_per_product.first()

('B00IK9UZXA',
 ['27920584',
  '41176333',
  '46649972',
  '4045807',
  '11848841',
  '50531937',
  '38851576',
  '25722326',
  '39977926',
  '52676254',
  '53014729',
  '20376268',
  '45099581',
  '16604212',
  '31938511',
  '7638202',
  '43420453',
  '45377877',
  '12895448',
  '36652076',
  '36857302',
  '48456827',
  '9608052',
  '725660',
  '25115390',
  '14735526',
  '42764494',
  '13242611',
  '21480619',
  '43086607',
  '27819527',
  '18958874',
  '14891305',
  '32627436',
  '11763091',
  '23502590',
  '30741919',
  '46506176',
  '575100',
  '21033751',
  '26358287',
  '23049788',
  '3911458',
  '21327655',
  '469867',
  '30340217',
  '1698807',
  '12570796',
  '14744266',
  '13176295',
  '24880345',
  '43208018',
  '1835274',
  '25603004',
  '13286680',
  '39872080',
  '1346655',
  '31686046',
  '42882972',
  '44379744',
  '15064996',
  '42308476',
  '45937962',
  '40008316',
  '49703771',
  '18583037',
  '41826099',
  '52574556',
  '36024579',
  '43245605',
  '25397638',
  '4

### One-hot encoding 

In [13]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

# To one-hot encode product_id.
prod_indexer = StringIndexer(inputCol='product_id', outputCol='product_id_num')
prod_encoder = OneHotEncoder(inputCol='product_id_num', outputCol='product_id_num_vec')

# To one-hot encode customer_id.
cust_indexer = StringIndexer(inputCol='customer_id', outputCol='customer_id_num')
cust_encoder = OneHotEncoder(inputCol='customer_id_num', outputCol='customer_id_num_vec')

# To one-hot encode verified_purchase.
purch_indexer = StringIndexer(inputCol='verified_purchase', outputCol='verified_purchase_num')
purch_encoder = OneHotEncoder(inputCol='verified_purchase_num', outputCol='verified_purchase_num_vec')

# To one-hot encode star_rating.
star_indexer = StringIndexer(inputCol='star_rating', outputCol='star_rating_num')
star_encoder = OneHotEncoder(inputCol='star_rating_num', outputCol='star_rating_num_vec')


prod_pipe = Pipeline(stages = [prod_indexer, prod_encoder])
cust_pipe = Pipeline(stages = [cust_indexer, cust_encoder])
purch_pipe = Pipeline(stages = [purch_indexer, purch_encoder])
star_pipe = Pipeline(stages = [star_indexer, star_encoder])

pipe = Pipeline(stages = [prod_pipe, cust_pipe, star_pipe])

reviews_enc = pipe.fit(reviews).transform(reviews)
reviews_enc.show()

+-----------+--------------+----------+--------------+--------------------+-----------+-----------+-----------------+--------------------+--------------------+--------------+--------------------+---------------+--------------------+---------------+-------------------+
|customer_id|     review_id|product_id|product_parent|       product_title|star_rating|total_votes|verified_purchase|     review_headline|         review_body|product_id_num|  product_id_num_vec|customer_id_num| customer_id_num_vec|star_rating_num|star_rating_num_vec|
+-----------+--------------+----------+--------------+--------------------+-----------+-----------+-----------------+--------------------+--------------------+--------------+--------------------+---------------+--------------------+---------------+-------------------+
|   20422322| R8MEA6IGAHO0B|B00MC4CED8|     217304173|BlackVue DR600GW-PMP|          5|          0|                Y|         Very Happy!|As advertised. Ev...|        3015.0|(22311,[3015],[1.0]

### Train ALS model

In [14]:
from pyspark.ml.recommendation import ALS
# Split into train and test data.
training, test = reviews_enc.randomSplit([0.8, 0.2])

als = ALS(userCol='customer_id_num', itemCol='product_id_num', ratingCol='star_rating_num', \
          implicitPrefs=True , coldStartStrategy="drop")
trained_model = als.fit(training)

### Test ALS model

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
predictions = trained_model.transform(test)

evaluator = RegressionEvaluator(metricName='rmse', labelCol='star_rating_num', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(f'Root mean squared error: {rmse}')

Root mean squared error: 1.4840263475739197


### Cross-validation

In [17]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

params = ParamGridBuilder().addGrid(als.maxIter, [5,10]).addGrid(als.regParam, [0.1, 0.2, 0.5]).build()
cv = CrossValidator(estimator=als, estimatorParamMaps=params, evaluator=evaluator, numFolds=5)
cv_model = cv.fit(reviews_enc)
predictions = cv_model.transform(reviews_enc)
rmse = evaluator.evaluate(predictions)

print( "Root Mean Squared Error: "+ str(rmse))

Root Mean Squared Error: 1.6100207885929898
