### Invoke SparkContext

In [1]:
import findspark
findspark.init("C:/Users/Jonas/spark")
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName('Spark project')
sc = SparkContext(conf=conf)
sc

### SparkSession

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Spark Project") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [3]:
spark.sparkContext

### Load data

In [4]:
from pyspark import SQLContext
reviews = spark.read.options(header=True).csv("amazon_reviews_us_Mobile_Electronics_v1_00.tsv", sep="\t")

### See structure of data

In [5]:
reviews.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



### Display data and show significant attributes

In [6]:
from pyspark.sql.functions import *

reviews.show()

+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|  product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   20422322| R8MEA6IGAHO0B|B00MC4CED8|     217304173|BlackVue DR600GW-PMP|Mobile_Electronics|          5|            0|          0|   N|                Y|         Very Happy!|As advertised. Ev...| 2015-08-31|
|         US|   40835037|R31LOQ8JGLPRLK|B00OQMFG1Q|     137313254|GENSSI GSM / GPS ...|Mobile_Electronics|      

### Show distribution between ratings and how many reviews each customer has written

In [7]:
reviews.groupBy('star_rating').count().orderBy(desc('count')).show()
reviews.groupBy('customer_id').count().orderBy(desc('count')).show()

+-----------+-----+
|star_rating|count|
+-----------+-----+
|          5|52255|
|          4|18088|
|          1|17587|
|          3| 9734|
|          2| 7311|
+-----------+-----+

+-----------+-----+
|customer_id|count|
+-----------+-----+
|   15127646|   25|
|   29514513|   21|
|   12259799|   18|
|   19803990|   10|
|   43856165|   10|
|   52460215|   10|
|   34408569|   10|
|   53037408|    9|
|   49675502|    9|
|   50027179|    8|
|   45070473|    8|
|   53090839|    8|
|   51666042|    8|
|   32038204|    8|
|   38773014|    7|
|   34645354|    7|
|   44834233|    7|
|   51346302|    7|
|    7394955|    7|
|   17957446|    7|
+-----------+-----+
only showing top 20 rows



### See that there are more product id's than product titles

In [8]:
reviews.select('product_title').distinct().count()

24770

In [9]:
reviews.select('product_id').distinct().count()

25801

### Drop attributes of no importance

In [10]:
reviews = reviews.select(['customer_id', 'review_id', 'product_id', 'product_title', 'star_rating', 'verified_purchase'])
#reviews = reviews.drop('vine', 'marketplace', 'helpful_votes', 'review_date', 'product_category')
reviews.show()

+-----------+--------------+----------+--------------------+-----------+-----------------+
|customer_id|     review_id|product_id|       product_title|star_rating|verified_purchase|
+-----------+--------------+----------+--------------------+-----------+-----------------+
|   20422322| R8MEA6IGAHO0B|B00MC4CED8|BlackVue DR600GW-PMP|          5|                Y|
|   40835037|R31LOQ8JGLPRLK|B00OQMFG1Q|GENSSI GSM / GPS ...|          5|                Y|
|   51469641|R2Y0MM9YE6OP3P|B00QERR5CY|iXCC Multi pack L...|          5|                Y|
|    4332923| RRB9C05HDOD4O|B00QUFTPV4|abcGoodefg® FBI C...|          4|                Y|
|   44855305|R26I2RI1GFV8QG|B0067XVNTG|Generic Car Dashb...|          2|                Y|
|    7846966| RY8DDL22YG4R5|B00KA6CCVY|Aweek® Air Acoust...|          3|                Y|
|   21299354|R2AT2426ZHFUHH|B00MJCDPM2|Sentey LS-4460 B-...|          3|                Y|
|   28902968|R3RRXU2R23NMQ9|B00ET5AWBY|iPad Car Headrest...|          5|                Y|

### Drop purchases that are not verified

In [11]:
reviews = reviews.filter(reviews['verified_purchase'] == 'Y')

### See if there are any missing values

In [12]:
reviews.select([count(when(col(column).isNull(), 1)).alias(column) for column in reviews.columns]).show()

+-----------+---------+----------+-------------+-----------+-----------------+
|customer_id|review_id|product_id|product_title|star_rating|verified_purchase|
+-----------+---------+----------+-------------+-----------+-----------------+
|          0|        0|         0|            0|          0|                0|
+-----------+---------+----------+-------------+-----------+-----------------+



### One-hot encoding with pipelines

In [13]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

# To one-hot encode product_id.
prod_indexer = StringIndexer(inputCol='product_id', outputCol='product_id_num')
prod_encoder = OneHotEncoder(inputCol='product_id_num', outputCol='product_id_num_vec')

# To one-hot encode customer_id.
cust_indexer = StringIndexer(inputCol='customer_id', outputCol='customer_id_num')
cust_encoder = OneHotEncoder(inputCol='customer_id_num', outputCol='customer_id_num_vec')

# To one-hot encode star_rating.
star_indexer = StringIndexer(inputCol='star_rating', outputCol='star_rating_num')
star_encoder = OneHotEncoder(inputCol='star_rating_num', outputCol='star_rating_num_vec')


prod_pipe = Pipeline(stages = [prod_indexer, prod_encoder])
cust_pipe = Pipeline(stages = [cust_indexer, cust_encoder])
star_pipe = Pipeline(stages = [star_indexer, star_encoder])

pipe = Pipeline(stages = [prod_indexer, cust_indexer, star_indexer])

reviews_enc = pipe.fit(reviews).transform(reviews)
reviews_enc.show()

+-----------+--------------+----------+--------------------+-----------+-----------------+--------------+---------------+---------------+
|customer_id|     review_id|product_id|       product_title|star_rating|verified_purchase|product_id_num|customer_id_num|star_rating_num|
+-----------+--------------+----------+--------------------+-----------+-----------------+--------------+---------------+---------------+
|   20422322| R8MEA6IGAHO0B|B00MC4CED8|BlackVue DR600GW-PMP|          5|                Y|        3015.0|        27489.0|            0.0|
|   40835037|R31LOQ8JGLPRLK|B00OQMFG1Q|GENSSI GSM / GPS ...|          5|                Y|       21405.0|        58223.0|            0.0|
|   51469641|R2Y0MM9YE6OP3P|B00QERR5CY|iXCC Multi pack L...|          5|                Y|          12.0|        78362.0|            0.0|
|    4332923| RRB9C05HDOD4O|B00QUFTPV4|abcGoodefg® FBI C...|          4|                Y|        3877.0|        62750.0|            1.0|
|   44855305|R26I2RI1GFV8QG|B0067X

### Pick only the numerical attributes which the recommendations will be based on

In [14]:
reviews_enc_only = reviews_enc.select('customer_id_num', 'product_id_num', 'star_rating_num')
reviews_enc_only.show()

+---------------+--------------+---------------+
|customer_id_num|product_id_num|star_rating_num|
+---------------+--------------+---------------+
|        27489.0|        3015.0|            0.0|
|        58223.0|       21405.0|            0.0|
|        78362.0|          12.0|            0.0|
|        62750.0|        3877.0|            1.0|
|          190.0|           5.0|            4.0|
|        84108.0|         976.0|            3.0|
|        29093.0|          81.0|            3.0|
|        41650.0|        1073.0|            0.0|
|        28123.0|       22265.0|            1.0|
|        73231.0|        3894.0|            0.0|
|        43074.0|        1315.0|            0.0|
|        81710.0|        2974.0|            3.0|
|        74352.0|         153.0|            0.0|
|        59441.0|       22304.0|            2.0|
|        24604.0|       22125.0|            4.0|
|        40367.0|         219.0|            0.0|
|        23639.0|        4951.0|            0.0|
|        73607.0|   

### Train ALS model

In [15]:
from pyspark.ml.recommendation import ALS
# Split into train and test data.
training, test = reviews_enc_only.randomSplit([0.8, 0.2])

als = ALS(userCol='customer_id_num', itemCol='product_id_num', ratingCol='star_rating_num', \
          implicitPrefs=True , coldStartStrategy="drop", nonnegative=True)
trained_model = als.fit(training)

### Test ALS model

In [16]:
from pyspark.ml.evaluation import RegressionEvaluator
predictions = trained_model.transform(test)

evaluator = RegressionEvaluator(metricName='rmse', labelCol='star_rating_num', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(f'Root mean squared error: {rmse}')

Root mean squared error: 1.4790625749913373


### Cross-validation

In [24]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

params = ParamGridBuilder().addGrid(als.maxIter, [5, 10, 15])\
                            .addGrid(als.regParam, [0.001, 0.1, 0.5, 5, 10])\
                            .addGrid(als.rank, [1, 5, 10, 20]).build()
cv = CrossValidator(estimator=als, estimatorParamMaps=params, evaluator=evaluator, numFolds=4)
cv_model = cv.fit(training)
predictions = cv_model.transform(test)
rmse = evaluator.evaluate(predictions)

print( "Root Mean Squared Error: "+ str(rmse))

Root Mean Squared Error: 1.4785828098110796


In [23]:
import numpy as np
print(cv_model.getEstimatorParamMaps()[np.argmax(cv_model.avgMetrics)])

{Param(parent='ALS_960aec482cc1', name='maxIter', doc='max number of iterations (>= 0).'): 5, Param(parent='ALS_960aec482cc1', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='ALS_960aec482cc1', name='rank', doc='rank of the factorization'): 5}


### Create customer class which can present the recommended products for the customer

In [18]:
import random

class Customer():
    def __init__(self):
        self.id = None
        self.purchase = {'product': None, 'rating': None}
        self.recommendations = None
        self.recommended_products = None
        self.customer_id_num = None
        self.dataset = None
        self.dataset_num = None
        
    def buyProduct(self, product, rating):
        self.purchase.update({'product': product, 'rating': rating})
        
    
    # Converts string values to numerical values.
    def IDToNum(self, the_dataset):
        cust_indexer = StringIndexer(inputCol='customer_id',outputCol='customer_id_num')
        prod_indexer = StringIndexer(inputCol='product_id', outputCol='product_id_num')
        star_indexer = StringIndexer(inputCol='star_rating', outputCol='star_rating_num')

        pipe = Pipeline(stages=[cust_indexer, prod_indexer, star_indexer])

        return pipe.fit(the_dataset).transform(the_dataset)

        
    def insertUser(self, the_dataset):
        temp_df = the_dataset.select(['customer_id', 'product_title', 'product_id', 'star_rating'])
        user_id_rows = the_dataset.select(['customer_id']).collect()
        user_ids = [int(customer_id[0].encode('ascii')) for customer_id in user_id_rows]
        temp = user_ids[0]      # An ID that already exists in the data set.
        while temp in user_ids:
            temp = random.randrange(1, 100000)
        self.id = str(temp)
        

        # Find out the product_id of the bought product and fetch the first product_id returned.
        product_id = self.getProductID(temp_df, self.purchase.get('product'))
        
        # Add new user to data set.
        new_row = spark.createDataFrame([(self.id, self.purchase.get('product'), product_id, self.purchase.get('rating'))])
        # Append the new row.
        
        self.dataset = temp_df.union(new_row)

        
        self.dataset_num = self.IDToNum(self.dataset)
        self.customer_id_num = self.dataset_num.select(['customer_id_num']).filter(self.dataset_num['customer_id'] == self.id).collect()[0][0]
    
    def trainModel(self, n_recommendations):
        als = ALS(userCol='customer_id_num', itemCol='product_id_num', ratingCol='star_rating_num', \
                  implicitPrefs=True , coldStartStrategy="drop")
        self.recommendations = als.fit(self.dataset_num).recommendForAllUsers(n_recommendations)
        
    
    def queryRecommendations(self):
        recommendations_list = []
        
        # Index of new user.
        idx_new_user = [product_id_num[0] for product_id_num in self.recommendations.select(['customer_id_num'])\
                        .collect()].index(self.customer_id_num)
        # List of recommendations.
        recommendations = [recommendation for recommendation in self.recommendations.select('recommendations')\
                           .filter(self.recommendations['customer_id_num'] == self.customer_id_num).collect()[0][0]]
        
        # Get title of every recommended product.
        for index, recommendation in enumerate(recommendations):
            recommendations_list.append(\
                                self.dataset_num.select(['product_title']).filter(self.dataset_num['product_id_num'] == recommendation[0]).collect()[0][0])

        self.recommended_products = recommendations_list
    
    def getProductID(self, the_dataset, product):
        return the_dataset.select(['product_id']).filter(the_dataset['product_title'] == product).collect()[0][0]
    
    def printRecommendations(self):
        print(f"Based on your purchase of {self.purchase.get('product')}, you might also like:")
        for number, recommendation in enumerate(self.recommended_products, 1):
            print(f'{number}): {recommendation}')

In [19]:
new_customer = Customer()
new_customer.buyProduct('Sentey Ls 4162', '4')
new_customer.insertUser(reviews_enc)
new_customer.trainModel(5)
new_customer.queryRecommendations()
new_customer.printRecommendations()


Based on your purchase of Sentey Ls 4162, you might also like:
1): eForCity Leather Case for Barnes and Noble Nook Color, Purple
2): Zeimax (TM) Adapter Converter 8 Pin to 30pin for Apple Iphone 5, Ipod 5 Nano 7th Gen USA
3): DBTech 175 Watt Portable Micro Power Inverter With USB Port - 12v AC to 110v DC Car Plug Converter For Your Ipod, iPad, iPhone, Tablet, PSP, DVD Players Laptops Netbooks And cellphones
4): 6-Pack Mirror Screen Protector for Apple iPod Touch 4th Gen
5): 2-Port USB Car Charger Adapter
