In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:

spark = SparkSession \
  .builder \
  .master("local[*]")\
  .appName("New-Spark") \
  .config("spark.memory.fraction", 0.8) \
  .config("spark.executor.memory", "10g") \
  .config("spark.driver.memory", "10g")\
  .config("spark.sql.shuffle.partitions" , "800") \
  .config("spark.memory.offHeap.enabled",'true')\
  .config("spark.memory.offHeap.size","10g")\
  .getOrCreate()

In [3]:
spark

In [4]:
df = spark.read.csv("ratings_Beauty.csv", header=False, inferSchema=True)
df = df.withColumnRenamed('_c1', 'product_id')
df = df.withColumnRenamed('_c0', 'user_id')
df = df.withColumnRenamed('_c2', 'label')
df = df.select('product_id', 'user_id', 'label')
df.show(5)

+----------+--------------+-----+
|product_id|       user_id|label|
+----------+--------------+-----+
|0205616461|A39HTATAQ9V7YF|  5.0|
|0558925278|A3JM6GV9MNOF9X|  3.0|
|0558925278|A1Z513UWSAAO0F|  5.0|
|0733001998|A1WMRR494NWEWV|  4.0|
|0737104473|A3IAAVS479H7M7|  1.0|
+----------+--------------+-----+
only showing top 5 rows



In [5]:
df.count()

2023070

In [6]:
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- label: double (nullable = true)



In [7]:
n_product = df.select('product_id').distinct().count()
n_user = df.select('user_id').distinct().count()
print(n_product, n_user)

249274 1210271


In [8]:
# How big the matrix
n_product * n_user

301689093254

In [9]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.pipeline import Pipeline

indexer_product = StringIndexer(inputCol='product_id', outputCol='product_idx')
indexer_user = StringIndexer(inputCol='user_id', outputCol='user_idx')

pre_pipeline = Pipeline(stages=[indexer_product, indexer_user])
pre_pipeline_fitted = pre_pipeline.fit(df)
final_df = pre_pipeline_fitted.transform(df)

final_df.show(5)

+----------+--------------+-----+-----------+--------+
|product_id|       user_id|label|product_idx|user_idx|
+----------+--------------+-----+-----------+--------+
|0205616461|A39HTATAQ9V7YF|  5.0|   145790.0| 70392.0|
|0558925278|A3JM6GV9MNOF9X|  3.0|   103581.0|265306.0|
|0558925278|A1Z513UWSAAO0F|  5.0|   103581.0|552933.0|
|0733001998|A1WMRR494NWEWV|  4.0|   145791.0|536779.0|
|0737104473|A3IAAVS479H7M7|  1.0|   145792.0| 14679.0|
+----------+--------------+-----+-----------+--------+
only showing top 5 rows



In [10]:
# Shoule take 20.000 sample to config the model, and then use model tunned for final all data
sample_df = final_df.sample(0.01, seed=42)
sample_train_df, sample_test_df = sample_df.randomSplit([0.8, 0.2], seed=42)

train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

In [11]:
sample_df.count()

20396

In [14]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import time
tic = time.time()

als = ALS(maxIter=10,           # Number of iterations
          regParam=0.1,        # Regularization parameter beta
          rank=15,              # Number of features
          numItemBlocks=10,     # Number partitioned to parallelize computation
          alpha=0.5,            # Learning rate
          coldStartStrategy='drop',
          nonnegative=True,
          userCol='user_idx',     
          itemCol='product_idx',
          ratingCol='label')
model = als.fit(sample_train_df)

predictions = model.transform(sample_test_df)
# predictions.show(5)

predictions = predictions.withColumn('difference', col('label') - col('prediction'))
predictions = predictions.withColumn('squared_difference', pow(col('difference'), 2))
rmse = predictions.select(sqrt(avg(col('squared_difference'))).alias('rmse'))

print(rmse.collect())

toc = time.time()
print('Total time: {:.2f} seconds'.format(toc-tic))

[Row(rmse=3.275362290061822)]
Total time: 1813.96 seconds


In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import time

tic = time.time()

als = ALS(maxIter=5,           # Number of iterations
          regParam=0.1,        # Regularization parameter beta
          rank=20,              # Number of features
          numItemBlocks=10,     # Number partitioned to parallelize computation
          alpha=0.001,            # Learning rate
          coldStartStrategy='drop',
          userCol='user_idx',     
          itemCol='product_idx',
          ratingCol='label')
model = als.fit(train_df)

predictions = model.transform(test_df)
predictions.show(5)
evaluator = RegressionEvaluator(metricName='rmse')
rmse = evaluator.evaluate(predictions)
print('RMSE: {:.4f}'.format(rmse))

toc = time.time()
print('Total time: {:.2f} seconds'.format(toc-tic))

+----------+--------------+-----+-----------+--------+----------+
|product_id|       user_id|label|product_idx|user_idx|prediction|
+----------+--------------+-----+-----------+--------+----------+
|B0012134EK|A2F6R85B0XX1CW|  5.0|     1580.0|106667.0|-0.7507449|
|B0012134EK|A2BCACN9UIVLIC|  3.0|     1580.0| 22213.0| 1.0593475|
|B0012134EK|A3V7BQNYTENDW4|  5.0|     1580.0|280215.0|-1.8660998|
|B0012134EK|A2RWVUT42HQAQC|  5.0|     1580.0| 66456.0| 0.2412106|
|B0012134EK| ABMSLHPKJJMYM|  5.0|     1580.0| 49476.0| 0.5722452|
+----------+--------------+-----+-----------+--------+----------+
only showing top 5 rows

RMSE: 4.0028
Total time: 2509.58 seconds


In [19]:
tic = time.time()
predictions = predictions.withColumn('difference', col('label') - col('prediction'))
predictions = predictions.withColumn('squared_difference', pow(col('difference'), 2))
rmse = predictions.select(sqrt(avg(col('squared_difference'))).alias('rmse'))
rmse.show()
toc = time.time()
print('Total time: {:.2f} seconds'.format(toc-tic))

+------------------+
|              rmse|
+------------------+
|16.022636669570772|
+------------------+

Total time: 1220.98 seconds


In [20]:
user_recom = model.recommendForAllUsers(5)
for user in user_recom.head(5):
    print(user)
    print('')

KeyboardInterrupt: 

In [None]:
product_label = pre_pipeline_fitted.stages[0].labels
user_label = pre_pipeline_fitted.stages[1].labels

def convert_product(list_tuple, product_label):
    converted = []
    for product_idx, rating in list_tuple:
        converted.append((product_label[product_idx], rating))
    return converted

convert_recom = udf(lambda x: convert_product(x, product_label), ArrayType(ArrayType(StringType())))
user_recom = user_recom.withColumn('recommendation_product', convert_recom('recommendations'))

convert_user = udf(lambda x: user_label[x], StringType())
user_recom = user_recom.withColumn('user_id', convert_user('user_idx'))

user_recom_by_id = user_recom.select('user_id', 'recommendation_product')
user_recom_by_id.toPandas()