In [None]:
# Display the active SparkSession. `spark` is not created anywhere in this
# notebook; presumably the kernel (e.g. a PySpark kernel) injects it — TODO confirm.
spark

In [None]:
import pandas as pd
import pyspark.sql.functions as F

# Data exploration

## Data preparation

In [None]:
!wget http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx -P data/UCI_online_retail

In [None]:
# Load the downloaded Excel workbook into a pandas DataFrame for a first inspection.
pdf = pd.read_excel("data/UCI_online_retail/Online Retail.xlsx")

In [None]:
# Replace commas inside the free-text Description column with spaces
# (presumably so the CSV export further down is trivially parseable — TODO confirm).
pdf['Description'] = pdf['Description'].str.replace(',',' ')

In [None]:
# How many rows carry a negative Quantity?
pdf['Quantity'].lt(0).sum()

In [None]:
# How many rows have no CustomerID?
pdf['CustomerID'].isna().sum()

In [None]:
# Peek at a few of the rows that lack a CustomerID.
pdf.loc[pdf['CustomerID'].isna()].head()

In [None]:
# Count StockCode entries that are empty strings.
pdf['StockCode'].str.len().eq(0).sum()

In [None]:
# Export to CSV so the data can be re-read with Spark.
# index=False keeps the pandas row index out of the file; otherwise it is written
# as an unnamed first column that shows up as an extra column when read back.
pdf.to_csv("data/UCI_online_retail/Online Retail.csv", index=False)

In [None]:
# Drop the reference to the pandas frame; the data is re-read with Spark below.
del pdf

## Loading data

In [None]:
# Read the exported CSV with Spark, using the first line as column names.
# No schema inference is requested, so the columns are cast explicitly later on.
df = spark.read.csv("data/UCI_online_retail/Online Retail.csv", header=True)

In [None]:
# Preview the first rows of the Spark DataFrame.
df.show()

In [None]:
# Print the column names and the types Spark assigned to them.
df.printSchema()

In [None]:
# Number of rows loaded.
df.count()

In [None]:
# Show rows whose StockCode does not yield an integer once every non-digit
# character has been stripped (e.g. codes that contain no digits at all).
# The pattern is a raw string so "\D" reaches the regex engine without relying on
# Python's deprecated handling of invalid escape sequences.
df.filter(F.isnull(F.regexp_replace(df.StockCode, r"\D", "").cast("integer"))).show()

In [None]:
# Distinct StockCode values that do not yield an integer after stripping all
# non-digit characters. The pattern is a raw string so "\D" is not treated as an
# invalid Python escape sequence.
(
    df
    .filter(F.isnull(F.regexp_replace(df.StockCode, r"\D", "").cast("integer")))
    .select('StockCode')
    .distinct()
    .show()
)

In [None]:
# Distinct CustomerID values that cannot be cast to an integer.
(
    df
    .where(df.CustomerID.cast("integer").isNull())
    .select('CustomerID')
    .distinct()
    .show()
)

In [None]:
# Clean the raw rows for modelling:
#   * keep only rows with a positive Quantity,
#   * cast CustomerID to integer,
#   * reduce StockCode to its digits and cast to integer,
#   * drop rows where either id ended up NULL.
# NOTE(review): stripping non-digits maps codes that differ only by letters
# (e.g. "85123A" / "85123B") to the same integer item id — confirm this is intended.
# The regex is a raw string so "\D" is not an invalid Python escape sequence.
df = (
    df
    .filter(F.col('Quantity') > 0)
    .withColumn('CustomerID', F.col('CustomerID').cast("integer"))
    .withColumn('StockCode', F.regexp_replace(F.col('StockCode'), r"\D", "").cast("integer"))
    .filter(~F.isnull(F.col('CustomerID')))
    .filter(~F.isnull(F.col('StockCode')))
)

In [None]:
# Sanity check: show any rows that still have a NULL StockCode.
df.where(F.col('StockCode').isNull()).show()

In [None]:
# Build the (item, user, rating) table: every remaining row counts as one
# purchase signal, encoded as the constant 1 in column `purch`.
data = df.select('StockCode', 'CustomerID', F.lit(1).alias('purch'))
data.show()

In [None]:
# Print the schema of the modelling table.
data.printSchema()

In [None]:
# Sanity check: show any rows of `data` with a NULL StockCode.
data.where(data.StockCode.isNull()).show()

# Train and evaluate

In [None]:
# Seeded random split with weights 10% / 10% / 80% for test / validation / train.
testDf, cvDf, trainDf = data.randomSplit([0.1, 0.1, 0.8], seed=1)

In [None]:
# Show two rows from each split, in the order train, validation, test.
for split_df in (trainDf, cvDf, testDf):
    split_df.show(2)

In [None]:
from pyspark.ml.recommendation import ALS

# Implicit-feedback ALS on (CustomerID, StockCode, purch) with a small rank.
als1 = ALS(
    userCol="CustomerID",
    itemCol="StockCode",
    ratingCol="purch",
    implicitPrefs=True,
    rank=3,
    maxIter=15,
)
# The cold-start strategy is left at its default; the "drop" variant stays disabled:
#als1.setColdStartStrategy("drop")
model1 = als1.fit(trainDf)

# Alternative configurations kept for reference (not trained):
#   rank=15, maxIter=3   -> als2 / model2
#   rank=15, maxIter=15  -> als3 / model3

print("The models has been trained")

# Validation

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Score the validation split with the trained model.
predictions = model1.transform(cvDf)

In [None]:
# RMSE of the predictions against the `purch` label, computed on all rows as-is.
evaluator = RegressionEvaluator(
    labelCol="purch",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

In [None]:
# How many validation rows received a NaN prediction?
predictions.where(F.isnan(predictions.prediction)).count()

In [None]:
# Same RMSE evaluation, this time after discarding rows that contain null/NaN values.
evaluator = RegressionEvaluator(
    labelCol="purch",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions.na.drop())
print("Root-mean-square error = " + str(rmse))

In [None]:
# Preview the scored validation rows.
predictions.show()

In [None]:
# Manual mean squared error between the label and the prediction.
# Rows with null/NaN values are dropped first, matching the dropna() used for the
# evaluator-based RMSE; otherwise a single NaN prediction makes the mean NaN.
meanSquaredError1 = (
    predictions
    .dropna()
    .agg(F.mean((F.col('purch') - F.col("prediction"))**2).alias("meanSqErr"))
)
meanSquaredError1.first()

In [None]:
# Inspect the class of the fitted model (needed to know which class can load it back).
type(model1)

In [None]:
# Persist the fitted model. overwrite() replaces a model saved by an earlier run;
# a plain save() raises when the target path already exists, which breaks re-runs.
model1.write().overwrite().save("data/model/als1")

In [None]:
from pyspark.ml.recommendation import ALSModel

In [None]:
# Reload the persisted model from disk.
model = ALSModel.load("data/model/als1")

In [None]:
# Evaluate the reloaded model on the validation split, ignoring null/NaN rows.
evaluator.evaluate(model.transform(cvDf).na.drop())

# Grid search hyperparameter optimization

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder

In [None]:
# Base estimator for the search; rows that cannot be scored are dropped from predictions.
als = ALS(
    userCol="CustomerID",
    itemCol="StockCode",
    ratingCol="purch",
    implicitPrefs=True,
    coldStartStrategy='drop',
)
# 2 x 2 grid over rank and maxIter, keyed on the params of `als`.
grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [3, 15])
    .addGrid(als.maxIter, [3, 5])
    .build()
)
print(len(grid))

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator

In [None]:
# RMSE evaluator used to score every fold / parameter combination.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="purch", predictionCol="prediction")
# Cross-validate `als`, not `als1`: `grid` is keyed on the Param objects of `als`,
# and `als` is the estimator configured with coldStartStrategy='drop', which keeps
# NaN predictions out of the per-fold RMSE.
cv = CrossValidator(estimator=als, estimatorParamMaps=grid, evaluator=evaluator, parallelism=2)

In [None]:
# Run the cross-validated search on the training split only. Fitting on the full
# `data` would include the rows of cvDf/testDf, making the evaluation on cvDf
# that follows an in-sample (leaky) measurement.
cvModel = cv.fit(trainDf)

In [None]:
# Score the validation split with the model selected by cross-validation and report its RMSE.
predictions = cvModel.transform(cvDf)
evaluator = RegressionEvaluator(
    labelCol="purch",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

In [None]:
# Print the documented params of the best model together with their values.
print(cvModel.bestModel.explainParams())

In [None]:
# Param map (defaults merged with explicitly set values) of the best model.
cvModel.bestModel.extractParamMap()

In [None]:
# Display the best model object selected by the cross-validator.
cvModel.bestModel

In [None]:
# Rank of the selected model's factorization.
print(cvModel.bestModel.rank)

In [None]:
# Number of rows in the best model's item-factor DataFrame.
cvModel.bestModel.itemFactors.count()