## Setup and input data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/orders/Order.all.20201001_20201031.csv
/kaggle/input/orders/Order.all.20200901_20200930.csv
/kaggle/input/orders/Order.all.20200801_20200831.csv
/kaggle/input/orders/Order.all.20200701_20200731.csv


In [4]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 26 kB/s s eta 0:00:01    |█████████▍                      | 59.8 MB 23.9 MB/s eta 0:00:07     |██████████████████████▊         | 145.3 MB 53.4 MB/s eta 0:00:02
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 38.5 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612244 sha256=8e63943bd8c5c5d4a96fa38998c2c9d3d4982963fa480675c443814dbf05b8b2
  Stored in directory: /root/.cache/pip/wheels/5e/34/fa/b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1
You should consider upgrading via the '/opt/conda/bin/python3.

## Import packages and setup Spark

In [5]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

import pandas as pd

In [6]:
spark = SparkSession \
    .builder \
    .appName("sales") \
    .config("spark.driver.maxResultSize", "96g") \
    .config("spark.driver.memory", "96g") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

## Load data and data cleaning

In [8]:
def read_data(csv_file):
  df = spark.read.csv(csv_file, header=True, inferSchema=True)
  return df

In [9]:
# data from July, August and September
sales_07 = read_data('../input/orders/Order.all.20200701_20200731.csv')
sales_08 = read_data('../input/orders/Order.all.20200801_20200831.csv')
sales_09 = read_data('../input/orders/Order.all.20200901_20200930.csv')
sales_10 = read_data('../input/orders/Order.all.20201001_20201031.csv')

In [10]:
# only use the completed orders
def data_cleaning(df):
  completed_orders = df.filter(col('Order Status') == 'Completed')
  completed_orders = completed_orders.select('Username (Buyer)','Product Name','Quantity')
  return completed_orders

In [12]:
df_07 = data_cleaning(sales_07)
df_08 = data_cleaning(sales_08)
df_09 = data_cleaning(sales_09)
df_10 = data_cleaning(sales_10)

In [13]:
df = df_07.unionByName(df_08).unionByName(df_09).unionByName(df_10)

In [14]:
# transform user name to integer id
users = df.select('Username (Buyer)').distinct()
users = users.coalesce(1)
users = users.withColumn(
"userIntId", monotonically_increasing_id()).persist()
# users.show()

In [15]:
# transform product name to integer id
items = df.select('Product Name').distinct()
items = items.coalesce(1)
items = items.withColumn(
"itemIntId", monotonically_increasing_id()).persist()
# items.show()

In [16]:
sales_w_int_ids = df.join(
users, "Username (Buyer)", "left").join(items, "Product Name", "left")
# sales_w_int_ids.show()

In [17]:
sales_data = sales_w_int_ids.select(
                                        col("userIntId").alias("userId"),
                                        col("itemIntId").alias("itemId"),
                                        col("Quantity"))
# sales_data.show()

In [18]:
users = sales_data.select("userId").distinct()
items = sales_data.select("itemId").distinct()

cross_join = users.crossJoin(items).join(sales_data, ["userId", "itemId"], "left").fillna(0).persist()
# cross_join.show()

## Data sparsity

In [19]:
# Sparsity
sparsity = 1 - sales_data.count()/(users.count()*items.count())

In [20]:
print ("Sparsity: ", sparsity)

Sparsity:  0.97233688950839


## Build models

In [21]:
userCol = "userId"
itemCol = "itemId"
ratingCol = "Quantity"

In [22]:
# https://github.com/jamenlong/ALS_expected_percent_rank_cv/blob/master/ROEM_function.py

#Expected percentile rank error metric function
def ROEM(predictions, userCol = userCol, itemCol = itemCol, ratingCol = ratingCol):
  #Creates table that can be queried
  predictions.createOrReplaceTempView("predictions")

  #Sum of total number of purchases of all products
  denominator = predictions.groupBy().sum(ratingCol).collect()[0][0]

  #Calculating rankings of products predictions by user
  spark.sql("SELECT " + userCol + " , " + ratingCol + " , PERCENT_RANK() OVER (PARTITION BY " + userCol + " ORDER BY prediction DESC) AS rank FROM predictions").createOrReplaceTempView("rankings")

  #Multiplies the rank of each product by the number of purchases and adds the products together
  numerator = spark.sql('SELECT SUM(' + ratingCol + ' * rank) FROM rankings').collect()[0][0]

  performance = numerator/denominator

  return performance

In [23]:
(train, test) = cross_join.randomSplit([.8, .2], seed=12)

In [47]:
# Empty list to be filled with models
model_list = []
params_list = []
roems = []


ranks = [3,4,5]
maxIters = [11,12,13]
regParams = [0.005,0.01,0.015]
alphas = [9,10,11]

In [48]:
# For loop will automatically create and store ALS models
for r in ranks:
    for mi in maxIters:
        for rp in regParams:
            for a in alphas:
                params_list.append({'rank': r, 'maxIter': mi, 'regParam': rp, 'alpha': a})
                model_list.append(ALS(userCol= userCol, itemCol= itemCol, ratingCol= ratingCol, 
                                      rank = r, maxIter = mi, regParam = rp, alpha = a, 
                                      coldStartStrategy="drop",nonnegative = True, implicitPrefs = True))

In [49]:
len(params_list)

81

In [50]:
for model in model_list:
  # Fits each model to the training data
  trained_model = model.fit(train)
  # Generates test predictions
  predictions = trained_model.transform(test)
  # Evaluates each model's performance
  roems.append(ROEM(predictions))

In [51]:
df_params = pd.DataFrame(params_list)
df_params['ROEM'] = roems

In [52]:
df_params

Unnamed: 0,rank,maxIter,regParam,alpha,ROEM
0,3,11,0.005,9,0.017085
1,3,11,0.005,10,0.017085
2,3,11,0.005,11,0.017085
3,3,11,0.010,9,0.018059
4,3,11,0.010,10,0.017085
...,...,...,...,...,...
76,5,13,0.010,10,0.028449
77,5,13,0.010,11,0.028449
78,5,13,0.015,9,0.023378
79,5,13,0.015,10,0.024324


In [53]:
df_params.iloc[np.argmin(df_params.ROEM)]

rank         4.000000
maxIter     12.000000
regParam     0.010000
alpha       10.000000
ROEM         0.009318
Name: 40, dtype: float64

In [55]:
best_rank = df_params.iloc[np.argmin(df_params.ROEM)][0]
best_maxiter = df_params.iloc[np.argmin(df_params.ROEM)][1]
best_regparam = df_params.iloc[np.argmin(df_params.ROEM)][2]
best_alpha = df_params.iloc[np.argmin(df_params.ROEM)][3]

In [56]:
best_model = ALS(userCol= userCol, itemCol= itemCol, ratingCol= ratingCol, 
                                      rank = best_rank, maxIter = best_maxiter, regParam = best_regparam, alpha = best_alpha, 
                                      coldStartStrategy="drop",nonnegative = True, implicitPrefs = True)

In [57]:
fit_best_model = best_model.fit(cross_join)
predictions = fit_best_model.transform(cross_join)

In [58]:
predictions.take(10)

+------+------+--------+------------+
|userId|itemId|Quantity|  prediction|
+------+------+--------+------------+
|   148|    31|       0|  0.06389766|
|   463|    31|       0|6.0582574E-4|
|   471|    31|       0|6.0582574E-4|
|   496|    31|       0|         0.0|
|   243|    31|       0|6.0582574E-4|
|   392|    31|       0|  0.02451766|
|   540|    31|       0|6.0582574E-4|
|   623|    31|       0|         0.0|
|   737|    31|       0|   0.0659067|
|    31|    31|       0|6.0582574E-4|
|   516|    31|       0|6.0582574E-4|
|    85|    31|       0|         0.0|
|   137|    31|       0|  0.06555372|
|   251|    31|       0|         0.0|
|   451|    31|       0|         0.0|
|   580|    31|       0|6.0582574E-4|
|   808|    31|       0| 6.363547E-4|
|    65|    31|       0|6.0582574E-4|
|   458|    31|       0| 6.363547E-4|
|    53|    31|       0|6.0582574E-4|
+------+------+--------+------------+
only showing top 20 rows

