# Prediction Model for BNPL Revenue

In [42]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import *

In [43]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executer.memory", "4g")
    .getOrCreate()
)

In [44]:
full = spark.read.parquet('../data/curated/train_data/')
full = full.withColumnRenamed('y_total_num_consumer', 'next_total_num_consumer').withColumnRenamed('y_total_revenue', 'next_total_revenue')\
    .withColumnRenamed('y_total_num_transaction', 'next_total_num_transaction')
full.printSchema()

root
 |-- merchant_abn: long (nullable = true)
 |-- total_num_consumer: long (nullable = true)
 |-- avg_dollar_value: double (nullable = true)
 |-- total_num_transaction: long (nullable = true)
 |-- mean_income: double (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- total_num_postcode: long (nullable = true)
 |-- tag: string (nullable = true)
 |-- next_total_num_consumer: long (nullable = true)
 |-- next_total_revenue: double (nullable = true)
 |-- next_total_num_transaction: long (nullable = true)



In [45]:
tag_mean = full.groupBy('tag')\
      .agg(
         F.mean("total_revenue").alias("mean_revenue_of_tags")
      )

In [46]:
tag_mean = tag_mean.toPandas()

In [49]:
from sklearn.cluster import KMeans
import numpy as np

kmeans = KMeans(n_clusters=4, random_state=0).fit(np.array(tag_mean['mean_revenue_of_tags']).reshape(-1, 1))
kmeans.labels_

array([2, 0, 0, 1, 2, 0, 3, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 2, 2, 1, 1, 2,
       1, 0], dtype=int32)

In [50]:
tag_mean['tag_labels'] = kmeans.labels_
tag_mean

Unnamed: 0,tag,mean_revenue_of_tags,tag_labels
0,jewelry,301542.063851,2
1,watch,565603.630545,0
2,cable,545919.891131,0
3,garden supply,732549.584612,1
4,antique,431668.237726,2
5,shoe,534436.300789,0
6,tent,995172.851759,3
7,stationery,518011.067448,0
8,artist supply,719151.356147,1
9,florists,637021.720045,1


In [51]:
tag_mean_sdf = spark.createDataFrame(tag_mean[['tag', 'tag_labels']])
tag_mean_sdf

tag,tag_labels
jewelry,2
watch,0
cable,0
garden supply,1
antique,2
shoe,0
tent,3
stationery,0
artist supply,1
florists,1


In [52]:
full = full.join(tag_mean_sdf, ["tag"], how="left") 
# use left join here since if no historical data is provided, we cannot predict the future value of a merchant
full

tag,merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,next_total_num_consumer,next_total_revenue,next_total_num_transaction,tag_labels
jewelry,10596295795,8,10439.40181102842,8,61840.875,a,571244.0798428855,8,,,,2
watch,10187291046,87,111.08408713922158,87,61060.0459770115,b,31795.597893195016,87,99.0,41683.21121325837,100.0,0
watch,10264435225,1238,114.10783402533237,1272,62006.31132075472,c,346896.9592900661,1018,1519.0,435003.6795629895,1566.0,0
watch,10922217544,18,163.5626661571798,18,63804.22222222222,c,4946.134870167458,18,19.0,5880.61894060871,19.0,0
shoe,10955677986,196,224.31663343377568,197,62081.20304568528,a,249233.7191755476,191,232.0,311007.83981679846,235.0,0
tent,10651113986,17,537.592173774402,17,57981.94117647059,b,29701.967601035707,17,24.0,40997.39033296765,24.0,3
stationery,10618089367,903,382.6779937277748,919,63214.549510337325,b,1410241.1961990686,783,994.0,1427868.1901742313,1011.0,0
artist supply,10463252268,22,464.0964976850653,22,60070.77272727273,a,67488.91405656068,22,26.0,78474.65405470507,26.0,1
florists,10545955006,108,475.27264003873785,108,63127.56481481482,a,316189.37413271976,106,133.0,359500.80479674053,133.0,1
music,10364012396,4,276.08689369891994,4,81123.75,b,4008.7818228908673,4,16.0,19636.79081402693,16.0,0


## Model for BNPL Revenue
Features:
    total number of consumer, average dollar value, total number of transaction, mean income, total number of postcode, tags

label:
    next year revenue

In [53]:
# drop the columns not needed
revenue_df = full.drop('merchant_abn', 'revenue_level', 'total_revenue', 'next_total_num_consumer', 'next_total_num_transaction')
revenue_df

tag,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,total_num_postcode,next_total_revenue,tag_labels
jewelry,8,10439.40181102842,8,61840.875,8,,2
watch,87,111.08408713922158,87,61060.0459770115,87,41683.21121325837,0
watch,1238,114.10783402533237,1272,62006.31132075472,1018,435003.6795629895,0
watch,18,163.5626661571798,18,63804.22222222222,18,5880.61894060871,0
shoe,196,224.31663343377568,197,62081.20304568528,191,311007.83981679846,0
tent,17,537.592173774402,17,57981.94117647059,17,40997.39033296765,3
stationery,903,382.6779937277748,919,63214.549510337325,783,1427868.1901742313,0
artist supply,22,464.0964976850653,22,60070.77272727273,22,78474.65405470507,1
florists,108,475.27264003873785,108,63127.56481481482,106,359500.80479674053,1
music,4,276.08689369891994,4,81123.75,4,19636.79081402693,0


In [54]:
# change tags into numeric feature by one hot encoding
indexer = StringIndexer(inputCol="tag", outputCol="tagIndex")
revenue_df = indexer.fit(revenue_df).transform(revenue_df)
ohe = OneHotEncoder(inputCol="tagIndex", outputCol="tagOHE")
revenue_df = ohe.fit(revenue_df).transform(revenue_df)
revenue_df

tag,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,total_num_postcode,next_total_revenue,tag_labels,tagIndex,tagOHE
jewelry,8,10439.40181102842,8,61840.875,8,,2,23.0,"(23,[],[])"
jewelry,1,6987.246435378608,1,48235.0,1,,2,23.0,"(23,[],[])"
jewelry,2,1396.3251261623384,2,54250.5,2,,2,23.0,"(23,[],[])"
jewelry,1,4798.332815388768,1,80991.0,1,,2,23.0,"(23,[],[])"
jewelry,33,9848.725593936158,33,60891.90909090909,33,6996.210950909105,2,23.0,"(23,[],[])"
jewelry,29,14897.926207832394,29,62719.72413793104,28,,2,23.0,"(23,[],[])"
jewelry,3,3622.567091022215,3,66842.66666666667,3,5551.664760915629,2,23.0,"(23,[],[])"
jewelry,1,19486.76358643924,1,70738.0,1,,2,23.0,"(23,[],[])"
jewelry,5,4844.117193121709,5,64499.4,5,2973.525203961843,2,23.0,"(23,[],[])"
jewelry,3,15354.649596808333,3,55943.66666666666,3,,2,23.0,"(23,[],[])"


In [55]:
import six
for i in revenue_df.columns[:-1]:
    if not( isinstance(revenue_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to next_total_revenue for ", i, revenue_df.stat.corr('next_total_revenue',i))

Correlation to next_total_revenue for  total_num_consumer 0.7410793330888668
Correlation to next_total_revenue for  avg_dollar_value -0.09721738918447717
Correlation to next_total_revenue for  total_num_transaction 0.6509905314160019
Correlation to next_total_revenue for  mean_income 0.014726025333045787
Correlation to next_total_revenue for  total_num_postcode 0.6656045508317902
Correlation to next_total_revenue for  next_total_revenue 1.0
Correlation to next_total_revenue for  tag_labels -0.028666428658009408
Correlation to next_total_revenue for  tagIndex -0.10513625584784485


### Vectorization

In [56]:
features = ['total_num_consumer', 'avg_dollar_value', 'total_num_transaction', 'mean_income', 'total_num_postcode', 'tagOHE']
assembler = VectorAssembler(inputCols=features, outputCol='features')
final_revenue_df = assembler.transform(revenue_df)
final_revenue_df = final_revenue_df.select('features','next_total_revenue')

### Model fitting

In [94]:
# missing values will not be included
train_df, test_df = (final_revenue_df.drop('tag', 'tagIndex').filter(F.col('next_total_revenue').isNotNull())).randomSplit([0.7, 0.3])

In [84]:
train_df, test_df = (final_revenue_df.filter((F.col('next_total_revenue').isNotNull())&(F.col('tag_labels')==1))).drop('tag', 'tagIndex', 'tag_labels').randomSplit([0.7, 0.3])

In [95]:
train_df.count(), test_df.count()

(2665, 1119)

In [93]:
train_df

features,next_total_revenue
"(28,[0,1,2,3,4,5]...",2117.788746959762
"(28,[0,1,2,3,4,5]...",2624.8015545185413
"(28,[0,1,2,3,4,5]...",17257.12863751407
"(28,[0,1,2,3,4,5]...",24217.897520994055
"(28,[0,1,2,3,4,5]...",53248.10549365473
"(28,[0,1,2,3,4,5]...",9662.469562619945
"(28,[0,1,2,3,4,5]...",38692.67439389365
"(28,[0,1,2,3,4,5]...",18686.29029154868
"(28,[0,1,2,3,4,5]...",22074.99856066441
"(28,[0,1,2,3,4,5]...",22465.136709436512


#### Linear Regression

In [70]:
lr = LinearRegression(labelCol='next_total_revenue', maxIter=10, regParam=0.3, elasticNetParam=0.8)
fitted_model = lr.fit(train_df)
fitted_model.setFeaturesCol("features")
fitted_model.setPredictionCol("prediction")

LinearRegressionModel: uid=LinearRegression_6e451e88950d, numFeatures=28

In [71]:
print("Coefficients: %s" % str(fitted_model.coefficients))
print("Intercept: %s" % str(fitted_model.intercept))

Coefficients: [147.97843327117232,0.0,85.32570319599924,-0.18986513944769598,742.9054500956163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3877.6635050366026,26675.10544928082,36123.76329516014,-40682.64170626974]
Intercept: 5998.93209095228


In [72]:
trainingSummary = fitted_model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

numIterations: 10
objectiveHistory: [0.5, 0.4385975131930148, 0.28474438258461315, 0.2438504274460106, 0.2320701436019277, 0.22791134986155795, 0.22516858238985782, 0.2223296969928053, 0.2185053253088702, 0.21622820881221053, 0.21353154419677323]
+------------------+
|         residuals|
+------------------+
| 7432.119075475288|
|2346.3565332611465|
|  8554.88947340806|
|  34323.6694849341|
|12593.180083608859|
|3102.1682635621364|
|15897.226152553725|
| 2322.704138401477|
| 7172.715790443264|
|2026.8890543954844|
|7144.1179169566485|
|20944.705869157704|
|281.43900160423595|
| 6544.546269235913|
| -512.605162724487|
| 9765.170990116087|
|-559.7171232905239|
|7021.5244212037505|
|18164.623668186414|
|-34480.81848949313|
+------------------+
only showing top 20 rows

RMSE: 218965.638903
r2: 0.572938


In [73]:
train_df.describe().show()

+-------+------------------+
|summary|next_total_revenue|
+-------+------------------+
|  count|               260|
|   mean|115747.37453880043|
| stddev| 335712.4942268983|
|    min|146.56267111256705|
|    max| 2578899.989549712|
+-------+------------------+



##### Linear Regression Evaluation

In [74]:
lr_predictions = fitted_model.transform(test_df)
lr_predictions.select("prediction","next_total_revenue","features").show(10)
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="next_total_revenue",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+-------------------+------------------+--------------------+
|         prediction|next_total_revenue|            features|
+-------------------+------------------+--------------------+
|-2507.8625773395534| 700.0913339069108|(28,[0,1,2,3,4],[...|
| -3763.531377081882| 5551.664760915629|(28,[0,1,2,3,4],[...|
| -3820.870649195086|15361.340843110367|(28,[0,1,2,3,4],[...|
| -1093.123507037396| 6330.006413055507|(28,[0,1,2,3,4],[...|
| 439.78965489998154|12730.592472437624|(28,[0,1,2,3,4],[...|
|-1366.2075515265024| 2973.525203961843|(28,[0,1,2,3,4],[...|
| 1848.6737562512335| 7714.789567461943|(28,[0,1,2,3,4],[...|
| 3023.4144696979356|3284.6497179037897|(28,[0,1,2,3,4],[...|
|   5876.74802949784| 71807.61440922701|(28,[0,1,2,3,4],[...|
|  5676.725105089693|3364.4301006651253|(28,[0,1,2,3,4],[...|
+-------------------+------------------+--------------------+
only showing top 10 rows

R Squared (R2) on test data = 0.613892


In [106]:
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(labelCol='next_total_revenue', family="gamma", link="inverse", maxIter=10, regParam=0.3)
model = glr.fit(train_df)
summary = model.summary

In [107]:
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

Coefficients: [-5.081981829145639e+35,-2.6870343238436396e+34,6.678906219871052e+34,-2.163592960903312e+33,-2.294703291151991e+34,-4.92718762286847e+36,8.400747166426864e+36,-5.605131552930673e+36,-1.7198318993286945e+36,3.2235886586410473e+36,-2.158475653750893e+35,-3.7379337761048877e+36,4.818852487444756e+36,3.9813588208546056e+36,4.080970583519597e+36,-4.676779563277705e+35,-3.0601522088596924e+36,-3.286057039141181e+36,-2.873629466789348e+36,-3.8704207786982195e+36,-1.757855944350198e+36,2.783813584487964e+36,-2.069366545011462e+36,6.517096251093483e+35,1.4196427137124834e+36,1.2734316058753377e+36,7.62234692965126e+35,8.931461421116118e+35]
Intercept: 9.011220949745152e+37


In [108]:
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))
print("Dispersion: " + str(summary.dispersion))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
print("Deviance: " + str(summary.deviance))
print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
print("AIC: " + str(summary.aic))
print("Deviance Residuals: ")
summary.residuals().show()

Coefficient Standard Errors: [1.0313239414198771e+34, 3.478628089243441e+33, 3.5683495996081674e+33, 1.0816705970704932e+33, 1.4503718984533777e+34, 5.539912749322146e+36, 5.893721277193464e+36, 5.844093458400502e+36, 5.876675126188564e+36, 5.87325712140477e+36, 5.848886994636668e+36, 5.868749562497159e+36, 5.855177161171895e+36, 5.877248737248757e+36, 5.896620810168444e+36, 5.903887371388085e+36, 5.905115307859655e+36, 5.904614198395587e+36, 5.937275150656821e+36, 5.911421045557285e+36, 5.938595972689419e+36, 5.938813105595627e+36, 5.954762089698656e+36, 5.961939652059689e+36, 6.033865684468139e+36, 6.07838891654262e+36, 6.110853304354673e+36, 6.151512087390794e+36, 6.8105032215591325e+37]
T Values: [-49.276290649754635, -7.724408171579033, 18.71707363147503, -2.000232757332045, -1.5821482018501438, -0.8893980547746626, 1.4253723193417152, -0.9591105263509541, -0.2926539007855835, 0.5488587664403202, -0.036904040986433476, -0.6369216706726183, 0.8230071191356194, 0.6774188057792411, 0

#### Random Forest Regressor

In [101]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [102]:
rf = RandomForestRegressor(numTrees = 10, maxDepth = 10, labelCol='next_total_revenue')

In [103]:
model = rf.fit(train_df)
result = model.transform(test_df)

In [104]:
rf_evaluator = RegressionEvaluator(labelCol='next_total_revenue', metricName="mae", predictionCol='prediction')
mae = rf_evaluator.evaluate(result)
rf_evaluator = RegressionEvaluator(labelCol='next_total_revenue', metricName="r2", predictionCol='prediction')
r2 = rf_evaluator.evaluate(result)
print('+++++++++++++++++++++++++++++++++++++++++++')
print(f'Using Categorical feature: {features}')
print('mae:{}'.format(mae))
print('r2: {}'.format(r2))

+++++++++++++++++++++++++++++++++++++++++++
Using Categorical feature: ['total_num_consumer', 'avg_dollar_value', 'total_num_transaction', 'mean_income', 'total_num_postcode', 'tagOHE']
mae:305911.1848229039
r2: 0.5990584247955485


In [105]:
result.select("prediction","next_total_revenue","features").show(20)

+------------------+------------------+--------------------+
|        prediction|next_total_revenue|            features|
+------------------+------------------+--------------------+
| 9853.891544733393| 5551.664760915629|(28,[0,1,2,3,4],[...|
| 9232.881639069303| 7714.789567461943|(28,[0,1,2,3,4],[...|
|40613.649820138045|39997.213284193116|(28,[0,1,2,3,4],[...|
|139927.58291571058|21404.354662000824|(28,[0,1,2,3,4],[...|
|27048.383025172265| 21214.30272082452|(28,[0,1,2,3,4,5]...|
|23631.234895597558| 38692.67439389365|(28,[0,1,2,3,4,5]...|
|27895.020971600858| 22074.99856066441|(28,[0,1,2,3,4,5]...|
|28025.083524159854| 75683.95506697436|(28,[0,1,2,3,4,5]...|
| 29028.92472391144|25826.219229145478|(28,[0,1,2,3,4,5]...|
| 38457.01274226319|28967.859022303237|(28,[0,1,2,3,4,5]...|
|59881.001715998085|18736.186917799685|(28,[0,1,2,3,4,5]...|
|58545.640659440716|22537.278176372223|(28,[0,1,2,3,4,5]...|
|65389.267248563505|32046.023857037744|(28,[0,1,2,3,4,5]...|
| 38314.06821998696| 263

In [19]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'next_total_revenue', maxIter=10)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'next_total_revenue', 'features').show(5)

+------------------+------------------+--------------------+
|        prediction|next_total_revenue|            features|
+------------------+------------------+--------------------+
|19566.260698153033| 700.0913339069108|(28,[0,1,2,3,4],[...|
| 46132.53320660531|20115.019410151996|(28,[0,1,2,3,4],[...|
| 46132.53320660531|25665.858312655597|(28,[0,1,2,3,4],[...|
|19566.260698153033|24217.897520994058|(28,[0,1,2,3,4,5]...|
|34625.277661725544| 7335.947872319652|(28,[0,1,2,3,4,5]...|
+------------------+------------------+--------------------+
only showing top 5 rows



In [20]:
gbt_evaluator = RegressionEvaluator(
    labelCol="next_total_revenue", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 932610


In [109]:
full

tag,merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,next_total_num_consumer,next_total_revenue,next_total_num_transaction,tag_labels
jewelry,10596295795,8,10439.40181102842,8,61840.875,a,571244.0798428855,8,,,,2
watch,10187291046,87,111.08408713922158,87,61060.0459770115,b,31795.597893195016,87,99.0,41683.21121325837,100.0,0
watch,10264435225,1238,114.10783402533237,1272,62006.31132075472,c,346896.9592900661,1018,1519.0,435003.6795629895,1566.0,0
watch,10922217544,18,163.5626661571798,18,63804.22222222222,c,4946.134870167458,18,19.0,5880.61894060871,19.0,0
shoe,10955677986,196,224.31663343377568,197,62081.20304568528,a,249233.7191755476,191,232.0,311007.83981679846,235.0,0
tent,10651113986,17,537.592173774402,17,57981.94117647059,b,29701.967601035707,17,24.0,40997.39033296765,24.0,3
stationery,10618089367,903,382.6779937277748,919,63214.549510337325,b,1410241.1961990686,783,994.0,1427868.1901742313,1011.0,0
artist supply,10463252268,22,464.0964976850653,22,60070.77272727273,a,67488.91405656068,22,26.0,78474.65405470507,26.0,1
florists,10545955006,108,475.27264003873785,108,63127.56481481482,a,316189.37413271976,106,133.0,359500.80479674053,133.0,1
music,10364012396,4,276.08689369891994,4,81123.75,b,4008.7818228908673,4,16.0,19636.79081402693,16.0,0
