# Prediction Model for BNPL Revenue

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "2g")
    .config("spark.executer.memory", "4g")
    .getOrCreate()
)

22/09/23 21:36:13 WARN Utils: Your hostname, DESKTOP-80AOBLL resolves to a loopback address: 127.0.1.1; using 172.25.24.208 instead (on interface eth0)
22/09/23 21:36:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/23 21:36:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/23 21:36:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
full = spark.read.parquet('../data/curated/train_data/')
full = full.withColumnRenamed('y_total_num_consumer', 'next_total_num_consumer').withColumnRenamed('y_total_revenue', 'next_total_revenue')\
    .withColumnRenamed('y_total_num_transaction', 'next_total_num_transaction')
full.printSchema()

                                                                                

root
 |-- merchant_abn: long (nullable = true)
 |-- total_num_consumer: long (nullable = true)
 |-- avg_dollar_value: double (nullable = true)
 |-- total_num_transaction: long (nullable = true)
 |-- mean_income: double (nullable = true)
 |-- revenue_level: string (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- total_num_postcode: long (nullable = true)
 |-- tag: string (nullable = true)
 |-- next_total_num_consumer: long (nullable = true)
 |-- next_total_revenue: double (nullable = true)
 |-- next_total_num_transaction: long (nullable = true)



## Model for BNPL Revenue
Features:
    total number of consumer, average dollar value, total number of transaction, mean income, total number of postcode, tags

label:
    next year revenue

In [4]:
# drop the columns not needed
revenue_df = full.drop('merchant_abn', 'revenue_level', 'total_revenue', 'next_total_num_consumer', 'next_total_num_transaction')
revenue_df

total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,total_num_postcode,tag,next_total_revenue
808,209.0251604007696,823,62506.642770352366,719,furniture,38940.2914088986
731,41.11965938159281,750,62693.954666666665,642,cable,154467.92091896126
87,111.0840871392216,87,61060.0459770115,87,watch,41683.21121325837
107,451.1432080236008,107,65637.3831775701,107,music,290071.0062352741
2244,39.20921658203575,2370,62122.52278481013,1619,gift,666766.4067054291
218,389.5552654520502,218,63146.61926605504,211,computer,378005.14673149673
1238,114.10783402533234,1272,62006.31132075472,1018,watch,435003.6795629895
130,308.3515003987336,130,63676.28461538462,127,computer,273146.7707118801
2475,128.98933167858436,2621,62792.28691339184,1730,furniture,2765582.9188776133
238,378.0169713940927,238,61579.6512605042,227,computer,654693.9778432944


In [5]:
# change tags into numeric feature by one hot encoding
indexer = StringIndexer(inputCol="tag", outputCol="tagIndex")
revenue_df = indexer.fit(revenue_df).transform(revenue_df)
ohe = OneHotEncoder(inputCol="tagIndex", outputCol="tagOHE")
revenue_df = ohe.fit(revenue_df).transform(revenue_df)
revenue_df

                                                                                

total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,total_num_postcode,tag,next_total_revenue,tagIndex,tagOHE
808,209.0251604007696,823,62506.642770352366,719,furniture,38940.2914088986,5.0,"(23,[5],[1.0])"
731,41.11965938159281,750,62693.954666666665,642,cable,154467.92091896126,8.0,"(23,[8],[1.0])"
87,111.0840871392216,87,61060.0459770115,87,watch,41683.21121325837,9.0,"(23,[9],[1.0])"
107,451.1432080236008,107,65637.3831775701,107,music,290071.0062352741,11.0,"(23,[11],[1.0])"
2244,39.20921658203575,2370,62122.52278481013,1619,gift,666766.4067054291,4.0,"(23,[4],[1.0])"
218,389.5552654520502,218,63146.61926605504,211,computer,378005.14673149673,0.0,"(23,[0],[1.0])"
1238,114.10783402533234,1272,62006.31132075472,1018,watch,435003.6795629895,9.0,"(23,[9],[1.0])"
130,308.3515003987336,130,63676.28461538462,127,computer,273146.7707118801,0.0,"(23,[0],[1.0])"
2475,128.98933167858436,2621,62792.28691339184,1730,furniture,2765582.9188776133,5.0,"(23,[5],[1.0])"
238,378.0169713940927,238,61579.6512605042,227,computer,654693.9778432944,0.0,"(23,[0],[1.0])"


In [39]:
import six
for i in revenue_df.columns[:-1]:
    if not( isinstance(revenue_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to next_total_revenue for ", i, revenue_df.stat.corr('next_total_revenue',i))

Correlation to next_total_revenue for  total_num_consumer 0.7410130770929829
Correlation to next_total_revenue for  avg_dollar_value -0.09722654487146358
Correlation to next_total_revenue for  total_num_transaction 0.6509065671273128
Correlation to next_total_revenue for  mean_income 0.01470080484608678
Correlation to next_total_revenue for  total_num_postcode 0.6656216278730818
Correlation to next_total_revenue for  next_total_revenue 1.0
Correlation to next_total_revenue for  tagIndex -0.10497611182613972


### Vectorization

In [15]:
features = ['total_num_consumer', 'avg_dollar_value', 'total_num_transaction', 'mean_income', 'total_num_postcode', 'tagOHE']
assembler = VectorAssembler(inputCols=features, outputCol='features')
final_revenue_df = assembler.transform(revenue_df)
final_revenue_df = final_revenue_df.select('features','next_total_revenue')

### Model fitting

In [41]:
# missing values will not be included
train_df, test_df = (final_revenue_df.drop('tag', 'tagIndex').filter(F.col('next_total_revenue').isNotNull())).randomSplit([0.7, 0.3])

#### Linear Regression

In [42]:
lr = LinearRegression(labelCol='next_total_revenue', maxIter=10, regParam=0.3, elasticNetParam=0.8)
fitted_model = lr.fit(train_df)
fitted_model.setFeaturesCol("features")
fitted_model.setPredictionCol("prediction")

LinearRegressionModel: uid=LinearRegression_e58ea41e8061, numFeatures=28

In [43]:
print("Coefficients: %s" % str(fitted_model.coefficients))
print("Intercept: %s" % str(fitted_model.intercept))

Coefficients: [337.0818632492976,93.5595694197587,47.33005229916656,5.286080298065325,773.0387862316394,244795.76863320876,-592286.9249628787,338288.30635431036,153267.77835276004,-252818.12827613624,26109.22698751017,254563.31523117996,-325576.25250535883,-411139.1924490009,-295788.5817926422,57864.74180526618,290491.83717791596,325326.22211433906,254066.52256990917,321016.2885688578,217442.9822779286,-161928.2952368918,411919.3148507102,51499.64962162176,-105660.63831199217,-115772.91441051033,-81333.39855837173,-244234.93225291188]
Intercept: -417760.2079404238


In [44]:
trainingSummary = fitted_model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

numIterations: 10
objectiveHistory: [0.5000000000000001, 0.44321933276509634, 0.27806217813371303, 0.23386401026686682, 0.22101246026480495, 0.21591106854388864, 0.2135995248588485, 0.21252502569687945, 0.2115934029517963, 0.2102398452611553, 0.21000704198004813]
+-------------------+
|          residuals|
+-------------------+
| -63451.67327323649|
|-232839.85795768027|
| -702187.3937950714|
| -56465.27390157509|
| -338471.6722439125|
|  -795404.555093281|
|-166298.30930010582|
| -379216.0467488667|
| -511161.2720112392|
|-405708.03186332976|
| -921228.9840909551|
| -568622.2433716478|
| -80100.48102615046|
| -321696.5808669223|
|-182054.49108906087|
| -410413.4029315987|
| -471974.2675065782|
|-207761.85405386062|
| -373892.8436053331|
| 11105.614446272473|
+-------------------+
only showing top 20 rows

RMSE: 1075375.730507
r2: 0.579986


In [45]:
train_df.describe().show()

+-------+------------------+
|summary|next_total_revenue|
+-------+------------------+
|  count|              2630|
|   mean| 650925.0130153717|
| stddev| 1659629.439120691|
|    min| 79.84148408949656|
|    max|1.92370744496674E7|
+-------+------------------+



##### Linear Regression Evaluation

In [46]:
lr_predictions = fitted_model.transform(test_df)
lr_predictions.select("prediction","next_total_revenue","features").show(5)
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="next_total_revenue",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+-------------------+------------------+--------------------+
|         prediction|next_total_revenue|            features|
+-------------------+------------------+--------------------+
|-122672.43666569167| 700.0913339069108|(28,[0,1,2,3,4],[...|
| 450677.85836874886|  814.625554790352|(28,[0,1,2,3,4],[...|
|  627738.3668494793| 6330.006413055507|(28,[0,1,2,3,4],[...|
|  233105.9482293317| 7714.789567461943|(28,[0,1,2,3,4],[...|
| 388118.40433603484| 22746.73586888107|(28,[0,1,2,3,4],[...|
+-------------------+------------------+--------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.548233


#### Decission Tree