# Prediction Model for BNPL Revenue

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import *
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV

## Data Preprocessing

In [2]:
full = pd.read_parquet('../data/curated/train_data/')
full = full.rename({'y_total_num_consumer': 'next_total_num_consumer', 'y_total_revenue': 'next_total_revenue', 'y_total_num_transaction': 'next_total_num_transaction'}, axis = 1)
full.dtypes

merchant_abn                    int64
total_num_consumer              int64
avg_dollar_value              float64
total_num_transaction           int64
mean_income                   float64
revenue_level                  object
total_revenue                 float64
total_num_postcode              int64
tag                            object
next_total_num_consumer       float64
next_total_revenue            float64
next_total_num_transaction    float64
dtype: object

### Clustering

tag_mean = full.groupBy('tag')\
      .agg(
         F.mean("total_revenue").alias("mean_revenue_of_tags")
      )

tag_mean = tag_mean.toPandas()

from sklearn.cluster import KMeans
import numpy as np

kmeans = KMeans(n_clusters=4, random_state=0).fit(np.array(tag_mean['mean_revenue_of_tags']).reshape(-1, 1))
kmeans.labels_

tag_mean['tag_labels'] = kmeans.labels_
tag_mean

tag_mean_sdf = spark.createDataFrame(tag_mean[['tag', 'tag_labels']])
tag_mean_sdf

full = full.join(tag_mean_sdf, ["tag"], how="left") 
# use left join here since if no historical data is provided, we cannot predict the future value of a merchant
full

## Model for BNPL Revenue
Features:
    total number of consumer, average dollar value, total number of transaction, mean income, total number of postcode, tags

label:
    next year revenue

In [3]:
# drop the columns not needed
revenue_df = full[['tag', 'total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode', 'next_total_revenue']].dropna()
revenue_df

Unnamed: 0,tag,total_num_consumer,total_num_transaction,total_revenue,total_num_postcode,next_total_revenue
0,furniture,808,823,3.096499e+04,719,3.894029e+04
1,cable,731,750,1.301437e+05,642,1.544679e+05
2,music,107,107,3.055638e+05,107,2.900710e+05
3,gift,2244,2370,5.891499e+05,1619,6.667664e+05
4,computer,130,130,2.288893e+05,127,2.731468e+05
...,...,...,...,...,...,...
3948,hobby,206,207,2.023275e+06,200,4.463707e+05
3949,cable,77,77,2.488764e+04,77,3.168081e+04
3950,digital goods,1464,1506,1.030649e+06,1164,1.225969e+06
3951,opticians,3920,4285,5.961703e+05,2255,7.204531e+05


### Indexing and One-hot Encoding

In [4]:
# change tags into numeric feature by one hot encoding
enc = OneHotEncoder(handle_unknown='ignore')
tag_ohe = enc.fit_transform([[i] for i in revenue_df['tag']]).toarray()
tag_ohe

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
revenue_df[enc.categories_[0]] = tag_ohe
revenue_df = revenue_df.dropna()
revenue_df

Unnamed: 0,tag,total_num_consumer,total_num_transaction,total_revenue,total_num_postcode,next_total_revenue,antique,art dealer,artist supply,bicycle,...,hobby,jewelry,motor,music,opticians,shoe,stationery,telecom,tent,watch
0,furniture,808,823,3.096499e+04,719,3.894029e+04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,cable,731,750,1.301437e+05,642,1.544679e+05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,music,107,107,3.055638e+05,107,2.900710e+05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,gift,2244,2370,5.891499e+05,1619,6.667664e+05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,computer,130,130,2.288893e+05,127,2.731468e+05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,hobby,206,207,2.023275e+06,200,4.463707e+05,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,cable,77,77,2.488764e+04,77,3.168081e+04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,digital goods,1464,1506,1.030649e+06,1164,1.225969e+06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3951,opticians,3920,4285,5.961703e+05,2255,7.204531e+05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


#### Linear Regression

In [6]:
features = ['total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode']
X = np.array(revenue_df[features])
y = np.array(revenue_df['next_total_revenue'])
reg = LinearRegression().fit(X, y)
reg.score(X, y)

0.9138791991547689

In [7]:
# missing values will not be included
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [8]:
lr = LinearRegression()
fitted_model = lr.fit(X_train, y_train)
fitted_model.score(X_test, y_test)

0.9415932753962846

In [9]:
print("Coefficients: %s" % str(fitted_model.coef_))
print("Intercept: %s" % str(fitted_model.intercept_))

Coefficients: [196.23347542 -13.83054958   0.8314655   26.42734243]
Intercept: -43132.43989124498


In [10]:
lr_predictions = fitted_model.predict(X_test)
lr_predictions

array([ 882401.20348866,   -8315.52860467,  -35557.19474653, ...,
       1911531.64463741,  -12551.07940471, 1423593.51173547])

In [11]:
show_result = pd.DataFrame({'y_pred': lr_predictions, 'y_true': y_test})
show_result

Unnamed: 0,y_pred,y_true
0,8.824012e+05,7.914105e+05
1,-8.315529e+03,1.920798e+04
2,-3.555719e+04,5.991649e+03
3,9.665839e+05,3.663320e+05
4,1.581418e+05,1.254175e+05
...,...,...
1131,-1.759834e+04,2.455700e+04
1132,8.743362e+05,1.011096e+06
1133,1.911532e+06,1.659757e+06
1134,-1.255108e+04,3.990979e+04


In [12]:
print("r2 score: ", r2_score(lr_predictions, y_test))
print("Mean Absolute Error: ", mean_absolute_error(lr_predictions, y_test))

r2 score:  0.9268186186560461
Mean Absolute Error:  180018.52856016337


#### Random Forest Regressor

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

rf = RandomForestRegressor(numTrees = 10, maxDepth = 10, labelCol='next_total_revenue')

model = rf.fit(train_df)
result = model.transform(test_df)

rf_evaluator = RegressionEvaluator(labelCol='next_total_revenue', metricName="mae", predictionCol='prediction')
mae = rf_evaluator.evaluate(result)
rf_evaluator = RegressionEvaluator(labelCol='next_total_revenue', metricName="r2", predictionCol='prediction')
r2 = rf_evaluator.evaluate(result)
print('+++++++++++++++++++++++++++++++++++++++++++')
print(f'Using Categorical feature: {features}')
print('mae:{}'.format(mae))
print('r2: {}'.format(r2))

result.select("prediction","next_total_revenue","features").show(20)

#### Gradient Boosting Tree

from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'next_total_revenue', maxIter=10)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'next_total_revenue', 'features').show(5)

gbt_evaluator = RegressionEvaluator(
    labelCol="next_total_revenue", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

#### Neural Network with full Features

In [13]:
revenue_df = revenue_df.drop(columns = 'tag')

In [14]:
X_train, X_test, y_train, y_test = train_test_split(revenue_df.loc[:, revenue_df.columns != 'next_total_revenue'], revenue_df['next_total_revenue'], test_size=0.3, random_state=0)
X_train.shape, X_test.shape

((2648, 28), (1136, 28))

In [15]:
# scale train and test dataset in order to be standard normally distributed with zero mean
sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)
X_trainscaled.shape, X_testscaled.shape

((2648, 28), (1136, 28))

In [16]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  0.9920119533292459


In [17]:
df_result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_result.head()

Unnamed: 0,Actual,Predicted
558,791410.48815,779728.676977
546,19207.983183,15188.471175
2308,5991.64879,11770.005382
2996,366331.976097,296494.99277
807,125417.488383,122897.736047


In [18]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('r2 Score:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 57965.328468027736
r2 Score: 0.9915130006504347


### Neural Network with selected Features

In [19]:
full

Unnamed: 0,merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,tag,next_total_num_consumer,next_total_revenue,next_total_num_transaction
0,10023283211,808,209.025160,823,62506.642770,e,3.096499e+04,719,furniture,980.0,3.894029e+04,1002.0
1,10142254217,731,41.119659,750,62693.954667,b,1.301437e+05,642,cable,918.0,1.544679e+05,925.0
2,10192359162,107,451.143208,107,65637.383178,a,3.055638e+05,107,music,107.0,2.900710e+05,107.0
3,10206519221,2244,39.209217,2370,62122.522785,a,5.891499e+05,1619,gift,2662.0,6.667664e+05,2811.0
4,10279061213,130,308.351500,130,63676.284615,a,2.288893e+05,127,computer,161.0,2.731468e+05,161.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3948,99845294286,206,1705.807601,207,63777.541063,a,2.023275e+06,200,hobby,94.0,4.463707e+05,95.0
3949,99861963809,77,112.618854,77,66267.415584,c,2.488764e+04,77,cable,92.0,3.168081e+04,92.0
3950,99904689266,1464,102.911540,1506,63133.881142,a,1.030649e+06,1164,digital goods,1669.0,1.225969e+06,1739.0
3951,99938978285,3920,30.917688,4285,62782.274912,b,5.961703e+05,2255,opticians,4614.0,7.204531e+05,5104.0


In [20]:
revenue_df = full[['total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode', 'next_total_revenue']].dropna()

In [21]:
X_train, X_test, y_train, y_test = train_test_split(revenue_df.loc[:, revenue_df.columns != 'next_total_revenue'], revenue_df['next_total_revenue'], test_size=0.3, random_state=0)
X_train.shape, X_test.shape

((2648, 4), (1136, 4))

In [22]:
# scale train and test dataset in order to be standard normally distributed with zero mean
sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)
X_trainscaled.shape, X_testscaled.shape

((2648, 4), (1136, 4))

In [23]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  0.9925304875830819


In [24]:
df_result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_result.head()

Unnamed: 0,Actual,Predicted
558,791410.48815,794999.312099
546,19207.983183,40449.79483
2308,5991.64879,12030.082371
2996,366331.976097,535432.138932
807,125417.488383,150132.722725


In [25]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('r2 Score:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 60109.31774047939
r2 Score: 0.9922984742656825
