# Prediction Model for BNPL Revenue

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import *
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV

## Data Preprocessing

In [2]:
full = pd.read_csv('../data/curated/full_consumer.csv').drop(columns = 'Unnamed: 0')
full = full.rename({'y_total_num_consumer': 'next_total_num_consumer', 'y_total_revenue': 'next_total_revenue', 'y_total_num_transaction': 'next_total_num_transaction'}, axis = 1)
full.dtypes

merchant_abn                    int64
total_num_consumer              int64
avg_dollar_value              float64
total_num_transaction           int64
mean_income                   float64
revenue_level                  object
total_revenue                 float64
total_num_postcode              int64
tag                            object
next_total_num_consumer       float64
next_total_revenue            float64
next_total_num_transaction    float64
dtype: object

In [3]:
full.isnull().sum()

merchant_abn                    0
total_num_consumer              0
avg_dollar_value                0
total_num_transaction           0
mean_income                     0
revenue_level                   0
total_revenue                   0
total_num_postcode              0
tag                             0
next_total_num_consumer         0
next_total_revenue            171
next_total_num_transaction    171
dtype: int64

### Clustering

tag_mean = full.groupBy('tag')\
      .agg(
         F.mean("total_revenue").alias("mean_revenue_of_tags")
      )

tag_mean = tag_mean.toPandas()

from sklearn.cluster import KMeans
import numpy as np

kmeans = KMeans(n_clusters=4, random_state=0).fit(np.array(tag_mean['mean_revenue_of_tags']).reshape(-1, 1))
kmeans.labels_

tag_mean['tag_labels'] = kmeans.labels_
tag_mean

tag_mean_sdf = spark.createDataFrame(tag_mean[['tag', 'tag_labels']])
tag_mean_sdf

full = full.join(tag_mean_sdf, ["tag"], how="left") 
# use left join here since if no historical data is provided, we cannot predict the future value of a merchant
full

## Model for BNPL Revenue
Features:
    total number of consumer, average dollar value, total number of transaction, mean income, total number of postcode, tags

label:
    next year revenue

In [4]:
# drop the columns not needed
revenue_df = full[['tag', 'total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode', 'next_total_revenue']]
revenue_df

Unnamed: 0,tag,total_num_consumer,total_num_transaction,total_revenue,total_num_postcode,next_total_revenue
0,furniture,808,823,3.096499e+04,719,3.894029e+04
1,cable,731,750,1.301437e+05,642,1.544679e+05
2,watch,87,87,3.179560e+04,87,4.168321e+04
3,music,107,107,3.055638e+05,107,2.900710e+05
4,gift,2244,2370,5.891499e+05,1619,6.667664e+05
...,...,...,...,...,...,...
3948,opticians,3920,4285,5.961703e+05,2255,7.204531e+05
3949,books,33,33,3.339252e+04,33,3.991901e+04
3950,shoe,5353,6027,5.944143e+06,2607,6.974851e+06
3951,motor,45,45,1.084952e+05,45,1.762053e+05


### Indexing and One-hot Encoding

In [5]:
# change tags into numeric feature by one hot encoding
enc = OneHotEncoder(handle_unknown='ignore')
tag_ohe = enc.fit_transform([[i] for i in revenue_df['tag']]).toarray()
tag_ohe

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
revenue_df[enc.categories_[0]] = tag_ohe
revenue_df_full = revenue_df
revenue_df = revenue_df.dropna()
revenue_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  revenue_df[enc.categories_[0]] = tag_ohe
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  revenue_df[enc.categories_[0]] = tag_ohe
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  revenue_df[enc.categories_[0]] = tag_ohe
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Unnamed: 0,tag,total_num_consumer,total_num_transaction,total_revenue,total_num_postcode,next_total_revenue,antique,art dealer,artist supply,bicycle,...,hobby,jewelry,motor,music,opticians,shoe,stationery,telecom,tent,watch
0,furniture,808,823,3.096499e+04,719,3.894029e+04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,cable,731,750,1.301437e+05,642,1.544679e+05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,watch,87,87,3.179560e+04,87,4.168321e+04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,music,107,107,3.055638e+05,107,2.900710e+05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,gift,2244,2370,5.891499e+05,1619,6.667664e+05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,opticians,3920,4285,5.961703e+05,2255,7.204531e+05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3949,books,33,33,3.339252e+04,33,3.991901e+04,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,shoe,5353,6027,5.944143e+06,2607,6.974851e+06,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3951,motor,45,45,1.084952e+05,45,1.762053e+05,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Linear Regression

In [7]:
features = ['total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode']
X = np.array(revenue_df[features])
y = np.array(revenue_df['next_total_revenue'])
reg = LinearRegression().fit(X, y)
reg.score(X, y)

0.9145274219290362

In [8]:
# missing values will not be included
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [9]:
lr = LinearRegression()
fitted_model = lr.fit(X_train, y_train)
fitted_model.score(X_test, y_test)

0.9095667118775665

In [10]:
print("Coefficients: %s" % str(fitted_model.coef_))
print("Intercept: %s" % str(fitted_model.intercept_))

Coefficients: [207.008583   -17.63035964   0.87878747  -5.15602086]
Intercept: -51190.97019904165


In [11]:
lr_predictions = fitted_model.predict(X_test)
lr_predictions

array([ 248805.65773905, 1158642.4918648 , 2416379.44579371, ...,
        326442.49368191,  230201.47068545,  -41666.86600863])

In [12]:
show_result = pd.DataFrame({'y_pred': lr_predictions, 'y_true': y_test})
show_result

Unnamed: 0,y_pred,y_true
0,2.488057e+05,9.365932e+04
1,1.158642e+06,1.310241e+06
2,2.416379e+06,2.035351e+06
3,2.603267e+05,3.458201e+05
4,9.907708e+05,1.140788e+06
...,...,...
1130,-1.792473e+04,3.554826e+03
1131,-2.875925e+04,2.420074e+04
1132,3.264425e+05,3.604291e+05
1133,2.302015e+05,2.691051e+05


In [13]:
print("r2 score: ", r2_score(lr_predictions, y_test))
print("Mean Absolute Error: ", mean_absolute_error(lr_predictions, y_test))

r2 score:  0.906468949785756
Mean Absolute Error:  180726.96669297802


#### Random Forest Regressor

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

rf = RandomForestRegressor(numTrees = 10, maxDepth = 10, labelCol='next_total_revenue')

model = rf.fit(train_df)
result = model.transform(test_df)

rf_evaluator = RegressionEvaluator(labelCol='next_total_revenue', metricName="mae", predictionCol='prediction')
mae = rf_evaluator.evaluate(result)
rf_evaluator = RegressionEvaluator(labelCol='next_total_revenue', metricName="r2", predictionCol='prediction')
r2 = rf_evaluator.evaluate(result)
print('+++++++++++++++++++++++++++++++++++++++++++')
print(f'Using Categorical feature: {features}')
print('mae:{}'.format(mae))
print('r2: {}'.format(r2))

result.select("prediction","next_total_revenue","features").show(20)

#### Gradient Boosting Tree

from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'next_total_revenue', maxIter=10)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'next_total_revenue', 'features').show(5)

gbt_evaluator = RegressionEvaluator(
    labelCol="next_total_revenue", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

#### Neural Network with full Features

In [14]:
revenue_df = revenue_df.drop(columns = 'tag')

In [15]:
X_train, X_test, y_train, y_test = train_test_split(revenue_df.loc[:, revenue_df.columns != 'next_total_revenue'], revenue_df['next_total_revenue'], test_size=0.3, random_state=0)
X_train.shape, X_test.shape

((2647, 28), (1135, 28))

In [16]:
# scale train and test dataset in order to be standard normally distributed with zero mean
sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)
X_trainscaled.shape, X_testscaled.shape

((2647, 28), (1135, 28))

In [17]:
mlp_reg_full = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg_full.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  0.992043863165874


In [18]:
df_result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_result.head()

Unnamed: 0,Actual,Predicted
565,93659.32,41003.08
553,1310241.0,1328190.0
2840,2035351.0,2047680.0
3503,345820.1,280253.9
808,1140788.0,1191294.0


In [19]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('r2 Score:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 55124.22950620868
r2 Score: 0.9916511794090259


### Neural Network with selected Features

In [20]:
full

Unnamed: 0,merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,tag,next_total_num_consumer,next_total_revenue,next_total_num_transaction
0,10023283211,808,209.025160,823,62506.642770,e,3.096499e+04,719,furniture,980.0,3.894029e+04,1002.0
1,10142254217,731,41.119659,750,62693.954667,b,1.301437e+05,642,cable,918.0,1.544679e+05,925.0
2,10187291046,87,111.084087,87,61060.045977,b,3.179560e+04,87,watch,99.0,4.168321e+04,100.0
3,10192359162,107,451.143208,107,65637.383178,a,3.055638e+05,107,music,107.0,2.900710e+05,107.0
4,10206519221,2244,39.209217,2370,62122.522785,a,5.891499e+05,1619,gift,2662.0,6.667664e+05,2811.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3948,99938978285,3920,30.917688,4285,62782.274912,b,5.961703e+05,2255,opticians,4614.0,7.204531e+05,5104.0
3949,99974311662,33,319.209608,33,60450.212121,b,3.339252e+04,33,books,40.0,3.991901e+04,40.0
3950,99976658299,5353,150.114508,6027,62576.340468,a,5.944143e+06,2607,shoe,6134.0,6.974851e+06,7063.0
3951,99987905597,45,353.519570,45,61502.444444,a,1.084952e+05,45,motor,69.0,1.762053e+05,69.0


In [21]:
revenue_df = full[['total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode', 'next_total_revenue']].dropna()

In [22]:
X_train, X_test, y_train, y_test = train_test_split(revenue_df.loc[:, revenue_df.columns != 'next_total_revenue'], revenue_df['next_total_revenue'], test_size=0.3, random_state=0)
X_train.shape, X_test.shape

((2647, 4), (1135, 4))

In [23]:
# scale train and test dataset in order to be standard normally distributed with zero mean
sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)
X_trainscaled.shape, X_testscaled.shape

((2647, 4), (1135, 4))

In [24]:
mlp_reg_selected = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg_selected.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  0.9878753168496073


In [25]:
df_result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_result.head()

Unnamed: 0,Actual,Predicted
565,93659.32,166574.0
553,1310241.0,1250629.0
2840,2035351.0,1987217.0
3503,345820.1,283813.8
808,1140788.0,1159126.0


In [26]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('r2 Score:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 66289.41007008284
r2 Score: 0.9874455676129257


## Prediction

In [27]:
revenue_df_full.isnull().sum()

tag                        0
total_num_consumer         0
total_num_transaction      0
total_revenue              0
total_num_postcode         0
next_total_revenue       171
antique                    0
art dealer                 0
artist supply              0
bicycle                    0
books                      0
cable                      0
computer                   0
digital goods              0
equipment                  0
florists                   0
furniture                  0
garden supply              0
gift                       0
health                     0
hobby                      0
jewelry                    0
motor                      0
music                      0
opticians                  0
shoe                       0
stationery                 0
telecom                    0
tent                       0
watch                      0
dtype: int64

In [28]:

# only the missing value needs to be predicted by the model (neuron network with full features)
train_pred = revenue_df_full.loc[revenue_df_full['next_total_revenue'].isnull()].drop(columns = ['tag', 'next_total_revenue'])
train_pred.shape

(171, 28)

In [29]:
# predict the missing value with pre-trained neural network model
revenue_pred = mlp_reg_full.predict(train_pred)
train_pred['next_total_revenue'] = revenue_pred.tolist()
train_pred



Unnamed: 0,total_num_consumer,total_num_transaction,total_revenue,total_num_postcode,antique,art dealer,artist supply,bicycle,books,cable,...,jewelry,motor,music,opticians,shoe,stationery,telecom,tent,watch,next_total_revenue
10,5,5,23758.806015,5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.709379e+10
16,1,1,56170.128286,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.038180e+10
23,8,8,571244.079843,8,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.106850e+11
29,1,1,32971.716744,1,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.370432e+10
48,4,4,265915.804518,4,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.911748e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3858,11,11,93883.360758,11,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.752297e+10
3864,27,27,461374.870800,27,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.317571e+11
3868,7,7,354552.824888,7,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.549046e+11
3916,8,8,441812.668866,8,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.176378e+11


In [30]:
full.isnull().sum()

merchant_abn                    0
total_num_consumer              0
avg_dollar_value                0
total_num_transaction           0
mean_income                     0
revenue_level                   0
total_revenue                   0
total_num_postcode              0
tag                             0
next_total_num_consumer         0
next_total_revenue            171
next_total_num_transaction    171
dtype: int64

In [32]:
# combine the missing value prediction with origin dataset
final_full = full.fillna(train_pred)
# check for existance of missing values
final_full.isnull().sum()

merchant_abn                    0
total_num_consumer              0
avg_dollar_value                0
total_num_transaction           0
mean_income                     0
revenue_level                   0
total_revenue                   0
total_num_postcode              0
tag                             0
next_total_num_consumer         0
next_total_revenue              0
next_total_num_transaction    171
dtype: int64

In [33]:
final_full.to_csv('../data/curated/full_consumer_revenue.csv')

In [34]:
final_full

Unnamed: 0,merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,tag,next_total_num_consumer,next_total_revenue,next_total_num_transaction
0,10023283211,808,209.025160,823,62506.642770,e,3.096499e+04,719,furniture,980.0,3.894029e+04,1002.0
1,10142254217,731,41.119659,750,62693.954667,b,1.301437e+05,642,cable,918.0,1.544679e+05,925.0
2,10187291046,87,111.084087,87,61060.045977,b,3.179560e+04,87,watch,99.0,4.168321e+04,100.0
3,10192359162,107,451.143208,107,65637.383178,a,3.055638e+05,107,music,107.0,2.900710e+05,107.0
4,10206519221,2244,39.209217,2370,62122.522785,a,5.891499e+05,1619,gift,2662.0,6.667664e+05,2811.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3948,99938978285,3920,30.917688,4285,62782.274912,b,5.961703e+05,2255,opticians,4614.0,7.204531e+05,5104.0
3949,99974311662,33,319.209608,33,60450.212121,b,3.339252e+04,33,books,40.0,3.991901e+04,40.0
3950,99976658299,5353,150.114508,6027,62576.340468,a,5.944143e+06,2607,shoe,6134.0,6.974851e+06,7063.0
3951,99987905597,45,353.519570,45,61502.444444,a,1.084952e+05,45,motor,69.0,1.762053e+05,69.0
