# Prediction Model for BNPL next Transaction

In [27]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import *
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV

## Data Preprocessing

In [28]:
full = pd.read_csv('../data/curated/full_consumer_revenue.csv').drop(columns = 'Unnamed: 0')
full = full.rename({'y_total_num_consumer': 'next_total_num_consumer', 'y_total_revenue': 'next_total_revenue', 'y_total_num_transaction': 'next_total_num_transaction'}, axis = 1)
full.dtypes

merchant_abn                    int64
total_num_consumer              int64
avg_dollar_value              float64
total_num_transaction           int64
mean_income                   float64
revenue_level                  object
total_revenue                 float64
total_num_postcode              int64
tag                            object
next_total_num_consumer       float64
next_total_revenue            float64
next_total_num_transaction    float64
dtype: object

## Model for BNPL number of transactions
Features:
    total number of consumer, average dollar value, total number of transaction, mean income, total number of postcode, tags

label:
    next year revenue

In [29]:
# drop the columns not needed
transaction_df = full[['tag', 'total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode', 'next_total_num_transaction']].dropna()
transaction_df

Unnamed: 0,tag,total_num_consumer,total_num_transaction,total_revenue,total_num_postcode,next_total_num_transaction
0,furniture,808,823,3.096499e+04,719,1002.0
1,cable,731,750,1.301437e+05,642,925.0
2,watch,87,87,3.179560e+04,87,100.0
3,music,107,107,3.055638e+05,107,107.0
4,gift,2244,2370,5.891499e+05,1619,2811.0
...,...,...,...,...,...,...
3948,opticians,3920,4285,5.961703e+05,2255,5104.0
3949,books,33,33,3.339252e+04,33,40.0
3950,shoe,5353,6027,5.944143e+06,2607,7063.0
3951,motor,45,45,1.084952e+05,45,69.0


### Indexing and One-hot Encoding

In [30]:
# change tags into numeric feature by one hot encoding
enc = OneHotEncoder(handle_unknown='ignore')
tag_ohe = enc.fit_transform([[i] for i in transaction_df['tag']]).toarray()
tag_ohe

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
transaction_df[enc.categories_[0]] = tag_ohe
transaction_df = transaction_df.dropna()
transaction_df

Unnamed: 0,tag,total_num_consumer,total_num_transaction,total_revenue,total_num_postcode,next_total_num_transaction,antique,art dealer,artist supply,bicycle,...,hobby,jewelry,motor,music,opticians,shoe,stationery,telecom,tent,watch
0,furniture,808,823,3.096499e+04,719,1002.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,cable,731,750,1.301437e+05,642,925.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,watch,87,87,3.179560e+04,87,100.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,music,107,107,3.055638e+05,107,107.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,gift,2244,2370,5.891499e+05,1619,2811.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,opticians,3920,4285,5.961703e+05,2255,5104.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3949,books,33,33,3.339252e+04,33,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,shoe,5353,6027,5.944143e+06,2607,7063.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3951,motor,45,45,1.084952e+05,45,69.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Linear Regression

In [32]:
features = ['total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode']
X = np.array(transaction_df[features])
y = np.array(transaction_df['next_total_num_transaction'])
reg = LinearRegression().fit(X, y)
reg.score(X, y)

0.9998127982453174

In [33]:
# missing values will not be included
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [34]:
lr = LinearRegression()
fitted_model = lr.fit(X_train, y_train)
fitted_model.score(X_test, y_test)

0.9997298258340519

In [35]:
print("Coefficients: %s" % str(fitted_model.coef_))
print("Intercept: %s" % str(fitted_model.intercept_))

Coefficients: [ 1.97266437e-02  1.20750661e+00 -2.64365541e-05 -2.03838625e-02]
Intercept: -3.28566357822433


In [36]:
lr_predictions = fitted_model.predict(X_test)
lr_predictions

array([  48.32964744, 1713.54310498, 7286.87564828, ...,  746.65730519,
        457.87485922,   12.18891042])

In [37]:
show_result = pd.DataFrame({'y_pred': lr_predictions, 'y_true': y_test})
show_result

Unnamed: 0,y_pred,y_true
0,48.329647,25.0
1,1713.543105,1777.0
2,7286.875648,7147.0
3,620.970058,704.0
4,1135.334996,1115.0
...,...,...
1130,-1.861629,1.0
1131,25.136916,30.0
1132,746.657305,773.0
1133,457.874859,446.0


In [38]:
print("r2 score: ", r2_score(lr_predictions, y_test))
print("Mean Absolute Error: ", mean_absolute_error(lr_predictions, y_test))

r2 score:  0.999731590399934
Mean Absolute Error:  32.1389277591878


#### Neural Network with full Features

In [39]:
transaction_df = transaction_df.drop(columns = 'tag')

In [40]:
X_train, X_test, y_train, y_test = train_test_split(transaction_df.loc[:, transaction_df.columns != 'next_total_num_transaction'], transaction_df['next_total_num_transaction'], test_size=0.3, random_state=0)
X_train.shape, X_test.shape

((2647, 28), (1135, 28))

In [41]:
# scale train and test dataset in order to be standard normally distributed with zero mean
sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)
X_trainscaled.shape, X_testscaled.shape

((2647, 28), (1135, 28))

In [42]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  0.9991749525599471


In [43]:
df_result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_result.head()

Unnamed: 0,Actual,Predicted
565,25.0,9.697376
553,1777.0,1679.496871
2840,7147.0,7218.387155
3503,704.0,590.080906
808,1115.0,1132.048999


In [44]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('r2 Score:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 42.335526350111
r2 Score: 0.9991571419404487


### Neural Network with selected Features

In [45]:
full

Unnamed: 0,merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,tag,next_total_num_consumer,next_total_revenue,next_total_num_transaction
0,10023283211,808,209.025160,823,62506.642770,e,3.096499e+04,719,furniture,980.0,3.894029e+04,1002.0
1,10142254217,731,41.119659,750,62693.954667,b,1.301437e+05,642,cable,918.0,1.544679e+05,925.0
2,10187291046,87,111.084087,87,61060.045977,b,3.179560e+04,87,watch,99.0,4.168321e+04,100.0
3,10192359162,107,451.143208,107,65637.383178,a,3.055638e+05,107,music,107.0,2.900710e+05,107.0
4,10206519221,2244,39.209217,2370,62122.522785,a,5.891499e+05,1619,gift,2662.0,6.667664e+05,2811.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3948,99938978285,3920,30.917688,4285,62782.274912,b,5.961703e+05,2255,opticians,4614.0,7.204531e+05,5104.0
3949,99974311662,33,319.209608,33,60450.212121,b,3.339252e+04,33,books,40.0,3.991901e+04,40.0
3950,99976658299,5353,150.114508,6027,62576.340468,a,5.944143e+06,2607,shoe,6134.0,6.974851e+06,7063.0
3951,99987905597,45,353.519570,45,61502.444444,a,1.084952e+05,45,motor,69.0,1.762053e+05,69.0


In [46]:
transaction_df = full[['total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode', 'next_total_num_transaction']].dropna()

In [47]:
X_train, X_test, y_train, y_test = train_test_split(transaction_df.loc[:, transaction_df.columns != 'next_total_num_transaction'], transaction_df['next_total_num_transaction'], test_size=0.3, random_state=0)
X_train.shape, X_test.shape

((2647, 4), (1135, 4))

In [48]:
# scale train and test dataset in order to be standard normally distributed with zero mean
sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)
X_trainscaled.shape, X_testscaled.shape

((2647, 4), (1135, 4))

In [49]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred=mlp_reg.predict(X_testscaled)
print("The Score with ", (metrics.r2_score(y_pred, y_test)))

The Score with  0.9997470423955768


In [50]:
df_result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_result.head()

Unnamed: 0,Actual,Predicted
565,25.0,17.179347
553,1777.0,1735.684878
2840,7147.0,7245.423368
3503,704.0,620.670424
808,1115.0,1153.675284


In [51]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('r2 Score:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 32.14781163771716
r2 Score: 0.9997442001029064


## Prediction

In [52]:
full_pred = full[['total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode', 'next_total_num_transaction']]
# only the missing value needs to be predicted by the model
train_pred = full_pred.loc[full_pred['next_total_num_transaction'].isnull()][['total_num_consumer', 'total_num_transaction', 'total_revenue', 'total_num_postcode']]
train_pred.shape

(171, 4)

In [53]:
# predict the missing value with pre-trained neural network modelrevenue_pred = mlp_reg.predict(train_pred)
train_pred['next_total_num_transaction'] = revenue_pred.tolist()
train_pred



Unnamed: 0,total_num_consumer,total_num_transaction,total_revenue,total_num_postcode,next_total_num_transaction
10,5,5,23758.806015,5,1.780649e+06
16,1,1,56170.128286,1,4.169002e+06
23,8,8,571244.079843,8,4.239002e+07
29,1,1,32971.716744,1,2.448754e+06
48,4,4,265915.804518,4,1.973373e+07
...,...,...,...,...,...
3858,11,11,93883.360758,11,7.003246e+06
3864,27,27,461374.870800,27,3.431438e+07
3868,7,7,354552.824888,7,2.631779e+07
3916,8,8,441812.668866,8,3.279220e+07


In [54]:
full.isnull().sum()

merchant_abn                    0
total_num_consumer              0
avg_dollar_value                0
total_num_transaction           0
mean_income                     0
revenue_level                   0
total_revenue                   0
total_num_postcode              0
tag                             0
next_total_num_consumer         0
next_total_revenue              0
next_total_num_transaction    171
dtype: int64

In [55]:
# combine the missing value prediction with origin dataset
final_full = full.fillna(train_pred)

# check for existance of missing values
final_full.isnull().sum()

merchant_abn                  0
total_num_consumer            0
avg_dollar_value              0
total_num_transaction         0
mean_income                   0
revenue_level                 0
total_revenue                 0
total_num_postcode            0
tag                           0
next_total_num_consumer       0
next_total_revenue            0
next_total_num_transaction    0
dtype: int64

In [56]:
final_full.to_csv('../data/curated/full_no_missing.csv')

In [60]:
final_full

Unnamed: 0,merchant_abn,total_num_consumer,avg_dollar_value,total_num_transaction,mean_income,revenue_level,total_revenue,total_num_postcode,tag,next_total_num_consumer,next_total_revenue,next_total_num_transaction
0,10023283211,808,209.025160,823,62506.642770,e,3.096499e+04,719,furniture,980.0,3.894029e+04,1002.0
1,10142254217,731,41.119659,750,62693.954667,b,1.301437e+05,642,cable,918.0,1.544679e+05,925.0
2,10187291046,87,111.084087,87,61060.045977,b,3.179560e+04,87,watch,99.0,4.168321e+04,100.0
3,10192359162,107,451.143208,107,65637.383178,a,3.055638e+05,107,music,107.0,2.900710e+05,107.0
4,10206519221,2244,39.209217,2370,62122.522785,a,5.891499e+05,1619,gift,2662.0,6.667664e+05,2811.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3948,99938978285,3920,30.917688,4285,62782.274912,b,5.961703e+05,2255,opticians,4614.0,7.204531e+05,5104.0
3949,99974311662,33,319.209608,33,60450.212121,b,3.339252e+04,33,books,40.0,3.991901e+04,40.0
3950,99976658299,5353,150.114508,6027,62576.340468,a,5.944143e+06,2607,shoe,6134.0,6.974851e+06,7063.0
3951,99987905597,45,353.519570,45,61502.444444,a,1.084952e+05,45,motor,69.0,1.762053e+05,69.0
