In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neural_network import MLPRegressor
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

### Read in Data

In [None]:
# Read the curated training and test data from parquet.
train_df = pd.read_parquet(path='../data/curated/train_data/')
test_df = pd.read_parquet(path='../data/curated/test_data/')
# Cell output: the dtype of every training column.
train_df.dtypes

### Inspect Correlation
- categorical variables: tag, revenue level  
Use an ANOVA test to examine the significance of these variables in predicting each target variable.

- continuous variables  
Calculate the Pearson correlation of each pair of continuous variables.

In [None]:
# ANOVA (typ=2) on an OLS fit of y_total_num_consumer against the two
# categorical predictors, tag and revenue_level.
consumer_ols = ols(formula='y_total_num_consumer ~ C(tag) + C(revenue_level)', data=train_df)
model = consumer_ols.fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

In [None]:
# ANOVA (typ=2) on an OLS fit of y_total_revenue against the two
# categorical predictors, tag and revenue_level.
revenue_ols = ols(formula='y_total_revenue ~ C(tag) + C(revenue_level)', data=train_df)
model = revenue_ols.fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

In [None]:
# ANOVA (typ=2) on an OLS fit of y_total_num_transaction against the two
# categorical predictors, tag and revenue_level.
transaction_ols = ols(formula='y_total_num_transaction ~ C(tag) + C(revenue_level)', data=train_df)
model = transaction_ols.fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

#### Observation
"tag" is significant in all models, whereas "revenue level" is not a significant feature in predicting total number of consumers and transactions

In [None]:
# Pearson correlation heatmap for the continuous variables.
# NOTE(review): the tick labels are hand-written and assume the numeric columns of
# train_df (after dropping merchant_abn) appear in exactly this order — confirm
# against the schema if the curated data changes.
x_axis = ["Total num Consumer", "Average Dollar Value", "Total num Transaction", "Mean Income", "Total Revenue", "Total num Postcode", "Next Total num Consumer", "Next Total Revenue", "Next Total num Transaction"]

# train_df still contains the categorical `tag` and `revenue_level` columns here.
# pandas >= 2.0 raises when corr() meets non-numeric data instead of silently
# dropping it, so restrict the frame to numeric columns explicitly.
numeric_df = train_df.drop('merchant_abn', axis=1).select_dtypes(include='number')

plt.figure(figsize = (10, 10))
sns.heatmap(numeric_df.corr(), annot = True, annot_kws={"size": 12}, xticklabels=x_axis, yticklabels=x_axis)
plt.title('Pearson Correlation Metric', size = 20)
plt.savefig("../plots/Pearson Correlation Metric.png", bbox_inches = "tight")

#### Observation
"Mean income" and "average dollar value" have little correlation with the target variables. Therefore, they can be excluded from the models.

### Data Preprocessing

In [None]:
# Keep only the columns used for modelling, discard rows with missing values,
# and index both frames by merchant_abn.
train_columns = [
    'merchant_abn', 'total_num_consumer', 'tag', 'total_revenue', 'revenue_level',
    'total_num_postcode', 'total_num_transaction',
    'y_total_num_consumer', 'y_total_revenue', 'y_total_num_transaction',
]
train_df = train_df[train_columns].dropna().set_index('merchant_abn')

# The test frame carries the same predictors but none of the y_* target columns.
test_columns = [
    'merchant_abn', 'total_num_consumer', 'tag', 'total_num_postcode', 'total_revenue',
    'revenue_level', 'total_num_transaction',
]
test_df = test_df[test_columns].dropna().set_index('merchant_abn')

In [None]:
# Turn tag and revenue level into numeric features using one-hot encoding.
cat_features = ["tag", "revenue_level"]
train_df = pd.get_dummies(train_df, columns = cat_features)
test_df = pd.get_dummies(test_df, columns = cat_features)

# Encoding the two frames independently can yield different dummy columns when a
# category occurs in only one of them, which would break prediction on test_df.
# Align test_df to the training feature columns (everything except the y_* targets):
# dummies missing from test are added as 0, and dummies unseen in training are dropped.
target_cols = ['y_total_num_consumer', 'y_total_revenue', 'y_total_num_transaction']
test_df = test_df.reindex(
    columns=[col for col in train_df.columns if col not in target_cols],
    fill_value=0,
)

train_df.head()

## Models for Total Number of Consumers

### Linear Regression

In [None]:
# Target columns; these must never be used as predictors.
labels = ['y_total_num_consumer', 'y_total_revenue', 'y_total_num_transaction']
# Predictors: every non-target column except those whose name starts with
# 'revenue' (the revenue_level dummies).
features = [
    col for col in train_df.columns
    if not (col in labels or col.startswith('revenue'))
]
X = train_df[features]
y = train_df['y_total_num_consumer']

In [None]:
# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit ordinary least squares on the training part and report its parameters.
lr = LinearRegression()
fitted_model = lr.fit(X_train, y_train)
print(f"Intercept: {fitted_model.intercept_!s}")
coef = pd.DataFrame({"feature": features}).assign(coefficient=lr.coef_)
coef.head(10)

In [None]:
# Predict on the hold-out rows and place predictions next to the true values
# (rows keep y_test's index).
y_pred = fitted_model.predict(X_test)
result = y_test.to_frame(name='true_total_num_consumer')
result.insert(0, 'predicted_total_num_consumer', y_pred)
result

In [None]:
# Rows where the linear model predicts a non-positive consumer count.
result[result['predicted_total_num_consumer'].le(0)]

In [None]:
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric in its
# arguments, so passing the predictions first reports a different score.
print("r2 score:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

In [None]:
# 10-fold cross-validation of the linear model; report the mean of each metric
# over the folds, for both the training and the validation side.
cv_metrics = ('r2', 'neg_mean_absolute_error')
scores = cross_validate(lr, X, y, cv=10, scoring=cv_metrics, return_train_score=True)
for split in ("train", "test"):
    print("%s r2 score: %0.4f" % (split, scores[split + '_r2'].mean()))
for split in ("train", "test"):
    # The scorer returns negated MAE, so flip the sign back for display.
    print("%s mean absolute error:: %0.4f" % (split, -scores[split + '_neg_mean_absolute_error'].mean()))

### Neural Network

In [None]:
# Standardise the features to zero mean and unit variance. The scaler is fitted
# on the training split only and then applied unchanged to the test split.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_trainscaled = sc_X.transform(X_train)
X_testscaled = sc_X.transform(X_test)
X_trainscaled.shape, X_testscaled.shape

In [None]:
# MLP with four hidden layers of 128 units each (ReLU, Adam), seeded for repeatability.
mlp_reg = MLPRegressor(
    hidden_layer_sizes=(128, 128, 128, 128),
    activation="relu",
    solver='adam',
    random_state=30034,
    max_iter=20000,
)
mlp_reg.fit(X_trainscaled, y_train)
y_pred = mlp_reg.predict(X_testscaled)

In [None]:
# Network predictions next to the true hold-out values (rows keep y_test's index).
df_result = y_test.to_frame(name='y_true')
df_result.insert(0, 'y_pred', y_pred)
df_result

In [None]:
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric in its
# arguments, so passing the predictions first reports a different score.
print('r2 score:', r2_score(y_test, y_pred))
print('mean absolute error:', mean_absolute_error(y_test, y_pred))

## Models for Total Number of Transactions


### Linear Regression

In [None]:
# Target columns; these must never be used as predictors.
labels = ['y_total_num_consumer', 'y_total_revenue', 'y_total_num_transaction']
# Predictors: every non-target column except those whose name starts with
# 'revenue' (the revenue_level dummies).
features = [
    col for col in train_df.columns
    if not (col in labels or col.startswith('revenue'))
]
X = train_df[features]
y = train_df['y_total_num_transaction']

In [None]:
# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit ordinary least squares on the training part and report its parameters.
lr = LinearRegression()
fitted_model = lr.fit(X_train, y_train)
print(f"Intercept: {fitted_model.intercept_!s}")
coef = pd.DataFrame({"feature": features}).assign(coefficient=lr.coef_)
coef

In [None]:
# Predict on the hold-out rows and place predictions next to the true values
# (rows keep y_test's index).
y_pred = fitted_model.predict(X_test)
result = y_test.to_frame(name='true_total_num_transaction')
result.insert(0, 'predicted_total_num_transaction', y_pred)
result

In [None]:
# Rows where the linear model predicts a non-positive transaction count.
result[result['predicted_total_num_transaction'].le(0)]

In [None]:
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric in its
# arguments, so passing the predictions first reports a different score.
print("r2 score:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

In [None]:
# 10-fold cross-validation of the linear model; report the mean of each metric
# over the folds, for both the training and the validation side.
cv_metrics = ('r2', 'neg_mean_absolute_error')
scores = cross_validate(lr, X, y, cv=10, scoring=cv_metrics, return_train_score=True)
for split in ("train", "test"):
    print("%s r2 score: %0.4f" % (split, scores[split + '_r2'].mean()))
for split in ("train", "test"):
    # The scorer returns negated MAE, so flip the sign back for display.
    print("%s mean absolute error:: %0.4f" % (split, -scores[split + '_neg_mean_absolute_error'].mean()))

### Neural Network

In [None]:
# Standardise the features to zero mean and unit variance. The scaler is fitted
# on the training split only and then applied unchanged to the test split.
sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)

# MLP with four hidden layers of 128 units each (ReLU, Adam), seeded for repeatability.
mlp_reg = MLPRegressor(hidden_layer_sizes=(128,128,128,128),activation="relu" ,solver = 'adam', random_state=30034, max_iter=20000)\
    .fit(X_trainscaled, y_train)
y_pred = mlp_reg.predict(X_testscaled)
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric in its
# arguments, so passing the predictions first reports a different score.
print('r2 score:', r2_score(y_test, y_pred))
print('mean absolute error:', mean_absolute_error(y_test, y_pred))

# Network predictions next to the true hold-out values (rows keep y_test's index).
df_result = pd.DataFrame({'predicted_total_num_transaction': y_pred, 'true_total_num_transaction': y_test})
df_result


## Model for BNPL Revenue

### Linear Regression

In [None]:
# Predictors for the revenue model: every column that is not a target. Unlike the
# count models, the revenue_level dummies are kept here.
features = [col for col in train_df.columns if col not in labels]
X = train_df[features]
y = train_df['y_total_revenue']

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit ordinary least squares on the training part and report its parameters.
lr = LinearRegression()
fitted_model = lr.fit(X_train, y_train)
print(f"Intercept: {fitted_model.intercept_!s}")
coef = pd.DataFrame({"feature": features}).assign(coefficient=lr.coef_)
coef.head(10)


In [None]:
# Predict on the hold-out rows and place predictions next to the true values
# (rows keep y_test's index).
y_pred = fitted_model.predict(X_test)
result = y_test.to_frame(name='true_total_revenue')
result.insert(0, 'predicted_total_revenue', y_pred)
result

In [None]:
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric in its
# arguments, so passing the predictions first reports a different score.
print("r2 score:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

In [None]:
# 10-fold cross-validation of the linear model; report the mean of each metric
# over the folds, for both the training and the validation side.
cv_metrics = ('r2', 'neg_mean_absolute_error')
scores = cross_validate(lr, X, y, cv=10, scoring=cv_metrics, return_train_score=True)
for split in ("train", "test"):
    print("%s r2 score: %0.4f" % (split, scores[split + '_r2'].mean()))
for split in ("train", "test"):
    # The scorer returns negated MAE, so flip the sign back for display.
    print("%s mean absolute error:: %0.4f" % (split, -scores[split + '_neg_mean_absolute_error'].mean()))

### Neural Network

In [None]:
# Rebuild the revenue feature matrix and the same seeded 70/30 split, so this
# section does not depend on what the linear-regression cells left behind.
features = [col for col in train_df.columns if col not in labels]
X = train_df[features]
y = train_df['y_total_revenue']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [None]:
# scale train and test dataset in order to be standard normally distributed with zero mean
sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)
X_trainscaled.shape, X_testscaled.shape

In [None]:
# MLP with four hidden layers of 128 units each (ReLU, Adam), seeded for repeatability.
mlp_reg = MLPRegressor(
    hidden_layer_sizes=(128, 128, 128, 128),
    activation="relu",
    solver='adam',
    random_state=0,
    max_iter=20000,
)
mlp_reg.fit(X_trainscaled, y_train)
y_pred = mlp_reg.predict(X_testscaled)

In [None]:
# Network predictions next to the true hold-out values (rows keep y_test's index).
df_result = y_test.to_frame(name='true_total_revenue')
df_result.insert(0, 'predicted_total_revenue', y_pred)
df_result

In [None]:
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric in its
# arguments, so passing the predictions first reports a different score.
print('r2 score:', r2_score(y_test, y_pred))
print('mean absolute error:', mean_absolute_error(y_test, y_pred))

## Final Model
For predicting the **number of consumers and transactions** next year, Linear Regression and Neural Network produce similar results. **Linear regression** is chosen as the final model since it is more interpretable and takes less time to train.

However, when predicting **total revenue**, we choose **Neural Network** as it shows a better performance with increased r2 score and decreased mean absolute error.
