# 7. Predict the merchant features 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# df = pd.read_parquet('../data/curated/transaction/transaction_wcf_after_selection.parquet')

## 7.1 Feature Engineering 

In [2]:
def create_features(df):
    ''' 
    Create features for the model
    :param df: the dataframe to process
    :type df: pandas dataframe
    :return: the processed dataframe
    '''
    # Group by 'merchant_abn' and aggregate the data
    aggregated_df = df.groupby('merchant_abn').agg(
        total_num_consumer=('user_id', pd.Series.nunique),
        avg_dollar_value=('dollar_value', 'mean'),
        revenue_level=('revenue_level', 'first'),
        total_revenue=('dollar_value', 'sum'),
        postcode=('postcode', 'first'),
        tag=('area', 'first')
    ).reset_index()
    
    return aggregated_df

In [3]:
def create_labels(df):
    ''' 
    Create labels for the model
    :param df: the dataframe to process
    :type df: pandas dataframe
    :return: the processed dataframe
    '''
    label = df.groupby('merchant_abn').agg(
        y_total_num_consumer=('user_id', 'nunique'),  # Count distinct consumer_id
        y_total_revenue=('dollar_value', lambda x: (x * df['take_rate']).sum()),  # Calculate y_total_revenue
        y_total_transaction = ('order_datetime', 'count'), # Count sum of order 
        y_average_dollar=('dollar_value', 'mean') # Calculate the average dollar
    ).reset_index()
    
    return label

In [4]:
# Filter 'df' to create the train_df DataFrame
train_df = df[(df['order_datetime'] >= '2021-02-28') & (df['order_datetime'] < '2021-08-28')]
label_df = df[(df['order_datetime'] >= '2022-02-28') & (df['order_datetime'] < '2022-08-28')]
test_df = df[(df['order_datetime'] >= '2021-02-28') & (df['order_datetime'] < '2022-08-28')]

In [5]:
train_data = create_features(train_df)
train_label = create_labels(label_df)
test_data = create_features(test_df)

In [6]:
train_data = train_data.merge(train_label, on = "merchant_abn", how="left")
train_data.to_parquet("../data/curated/train/train_features.parquet")
test_data.to_parquet("../data/curated/test/test_features.parquet")

## 7.2 Feature prediction model 

### 7.2.1 Total number of consumers --- Linear regression

In [7]:
train_data = pd.read_parquet("../data/curated/train/train_features.parquet")
test_data = pd.read_parquet("../data/curated/test/test_features.parquet")

In [8]:
train_data = train_data.dropna()
train_data = train_data.set_index('merchant_abn')
cat_features = ['tag','revenue_level']
train_data = pd.get_dummies(train_data, columns=cat_features)

In [9]:
label = ['y_total_num_consumer', 'y_total_revenue', 'y_total_transaction', 'y_average_dollar']
X = train_data.drop(columns=label, axis=1)  
y = train_data["y_total_num_consumer"]  

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [10]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("R-squared:", r2)

Mean Absolute Error: 54.67758194555557
R-squared: 0.9961100108691724


In [11]:
result_df = pd.DataFrame({'True Consumer Number': y_test, 'Predicted Consumer Number': y_pred})
result_df

Unnamed: 0_level_0,True Consumer Number,Predicted Consumer Number
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1
62135.501338,272.0,256.051530
52242.018594,1932.0,1791.494881
87285.933136,116.0,133.607089
79070.032106,6.0,-20.709294
50667.745027,303.0,355.686518
...,...,...
44313.899037,226.0,251.899821
10702.078694,126.0,215.432093
95037.235406,5.0,52.372627
26341.565445,41.0,64.385139


### Predict 2023 consumer number 

In [12]:
test_data = test_data.dropna()
test_data = test_data.set_index('merchant_abn')
test_data = pd.get_dummies(test_data, columns=cat_features)

In [13]:
model = LinearRegression()
model.fit(X, y)
nextyear_pred_consumer_num = model.predict(test_data)
# set every number smaller than 0 to 0
nextyear_pred_consumer_num[nextyear_pred_consumer_num < 0] = 0

In [14]:
# add merchant_abn to the result
nextyear_pred_consumer_num = pd.DataFrame(nextyear_pred_consumer_num, columns=['Predicted Consumer Number'])
nextyear_pred_consumer_num['merchant_abn'] = test_data.index
nextyear_pred_consumer_num = nextyear_pred_consumer_num[['merchant_abn', 'Predicted Consumer Number']]
nextyear_pred_consumer_num

Unnamed: 0,merchant_abn,Predicted Consumer Number
0,10023.283211,3018.559464
1,10142.254217,2820.403593
2,10187.291046,399.356656
3,10192.359162,389.869413
4,10206.519221,7883.547850
...,...,...
3677,99938.978286,12535.662071
3678,99974.311663,208.931115
3679,99976.658300,15504.729451
3680,99987.905598,207.972488


### 7.2.2 Model of total revenue --- Random forest regression

In [15]:
X = train_data.drop(columns=label, axis=1)  # Features (independent variables)
y = train_data["y_total_revenue"]  # Target variable (dependent variable)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Aboslute Error: {mae:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

Mean Aboslute Error: 103687.21
R-squared (R2) Score: 0.97


In [16]:
result_df = pd.DataFrame({'True Revenue': y_test, 'Predicted Revenue': y_pred})
result_df

Unnamed: 0_level_0,True Revenue,Predicted Revenue
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1
62135.501338,8.774485e+05,856013.614654
52242.018594,7.836687e+05,704746.137469
87285.933136,2.036888e+04,23782.393738
79070.032106,5.440369e+05,333843.545115
50667.745027,5.258189e+05,589498.053432
...,...,...
44313.899037,1.052017e+06,949044.982389
10702.078694,7.541902e+04,92701.774562
95037.235406,3.956640e+04,52731.885551
26341.565445,1.314843e+05,127020.171359


### Predict 2023 total revenue

In [17]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)
nextyear_pred_revenue = rf.predict(test_data)
nextyear_pred_revenue[nextyear_pred_revenue < 0] = 0

In [18]:
# add merchant_abn to the result
nextyear_pred_revenue = pd.DataFrame(nextyear_pred_revenue, columns=['Predicted revenue'])
nextyear_pred_revenue['merchant_abn'] = test_data.index
nextyear_pred_revenue = nextyear_pred_revenue[['merchant_abn', 'Predicted revenue']]
nextyear_pred_revenue

Unnamed: 0,merchant_abn,Predicted revenue
0,10023.283211,1.533194e+06
1,10142.254217,4.741837e+05
2,10187.291046,1.842250e+05
3,10192.359162,1.134313e+06
4,10206.519221,2.393028e+06
...,...,...
3677,99938.978286,2.377972e+06
3678,99974.311663,1.649007e+05
3679,99976.658300,1.900765e+07
3680,99987.905598,4.272877e+05


### 7.2.3 Model of total transaction --- XGB boost

In [19]:
X = train_data.drop(columns=label, axis=1)  # Features (independent variables)
y = train_data["y_total_transaction"]  # Target variable (dependent variable)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a DMatrix for training and testing
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define hyperparameters for XGBoost
params = {
    'objective': 'reg:squarederror',  # Use regression for the objective function
    'eval_metric': 'rmse',            # Evaluation metric (Root Mean Squared Error)
    'learning_rate': 0.1,             # Learning rate
    'max_depth': 6,                   # Maximum depth of trees
    'n_estimators': 100               # Number of boosting rounds
}

# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=params['n_estimators'])

# Make predictions on the test set
y_pred = model.predict(dtest)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Absolute Error (RMSE): {mae:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")


Parameters: { "n_estimators" } are not used.

Root Mean Absolute Error (RMSE): 35.82
R-squared (R2) Score: 1.00


### Predict 2023 total transaction amount

In [20]:
# make a dmatrix for test_data
dtest = xgb.DMatrix(test_data)
nextyear_pred_tran = model.predict(dtest)
nextyear_pred_tran[nextyear_pred_tran < 0] = 0

In [21]:
# add merchant_abn to the result
nextyear_pred_tran = pd.DataFrame(nextyear_pred_tran, columns=['Predicted Transaction Number'])
nextyear_pred_tran['merchant_abn'] = test_data.index
nextyear_pred_tran = nextyear_pred_tran[['merchant_abn', 'Predicted Transaction Number']]
nextyear_pred_tran

Unnamed: 0,merchant_abn,Predicted Transaction Number
0,10023.283211,3429.025635
1,10142.254217,3191.772461
2,10187.291046,346.453949
3,10192.359162,398.589020
4,10206.519221,9768.631836
...,...,...
3677,99938.978286,19136.537109
3678,99974.311663,138.545303
3679,99976.658300,24123.587891
3680,99987.905598,188.055267


### 7.2.4 Model of average dollar value --- Random forest regression

In [22]:
X = train_data.drop(columns=label, axis=1)  # Features (independent variables)
y = train_data["y_average_dollar"]  # Target variable (dependent variable)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Aboslute Error: {mse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

Mean Aboslute Error: 175.46
R-squared (R2) Score: 0.88


In [23]:
result_df = pd.DataFrame({'True avg dollar': y_test, 'Predicted avg dollar': y_pred})
result_df

Unnamed: 0_level_0,True avg dollar,Predicted avg dollar
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1
62135.501338,510.429381,516.879734
52242.018594,87.571020,88.679514
87285.933136,105.145990,97.125175
79070.032106,13140.987889,11717.166856
50667.745027,470.018342,475.962336
...,...,...
44313.899037,1036.735898,1090.545266
10702.078694,99.806813,87.410721
95037.235406,2082.442025,3177.803742
26341.565445,544.470860,524.883814


### Predict 2023 average dollar value

In [24]:
model.fit(X, y)
nextyear_pred_avg_dollar = model.predict(test_data)
nextyear_pred_avg_dollar[nextyear_pred_avg_dollar < 0] = 0

In [25]:
# add merchant_abn to the result
nextyear_pred_avg_dollar = pd.DataFrame(nextyear_pred_avg_dollar, columns=['Predicted Avg Dollar'])
nextyear_pred_avg_dollar['merchant_abn'] = test_data.index
nextyear_pred_avg_dollar = nextyear_pred_avg_dollar[['merchant_abn', 'Predicted Avg Dollar']]

nextyear_pred_avg_dollar

Unnamed: 0,merchant_abn,Predicted Avg Dollar
0,10023.283211,215.981013
1,10142.254217,38.716586
2,10187.291046,116.774819
3,10192.359162,479.917474
4,10206.519221,37.213071
...,...,...
3677,99938.978286,30.161008
3678,99974.311663,306.845980
3679,99976.658300,150.635490
3680,99987.905598,363.406096


In [26]:
# file saving for further prediction
nextyear_pred_avg_dollar.to_csv("../data/predicted/nextyear_pred_avg_dollar.csv")
nextyear_pred_tran.to_csv("../data/predicted/nextyear_pred_tran.csv")
nextyear_pred_revenue.to_csv("../data/predicted/nextyear_pred_revenue.csv")
nextyear_pred_consumer_num.to_csv("../data/predicted/nextyear_pred_consumer_num.csv")