In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [68]:
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC, LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

#import lightgbm as lgb

In [3]:
pd.options.display.max_columns = 100

In [4]:
df = pd.read_parquet('../data/curated/merchant_consumer_abs')
df

Unnamed: 0,postcode,total_earners,median_age,income_sum,income_median,income_mean,2021_population,km2,persons/km2,merchant_name,revenue_level,user_id,order_datetime,products,take_rate,category,dollar_value,order_year,order_month,order_day,consumer,consumer_address,consumer_state,consumer_postcode,consumer_gender,fraud_group
0,0800,5631.999974,33.0,4.206090e+08,57789.000082,74682.000125,7678.999968,3.2,2419.999994,Elit Sed Consequat Associates,a,10413,2022-04-27,artist supply and craft shops,5.89,art and gifts,375.167732,2022,4,27,Kelly Clayton,4211 Rodney Tunnel Suite 525,NT,0800,Female,0
1,0800,5631.999974,33.0,4.206090e+08,57789.000082,74682.000125,7678.999968,3.2,2419.999994,Elit Sed Consequat Associates,a,22246,2022-10-07,artist supply and craft shops,5.89,art and gifts,617.879131,2022,10,7,Corey Estrada,1703 Boyd Shore,NT,0800,Male,0
2,0800,5631.999974,33.0,4.206090e+08,57789.000082,74682.000125,7678.999968,3.2,2419.999994,Elit Sed Consequat Associates,a,2681,2022-02-28,artist supply and craft shops,5.89,art and gifts,766.507707,2022,2,28,Nicole Bishop,467 Robert Islands Apt. 834,NT,0800,Female,0
3,0800,5631.999974,33.0,4.206090e+08,57789.000082,74682.000125,7678.999968,3.2,2419.999994,Malesuada Vel Ltd,b,13454,2021-07-24,books periodicals and newspapers,3.56,books and music,258.865959,2021,7,24,Natalie Herrera,88798 Saunders Hills Apt. 945,NT,0800,Female,0
4,0800,5631.999974,33.0,4.206090e+08,57789.000082,74682.000125,7678.999968,3.2,2419.999994,Varius Orci Institute,a,10146,2021-08-08,tent and awning shops,6.30,outdoors,4.159038,2021,8,8,Shannon Mann,00817 Owens Circle,NT,0800,Female,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11087920,7470,1985.000000,45.0,1.157059e+08,49789.000000,58290.000000,4373.000000,3931.6,1.100000,Ipsum Primis Associates,a,7056,2021-07-17,watch clock and jewelry repair shops,6.94,fashion and accessories,56.702387,2021,7,17,Shannon Petty,386 Aaron Manors,TAS,7470,Female,0
11087921,7470,1985.000000,45.0,1.157059e+08,49789.000000,58290.000000,4373.000000,3931.6,1.100000,Ipsum Primis Associates,a,18868,2022-03-11,watch clock and jewelry repair shops,6.94,fashion and accessories,84.885059,2022,3,11,Michael Bush,47924 Jessica Drive Apt. 344,TAS,7470,Male,0
11087922,7470,1985.000000,45.0,1.157059e+08,49789.000000,58290.000000,4373.000000,3931.6,1.100000,Lorem Foundation,a,2243,2021-05-12,digital goods books movies music,6.55,books and music,41.906740,2021,5,12,Kevin Brown,1861 Tina Junction Suite 680,TAS,7470,Undisclosed,0
11087923,7470,1985.000000,45.0,1.157059e+08,49789.000000,58290.000000,4373.000000,3931.6,1.100000,Lorem Foundation,a,2243,2022-01-06,digital goods books movies music,6.55,books and music,9.540977,2022,1,6,Kevin Brown,1861 Tina Junction Suite 680,TAS,7470,Undisclosed,0


### get total transactions and revenue by each merchant, with their take rate

In [5]:
df_agg = df.groupby(by=['merchant_name']).agg(
    total_transaction=('postcode', 'count'),
    total_revenue=('dollar_value', np.sum),
    take_rate=('take_rate', np.max)
    ).reset_index()


In [6]:
df_agg

Unnamed: 0,merchant_name,total_transaction,total_revenue,take_rate
0,A Aliquet Ltd,923,195789.626317,3.87
1,A Arcu Industries,2272,834906.529502,3.00
2,A Arcu Sed Company,1155,150267.314855,2.38
3,A Arcu Sed Corporation,18,10786.692787,3.06
4,A Associates,68,24638.615042,4.95
...,...,...,...,...
3888,Vulputate Ullamcorper Magna LLC,89,33455.109989,2.51
3889,Vulputate Ullamcorper Magna Ltd,74,26325.078990,6.18
3890,Vulputate Velit Eu Incorporated,44,20997.614845,4.76
3891,Vulputate Velit Eu Limited,11871,229909.434371,3.09


### select columns that will be used in the model

In [7]:
df2 = df.groupby(by=['merchant_name']).agg(
    total_earners=('total_earners', np.mean),
    median_age=('median_age', np.mean),
    income_median=('income_median', np.mean),
    population_density=('persons/km2', np.mean),
    category=('category', np.max)
    ).reset_index()
df2

Unnamed: 0,merchant_name,total_earners,median_age,income_median,population_density,category
0,A Aliquet Ltd,4862.803994,43.731275,46483.421039,586.478990,home and technology
1,A Arcu Industries,4842.337178,43.029102,46000.305374,581.978543,outdoors
2,A Arcu Sed Company,5004.380529,43.549603,46960.156182,621.150344,fashion and accessories
3,A Arcu Sed Corporation,4731.045332,40.962152,47077.062398,508.682908,books and music
4,A Associates,4948.447635,44.993200,43272.166605,374.105159,fashion and accessories
...,...,...,...,...,...,...
3888,Vulputate Ullamcorper Magna LLC,5327.727326,41.833457,45620.722953,553.874939,outdoors
3889,Vulputate Ullamcorper Magna Ltd,4604.493285,42.786520,46107.888997,525.632756,home and technology
3890,Vulputate Velit Eu Incorporated,4648.454131,42.355864,45383.783363,457.262243,fashion and accessories
3891,Vulputate Velit Eu Limited,4904.114009,43.258805,46115.231565,570.892114,outdoors


In [8]:
df3 = df.groupby('merchant_name')['consumer_state'].nunique()
df3.to_frame()
df3.reset_index()

Unnamed: 0,merchant_name,consumer_state
0,A Aliquet Ltd,8
1,A Arcu Industries,8
2,A Arcu Sed Company,8
3,A Arcu Sed Corporation,7
4,A Associates,7
...,...,...
3888,Vulputate Ullamcorper Magna LLC,6
3889,Vulputate Ullamcorper Magna Ltd,7
3890,Vulputate Velit Eu Incorporated,7
3891,Vulputate Velit Eu Limited,8


In [9]:
df2 = df2.merge(df3, how='inner', on=['merchant_name'])
df2

Unnamed: 0,merchant_name,total_earners,median_age,income_median,population_density,category,consumer_state
0,A Aliquet Ltd,4862.803994,43.731275,46483.421039,586.478990,home and technology,8
1,A Arcu Industries,4842.337178,43.029102,46000.305374,581.978543,outdoors,8
2,A Arcu Sed Company,5004.380529,43.549603,46960.156182,621.150344,fashion and accessories,8
3,A Arcu Sed Corporation,4731.045332,40.962152,47077.062398,508.682908,books and music,7
4,A Associates,4948.447635,44.993200,43272.166605,374.105159,fashion and accessories,7
...,...,...,...,...,...,...,...
3888,Vulputate Ullamcorper Magna LLC,5327.727326,41.833457,45620.722953,553.874939,outdoors,6
3889,Vulputate Ullamcorper Magna Ltd,4604.493285,42.786520,46107.888997,525.632756,home and technology,7
3890,Vulputate Velit Eu Incorporated,4648.454131,42.355864,45383.783363,457.262243,fashion and accessories,7
3891,Vulputate Velit Eu Limited,4904.114009,43.258805,46115.231565,570.892114,outdoors,8


### calculate transaction_per_day and revenue (taken) per day

In [10]:
TIME_SPAN = int(365*1.5)
df_agg['transaction_per_day'] = df_agg['total_transaction'] / TIME_SPAN
df_agg['revenue_per_day'] = df_agg['total_revenue'] / TIME_SPAN
df_agg['revenue_taken'] = df_agg['revenue_per_day'] * (df_agg['take_rate'] / 100)
df_agg

Unnamed: 0,merchant_name,total_transaction,total_revenue,take_rate,transaction_per_day,revenue_per_day,revenue_taken
0,A Aliquet Ltd,923,195789.626317,3.87,1.687386,357.933503,13.852027
1,A Arcu Industries,2272,834906.529502,3.00,4.153565,1526.337348,45.790120
2,A Arcu Sed Company,1155,150267.314855,2.38,2.111517,274.711727,6.538139
3,A Arcu Sed Corporation,18,10786.692787,3.06,0.032907,19.719731,0.603424
4,A Associates,68,24638.615042,4.95,0.124314,45.043172,2.229637
...,...,...,...,...,...,...,...
3888,Vulputate Ullamcorper Magna LLC,89,33455.109989,2.51,0.162706,61.161079,1.535143
3889,Vulputate Ullamcorper Magna Ltd,74,26325.078990,6.18,0.135283,48.126287,2.974205
3890,Vulputate Velit Eu Incorporated,44,20997.614845,4.76,0.080439,38.386864,1.827215
3891,Vulputate Velit Eu Limited,11871,229909.434371,3.09,21.702011,420.309752,12.987571


In [11]:
df_agg = df_agg.drop(columns=['total_transaction', 'total_revenue', 'take_rate', 'revenue_per_day'])

In [12]:
df_final = df_agg.merge(df2, on='merchant_name', how='inner')
df_final

Unnamed: 0,merchant_name,transaction_per_day,revenue_taken,total_earners,median_age,income_median,population_density,category,consumer_state
0,A Aliquet Ltd,1.687386,13.852027,4862.803994,43.731275,46483.421039,586.478990,home and technology,8
1,A Arcu Industries,4.153565,45.790120,4842.337178,43.029102,46000.305374,581.978543,outdoors,8
2,A Arcu Sed Company,2.111517,6.538139,5004.380529,43.549603,46960.156182,621.150344,fashion and accessories,8
3,A Arcu Sed Corporation,0.032907,0.603424,4731.045332,40.962152,47077.062398,508.682908,books and music,7
4,A Associates,0.124314,2.229637,4948.447635,44.993200,43272.166605,374.105159,fashion and accessories,7
...,...,...,...,...,...,...,...,...,...
3888,Vulputate Ullamcorper Magna LLC,0.162706,1.535143,5327.727326,41.833457,45620.722953,553.874939,outdoors,6
3889,Vulputate Ullamcorper Magna Ltd,0.135283,2.974205,4604.493285,42.786520,46107.888997,525.632756,home and technology,7
3890,Vulputate Velit Eu Incorporated,0.080439,1.827215,4648.454131,42.355864,45383.783363,457.262243,fashion and accessories,7
3891,Vulputate Velit Eu Limited,21.702011,12.987571,4904.114009,43.258805,46115.231565,570.892114,outdoors,8


In [13]:
#df_final = df_final.drop(columns=['merchant_name'])
df_final['category'] = df_final['category'].astype('object')
df_final['consumer_state'] = df_final['consumer_state'].astype('object')
df_final

Unnamed: 0,merchant_name,transaction_per_day,revenue_taken,total_earners,median_age,income_median,population_density,category,consumer_state
0,A Aliquet Ltd,1.687386,13.852027,4862.803994,43.731275,46483.421039,586.478990,home and technology,8
1,A Arcu Industries,4.153565,45.790120,4842.337178,43.029102,46000.305374,581.978543,outdoors,8
2,A Arcu Sed Company,2.111517,6.538139,5004.380529,43.549603,46960.156182,621.150344,fashion and accessories,8
3,A Arcu Sed Corporation,0.032907,0.603424,4731.045332,40.962152,47077.062398,508.682908,books and music,7
4,A Associates,0.124314,2.229637,4948.447635,44.993200,43272.166605,374.105159,fashion and accessories,7
...,...,...,...,...,...,...,...,...,...
3888,Vulputate Ullamcorper Magna LLC,0.162706,1.535143,5327.727326,41.833457,45620.722953,553.874939,outdoors,6
3889,Vulputate Ullamcorper Magna Ltd,0.135283,2.974205,4604.493285,42.786520,46107.888997,525.632756,home and technology,7
3890,Vulputate Velit Eu Incorporated,0.080439,1.827215,4648.454131,42.355864,45383.783363,457.262243,fashion and accessories,7
3891,Vulputate Velit Eu Limited,21.702011,12.987571,4904.114009,43.258805,46115.231565,570.892114,outdoors,8


In [14]:
df_final.dtypes

merchant_name           object
transaction_per_day    float64
revenue_taken          float64
total_earners          float64
median_age             float64
income_median          float64
population_density     float64
category                object
consumer_state          object
dtype: object

In [15]:
df_final = pd.get_dummies(df_final)

In [96]:
df_final.to_csv('../data/curated/transaction & revenue.csv', encoding='utf-8', index=False)

In [None]:
# transaction数量和revenue的权重暂定1:2
# df_final['score'] = df_final['transaction_per_day']


In [105]:
xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
r2= np.mean(np.sqrt(-cross_val_score(xgb, df_final.loc[:, df_final.columns!='revenue_taken'], df_final['revenue_taken'], scoring="r2", cv = 5)))
print(r2)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be 

KeyboardInterrupt: 

In [102]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

In [69]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005))
rmse= np.mean(np.sqrt(-cross_val_score(lasso, df_final.loc[:, df_final.columns!='revenue_taken'], df_final['revenue_taken'], scoring="neg_mean_squared_error", cv = 5)))
print(rmse)

57.59250500309882


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [70]:
lr = make_pipeline(RobustScaler(), LinearRegression())
rmse= np.mean(np.sqrt(-cross_val_score(lr, df_final.loc[:, df_final.columns!='revenue_taken'], df_final['revenue_taken'], scoring="neg_mean_squared_error", cv = 5)))
print(rmse)

57.59256592455597


22/10/03 06:27:05 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 961604 ms exceeds timeout 120000 ms
22/10/03 06:27:05 WARN SparkContext: Killing executors is not supported by current scheduler.
