In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = (
    SparkSession.builder.appName("ADS project 2")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/02 20:12:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [71]:
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [65]:
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [68]:
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC, LinearRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
#import lightgbm as lgb

In [11]:
pd.options.display.max_columns = 100

In [97]:
df = pd.read_parquet('../data/curated/merchant_consumer_abs')
df

FileNotFoundError: [Errno 2] No such file or directory: '../data/curated/merchant_consumer_abs'

### get total transactions and revenue by each merchant, with their take rate

In [73]:
df_agg = df.groupby(by=['merchant_name']).agg(
    total_transaction=('postcode', 'count'),
    total_revenue=('dollar_value', np.sum),
    take_rate=('take_rate', np.max)
    ).reset_index()


In [74]:
df_agg

Unnamed: 0,merchant_name,total_transaction,total_revenue,take_rate
0,A Aliquet Ltd,923,195789.626317,3.87
1,A Arcu Industries,2272,834906.529502,3.00
2,A Arcu Sed Company,1155,150267.314855,2.38
3,A Arcu Sed Corporation,18,10786.692787,3.06
4,A Associates,68,24638.615042,4.95
...,...,...,...,...
3886,Vulputate Ullamcorper Magna LLC,89,33455.109989,2.51
3887,Vulputate Ullamcorper Magna Ltd,74,26325.078990,6.18
3888,Vulputate Velit Eu Incorporated,44,20997.614845,4.76
3889,Vulputate Velit Eu Limited,11871,229909.434371,3.09


### select columns that will be used in the model

In [75]:
df2 = df.groupby(by=['merchant_name']).agg(
    total_earners=('total_earners', np.mean),
    median_age=('median_age', np.mean),
    income_median=('income_median', np.mean),
    population_density=('persons/km2', np.mean),
    category=('category', np.max)
    ).reset_index()
df2

Unnamed: 0,merchant_name,total_earners,median_age,income_median,population_density,category
0,A Aliquet Ltd,4862.803994,43.731275,46483.421039,586.478990,2
1,A Arcu Industries,4842.337178,43.029102,46000.305374,581.978543,0
2,A Arcu Sed Company,5004.380529,43.549603,46960.156182,621.150344,2
3,A Arcu Sed Corporation,4731.045332,40.962152,47077.062398,508.682908,0
4,A Associates,4948.447635,44.993200,43272.166605,374.105159,2
...,...,...,...,...,...,...
3886,Vulputate Ullamcorper Magna LLC,5327.727326,41.833457,45620.722953,553.874939,0
3887,Vulputate Ullamcorper Magna Ltd,4604.493285,42.786520,46107.888997,525.632756,2
3888,Vulputate Velit Eu Incorporated,4648.454131,42.355864,45383.783363,457.262243,2
3889,Vulputate Velit Eu Limited,4904.114009,43.258805,46115.231565,570.892114,2


In [76]:
df3 = df.groupby('merchant_name')['consumer_state'].nunique()
df3.to_frame()
df3.reset_index()

Unnamed: 0,merchant_name,consumer_state
0,A Aliquet Ltd,8
1,A Arcu Industries,8
2,A Arcu Sed Company,8
3,A Arcu Sed Corporation,7
4,A Associates,7
...,...,...
3886,Vulputate Ullamcorper Magna LLC,6
3887,Vulputate Ullamcorper Magna Ltd,7
3888,Vulputate Velit Eu Incorporated,7
3889,Vulputate Velit Eu Limited,8


In [77]:
df2 = df2.merge(df3, how='inner', on=['merchant_name'])
df2

Unnamed: 0,merchant_name,total_earners,median_age,income_median,population_density,category,consumer_state
0,A Aliquet Ltd,4862.803994,43.731275,46483.421039,586.478990,2,8
1,A Arcu Industries,4842.337178,43.029102,46000.305374,581.978543,0,8
2,A Arcu Sed Company,5004.380529,43.549603,46960.156182,621.150344,2,8
3,A Arcu Sed Corporation,4731.045332,40.962152,47077.062398,508.682908,0,7
4,A Associates,4948.447635,44.993200,43272.166605,374.105159,2,7
...,...,...,...,...,...,...,...
3886,Vulputate Ullamcorper Magna LLC,5327.727326,41.833457,45620.722953,553.874939,0,6
3887,Vulputate Ullamcorper Magna Ltd,4604.493285,42.786520,46107.888997,525.632756,2,7
3888,Vulputate Velit Eu Incorporated,4648.454131,42.355864,45383.783363,457.262243,2,7
3889,Vulputate Velit Eu Limited,4904.114009,43.258805,46115.231565,570.892114,2,8


### calculate transaction_per_day and revenue (taken) per day

In [78]:
TIME_SPAN = int(365*1.5)
df_agg['transaction_per_day'] = df_agg['total_transaction'] / TIME_SPAN
df_agg['revenue_per_day'] = df_agg['total_revenue'] / TIME_SPAN
df_agg['revenue_taken'] = df_agg['revenue_per_day'] * (df_agg['take_rate'] / 100)
df_agg

Unnamed: 0,merchant_name,total_transaction,total_revenue,take_rate,transaction_per_day,revenue_per_day,revenue_taken
0,A Aliquet Ltd,923,195789.626317,3.87,1.687386,357.933503,13.852027
1,A Arcu Industries,2272,834906.529502,3.00,4.153565,1526.337348,45.790120
2,A Arcu Sed Company,1155,150267.314855,2.38,2.111517,274.711727,6.538139
3,A Arcu Sed Corporation,18,10786.692787,3.06,0.032907,19.719731,0.603424
4,A Associates,68,24638.615042,4.95,0.124314,45.043172,2.229637
...,...,...,...,...,...,...,...
3886,Vulputate Ullamcorper Magna LLC,89,33455.109989,2.51,0.162706,61.161079,1.535143
3887,Vulputate Ullamcorper Magna Ltd,74,26325.078990,6.18,0.135283,48.126287,2.974205
3888,Vulputate Velit Eu Incorporated,44,20997.614845,4.76,0.080439,38.386864,1.827215
3889,Vulputate Velit Eu Limited,11871,229909.434371,3.09,21.702011,420.309752,12.987571


In [79]:
df_agg = df_agg.drop(columns=['total_transaction', 'total_revenue', 'take_rate', 'revenue_per_day'])

In [90]:
df_final = df_agg.merge(df2, on='merchant_name', how='inner')
df_final

Unnamed: 0,merchant_name,transaction_per_day,revenue_taken,total_earners,median_age,income_median,population_density,category,consumer_state
0,A Aliquet Ltd,1.687386,13.852027,4862.803994,43.731275,46483.421039,586.478990,2,8
1,A Arcu Industries,4.153565,45.790120,4842.337178,43.029102,46000.305374,581.978543,0,8
2,A Arcu Sed Company,2.111517,6.538139,5004.380529,43.549603,46960.156182,621.150344,2,8
3,A Arcu Sed Corporation,0.032907,0.603424,4731.045332,40.962152,47077.062398,508.682908,0,7
4,A Associates,0.124314,2.229637,4948.447635,44.993200,43272.166605,374.105159,2,7
...,...,...,...,...,...,...,...,...,...
3886,Vulputate Ullamcorper Magna LLC,0.162706,1.535143,5327.727326,41.833457,45620.722953,553.874939,0,6
3887,Vulputate Ullamcorper Magna Ltd,0.135283,2.974205,4604.493285,42.786520,46107.888997,525.632756,2,7
3888,Vulputate Velit Eu Incorporated,0.080439,1.827215,4648.454131,42.355864,45383.783363,457.262243,2,7
3889,Vulputate Velit Eu Limited,21.702011,12.987571,4904.114009,43.258805,46115.231565,570.892114,2,8


In [86]:
#df_final = df_final.drop(columns=['merchant_name'])
df_final['category'] = df_final['category'].astype('object')
df_final['consumer_state'] = df_final['consumer_state'].astype('object')
df_final

Unnamed: 0,merchant_name,transaction_per_day,revenue_taken,total_earners,median_age,income_median,population_density,category,consumer_state
0,A Aliquet Ltd,1.687386,13.852027,4862.803994,43.731275,46483.421039,586.478990,2,8
1,A Arcu Industries,4.153565,45.790120,4842.337178,43.029102,46000.305374,581.978543,0,8
2,A Arcu Sed Company,2.111517,6.538139,5004.380529,43.549603,46960.156182,621.150344,2,8
3,A Arcu Sed Corporation,0.032907,0.603424,4731.045332,40.962152,47077.062398,508.682908,0,7
4,A Associates,0.124314,2.229637,4948.447635,44.993200,43272.166605,374.105159,2,7
...,...,...,...,...,...,...,...,...,...
3886,Vulputate Ullamcorper Magna LLC,0.162706,1.535143,5327.727326,41.833457,45620.722953,553.874939,0,6
3887,Vulputate Ullamcorper Magna Ltd,0.135283,2.974205,4604.493285,42.786520,46107.888997,525.632756,2,7
3888,Vulputate Velit Eu Incorporated,0.080439,1.827215,4648.454131,42.355864,45383.783363,457.262243,2,7
3889,Vulputate Velit Eu Limited,21.702011,12.987571,4904.114009,43.258805,46115.231565,570.892114,2,8


In [91]:
df_final.dtypes

merchant_name           object
transaction_per_day    float64
revenue_taken          float64
total_earners          float64
median_age             float64
income_median          float64
population_density     float64
category                 int32
consumer_state           int64
dtype: object

In [88]:
df_final = pd.get_dummies(df_final)

In [96]:
df_final.to_csv('../data/curated/transaction & revenue.csv', encoding='utf-8', index=False)

In [None]:
# transaction数量和revenue的权重暂定1:2
# df_final['score'] = df_final['transaction_per_day']


In [66]:
xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
rmse= np.mean(np.sqrt(-cross_val_score(xgb, df_final.loc[:, df_final.columns!='revenue_taken'], df_final['revenue_taken'], scoring="neg_mean_squared_error", cv = 5)))
print(rmse)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be 

In [69]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005))
rmse= np.mean(np.sqrt(-cross_val_score(lasso, df_final.loc[:, df_final.columns!='revenue_taken'], df_final['revenue_taken'], scoring="neg_mean_squared_error", cv = 5)))
print(rmse)

57.59250500309882


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [70]:
lr = make_pipeline(RobustScaler(), LinearRegression())
rmse= np.mean(np.sqrt(-cross_val_score(lr, df_final.loc[:, df_final.columns!='revenue_taken'], df_final['revenue_taken'], scoring="neg_mean_squared_error", cv = 5)))
print(rmse)

57.59256592455597


22/10/03 06:27:05 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 961604 ms exceeds timeout 120000 ms
22/10/03 06:27:05 WARN SparkContext: Killing executors is not supported by current scheduler.
