In [1]:
import sys
sys.path.append('../scripts/')
from read_utils import read_file, create_folder, temp_record_query, temp_record_sdf

In [2]:
import pandas as pd
import numpy as np

### read data

In [3]:
data = pd.read_parquet('../data/curated/data/')

In [4]:
data['order_datetime'] = data['order_datetime'].astype('str')
data = data[data['order_datetime']<='2022-08-28']

In [5]:
data = data.dropna(subset=['New cases / day'])
data['New cases / day'] = data['New cases / day'].apply(lambda x: float(x.replace(',', '')))
data['postcode'] = data['postcode'].astype('int')
data['total'] = data['total'].astype('float64')
# data['merchant_abn'] = data['merchant_abn'].astype('float64')

In [6]:
data = data.sort_values(['merchant_abn', 'order_datetime'])
grouped = data.groupby('merchant_abn')

In [7]:
genders = list(set(list(data.gender)))
states = list(set(list(data.state)))
print(genders)
print(states)

['Female', 'Undisclosed', 'Male']
['SA', 'NSW', 'TAS', 'WA', 'QLD', 'ACT', 'NT', 'VIC']


In [8]:
data.head()

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,take_rate,type,postcode,consumer_id,...,state,gender,POA_CODE21,2021_popu,latitude,longitude,New cases / day,month,total,order_datetime
4255,10285,10023283211,311.872869,e41d1fb7-d7b9-4758-9c20-1d34337a312d,Felis Limited,"furniture, home furnishings and equipment shop...",0.18,e,6479,191378,...,WA,Female,6479,243.959797,118.396935,-30.651794,5.0,2021-02-01,30332.5,2021-02-28
5409,714,10023283211,279.618513,2bfa1772-2c93-4aaa-8a32-7e393f2c4267,Felis Limited,"furniture, home furnishings and equipment shop...",0.18,e,3555,642522,...,VIC,Female,3555,21059.999858,144.242651,-36.806383,5.0,2021-02-01,30332.5,2021-02-28
13719,5568,10023283211,110.075256,078410a3-9595-44ba-8714-2effd0e91231,Felis Limited,"furniture, home furnishings and equipment shop...",0.18,e,2154,501304,...,NSW,Female,2154,41084.20779,150.99988,-33.727887,5.0,2021-02-01,30332.5,2021-02-28
24515,10144,10023283211,195.744544,6a9f5c53-b014-4d98-bb5b-90d06263acc3,Felis Limited,"furniture, home furnishings and equipment shop...",0.18,e,5577,474931,...,SA,Female,5577,1774.54261,137.245303,-35.065603,8.0,2021-03-01,30725.9,2021-03-01
25417,23597,10023283211,189.251521,1f73f73e-09c3-484a-a724-c67e32db65fe,Felis Limited,"furniture, home furnishings and equipment shop...",0.18,e,3660,1314310,...,VIC,Male,3660,8067.650238,145.264868,-37.056397,8.0,2021-03-01,30725.9,2021-03-01


In [9]:
all_features = []
for abn, df in grouped:
    feature_list = [abn]
    
    # dollar value
    feature_list.append(np.mean(df.dollar_value))
    feature_list.append(np.min(df.dollar_value))
    feature_list.append(np.max(df.dollar_value))

    # ncd
    feature_list.append(np.mean(df['New cases / day']))

    # gender
    temp = len(df)
    feature_list.extend(len(df[df.gender == g])/temp for g in genders)
    
    # state
    feature_list.extend(len(df[df.state == s])/temp for s in states)
    
    # timeline
    feature_list.append((pd.Timestamp(list(df.iloc[-1:]['order_datetime'])[0]) - pd.Timestamp(list(df.iloc[:1]['order_datetime'])[0])).days)

    # print(feature_list)
    all_features.append(feature_list)

In [10]:
new_data = pd.DataFrame(all_features, columns=['merchant_abn', 'dollar_mean', 'dollar_min', 'dollar_max', 'ncd_mean', \
    genders[0]+'_count', genders[1]+'_count', genders[2]+'_count', \
    states[0]+'_count', states[1]+'_count', states[2]+'_count', states[3]+'_count', states[4]+'_count', states[5]+'_count', \
    states[6]+'_count', states[7]+'_count', 'time_span'])

In [11]:
new_data

Unnamed: 0,merchant_abn,dollar_mean,dollar_min,dollar_max,ncd_mean,Female_count,Undisclosed_count,Male_count,SA_count,NSW_count,TAS_count,WA_count,QLD_count,ACT_count,NT_count,VIC_count,time_span
0,10023283211,215.907357,4.016398,1306.809868,17671.272822,0.448084,0.101045,0.450871,0.106969,0.293380,0.035192,0.159582,0.148084,0.008362,0.014634,0.233798,546
1,10142254217,38.887890,0.010086,302.116185,17868.966892,0.430399,0.104590,0.465011,0.103461,0.281415,0.041761,0.162152,0.142212,0.009782,0.016178,0.243040,546
2,10165489824,12785.268849,4050.093665,20636.897020,16378.000000,0.250000,0.500000,0.250000,0.250000,0.250000,0.000000,0.250000,0.000000,0.000000,0.000000,0.250000,249
3,10187291046,118.394949,1.007069,544.979881,20445.269360,0.451178,0.114478,0.434343,0.141414,0.259259,0.043771,0.161616,0.161616,0.006734,0.010101,0.215488,540
4,10192359162,459.328629,10.364227,2314.116256,16838.854103,0.431611,0.085106,0.483283,0.112462,0.279635,0.033435,0.164134,0.161094,0.009119,0.009119,0.231003,544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4417,99974311662,286.284404,19.023977,1216.210828,16049.191304,0.469565,0.147826,0.382609,0.113043,0.295652,0.034783,0.121739,0.173913,0.008696,0.000000,0.252174,538
4418,99976658299,150.076506,3.078433,671.566546,17621.841037,0.443165,0.098736,0.458099,0.113524,0.287618,0.035482,0.157938,0.149885,0.009859,0.015716,0.229977,546
4419,99987905597,357.227525,57.274540,1676.478636,21472.900000,0.443750,0.075000,0.481250,0.162500,0.275000,0.037500,0.143750,0.150000,0.006250,0.012500,0.212500,534
4420,99989036621,56552.394443,56552.394443,56552.394443,2103.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0


### merchant data for train y

In [12]:
merchants = pd.read_parquet('../data/curated/merchants_data.parquet')

In [13]:
merchants.head()

Unnamed: 0,merchant_name,tags,merchant_abn,take_rate,type
0,Felis Limited,"furniture, home furnishings and equipment shop...",10023283211,0.18,e
1,Arcu Ac Orci Corporation,"cable, satellite, and other pay television and...",10142254217,4.22,b
2,Nunc Sed Company,"jewelry, watch, clock, and silverware shops",10165489824,4.4,b
3,Ultricies Dignissim Lacus Foundation,"watch, clock, and jewelry repair shops",10187291046,3.29,b
4,Enim Condimentum PC,"music shops - musical instruments, pianos, and...",10192359162,6.33,a


In [14]:
merchants = merchants[['merchant_abn', 'tags', 'take_rate']]
merchants.head()

Unnamed: 0,merchant_abn,tags,take_rate
0,10023283211,"furniture, home furnishings and equipment shop...",0.18
1,10142254217,"cable, satellite, and other pay television and...",4.22
2,10165489824,"jewelry, watch, clock, and silverware shops",4.4
3,10187291046,"watch, clock, and jewelry repair shops",3.29
4,10192359162,"music shops - musical instruments, pianos, and...",6.33


In [15]:
merchants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4026 entries, 0 to 4025
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   merchant_abn  4026 non-null   int64  
 1   tags          4026 non-null   object 
 2   take_rate     4026 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 94.5+ KB


In [16]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
encoder = LabelEncoder()  
tags = encoder.fit_transform(merchants['tags'].values)  
tags = np.array([tags]).T
merchants = pd.concat([merchants, pd.DataFrame(tags)], axis=1)

In [17]:
merchants.head()

Unnamed: 0,merchant_abn,tags,take_rate,0
0,10023283211,"furniture, home furnishings and equipment shop...",0.18,11
1,10142254217,"cable, satellite, and other pay television and...",4.22,5
2,10165489824,"jewelry, watch, clock, and silverware shops",4.4,15
3,10187291046,"watch, clock, and jewelry repair shops",3.29,24
4,10192359162,"music shops - musical instruments, pianos, and...",6.33,18


In [18]:
merchants = merchants.rename({merchants.columns[-1]:'tag_int'}, axis=1)

- tag prediction - classification

In [19]:
dataset = pd.merge(new_data, merchants[['merchant_abn','tag_int']], 'left', 'merchant_abn')

In [20]:
dataset.dropna(subset=['tag_int'])

Unnamed: 0,merchant_abn,dollar_mean,dollar_min,dollar_max,ncd_mean,Female_count,Undisclosed_count,Male_count,SA_count,NSW_count,TAS_count,WA_count,QLD_count,ACT_count,NT_count,VIC_count,time_span,tag_int
0,10023283211,215.907357,4.016398,1306.809868,17671.272822,0.448084,0.101045,0.450871,0.106969,0.293380,0.035192,0.159582,0.148084,0.008362,0.014634,0.233798,546,11.0
1,10142254217,38.887890,0.010086,302.116185,17868.966892,0.430399,0.104590,0.465011,0.103461,0.281415,0.041761,0.162152,0.142212,0.009782,0.016178,0.243040,546,5.0
2,10165489824,12785.268849,4050.093665,20636.897020,16378.000000,0.250000,0.500000,0.250000,0.250000,0.250000,0.000000,0.250000,0.000000,0.000000,0.000000,0.250000,249,15.0
3,10187291046,118.394949,1.007069,544.979881,20445.269360,0.451178,0.114478,0.434343,0.141414,0.259259,0.043771,0.161616,0.161616,0.006734,0.010101,0.215488,540,24.0
4,10192359162,459.328629,10.364227,2314.116256,16838.854103,0.431611,0.085106,0.483283,0.112462,0.279635,0.033435,0.164134,0.161094,0.009119,0.009119,0.231003,544,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4416,99938978285,30.010095,0.005036,332.645347,17703.350356,0.449190,0.101815,0.448995,0.113804,0.290538,0.036034,0.155606,0.148347,0.009786,0.014582,0.231303,546,19.0
4417,99974311662,286.284404,19.023977,1216.210828,16049.191304,0.469565,0.147826,0.382609,0.113043,0.295652,0.034783,0.121739,0.173913,0.008696,0.000000,0.252174,538,4.0
4418,99976658299,150.076506,3.078433,671.566546,17621.841037,0.443165,0.098736,0.458099,0.113524,0.287618,0.035482,0.157938,0.149885,0.009859,0.015716,0.229977,546,20.0
4419,99987905597,357.227525,57.274540,1676.478636,21472.900000,0.443750,0.075000,0.481250,0.162500,0.275000,0.037500,0.143750,0.150000,0.006250,0.012500,0.212500,534,17.0


In [21]:
train_tag = dataset.dropna(subset=['tag_int'])
test_tag = dataset[dataset.tag_int.isna()]

In [22]:
train_tag.tag_int = train_tag.tag_int.astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [23]:
train_tag.dtypes

merchant_abn           int64
dollar_mean          float64
dollar_min           float64
dollar_max           float64
ncd_mean             float64
Female_count         float64
Undisclosed_count    float64
Male_count           float64
SA_count             float64
NSW_count            float64
TAS_count            float64
WA_count             float64
QLD_count            float64
ACT_count            float64
NT_count             float64
VIC_count            float64
time_span              int64
tag_int                int64
dtype: object

In [24]:
X_train_tag = train_tag.loc[:, :'time_span']

In [25]:
X_train_tag.head(5)

Unnamed: 0,merchant_abn,dollar_mean,dollar_min,dollar_max,ncd_mean,Female_count,Undisclosed_count,Male_count,SA_count,NSW_count,TAS_count,WA_count,QLD_count,ACT_count,NT_count,VIC_count,time_span
0,10023283211,215.907357,4.016398,1306.809868,17671.272822,0.448084,0.101045,0.450871,0.106969,0.29338,0.035192,0.159582,0.148084,0.008362,0.014634,0.233798,546
1,10142254217,38.88789,0.010086,302.116185,17868.966892,0.430399,0.10459,0.465011,0.103461,0.281415,0.041761,0.162152,0.142212,0.009782,0.016178,0.24304,546
2,10165489824,12785.268849,4050.093665,20636.89702,16378.0,0.25,0.5,0.25,0.25,0.25,0.0,0.25,0.0,0.0,0.0,0.25,249
3,10187291046,118.394949,1.007069,544.979881,20445.26936,0.451178,0.114478,0.434343,0.141414,0.259259,0.043771,0.161616,0.161616,0.006734,0.010101,0.215488,540
4,10192359162,459.328629,10.364227,2314.116256,16838.854103,0.431611,0.085106,0.483283,0.112462,0.279635,0.033435,0.164134,0.161094,0.009119,0.009119,0.231003,544


In [26]:
Y_train_tag = train_tag['tag_int']

In [27]:
X_test_tag = test_tag.loc[:, :'time_span']

In [28]:
from sklearn.naive_bayes import MultinomialNB

In [29]:
from sklearn.multiclass import OneVsRestClassifier
model = OneVsRestClassifier(MultinomialNB(alpha=0.1))
multi_model = model.fit(X_train_tag, Y_train_tag)
y_pred_tag = multi_model.predict(X_test_tag)

In [30]:
test_tag['tag_int'] = y_pred_tag

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [31]:
test_tag.head()

Unnamed: 0,merchant_abn,dollar_mean,dollar_min,dollar_max,ncd_mean,Female_count,Undisclosed_count,Male_count,SA_count,NSW_count,TAS_count,WA_count,QLD_count,ACT_count,NT_count,VIC_count,time_span,tag_int
29,10672322914,3929.548403,791.291712,9666.91563,17160.92233,0.466019,0.135922,0.398058,0.126214,0.31068,0.038835,0.135922,0.135922,0.019417,0.009709,0.223301,545,22
47,11029341140,2535.748329,281.078519,5755.891571,7239.714286,0.571429,0.0,0.428571,0.142857,0.285714,0.142857,0.142857,0.071429,0.0,0.071429,0.142857,510,22
53,11137507330,114.993262,0.309387,894.480908,18284.652199,0.444318,0.099943,0.45574,0.115363,0.291262,0.035979,0.154198,0.146202,0.007424,0.011993,0.237579,546,21
60,11240426404,127.984243,0.02404,1408.151712,18054.871882,0.442844,0.104082,0.453074,0.110344,0.288457,0.036933,0.160422,0.148724,0.009383,0.013806,0.231931,546,18
65,11358147682,11972.309236,1785.170832,37268.557739,20327.285714,0.402597,0.103896,0.493506,0.103896,0.25974,0.012987,0.168831,0.12987,0.0,0.025974,0.298701,532,22


- rate prediction (linear_prediction)

In [32]:
rate_data = pd.merge(new_data, merchants[['merchant_abn','take_rate']], 'left', 'merchant_abn')

In [33]:
rate_data.dropna(subset=['take_rate'])

Unnamed: 0,merchant_abn,dollar_mean,dollar_min,dollar_max,ncd_mean,Female_count,Undisclosed_count,Male_count,SA_count,NSW_count,TAS_count,WA_count,QLD_count,ACT_count,NT_count,VIC_count,time_span,take_rate
0,10023283211,215.907357,4.016398,1306.809868,17671.272822,0.448084,0.101045,0.450871,0.106969,0.293380,0.035192,0.159582,0.148084,0.008362,0.014634,0.233798,546,0.18
1,10142254217,38.887890,0.010086,302.116185,17868.966892,0.430399,0.104590,0.465011,0.103461,0.281415,0.041761,0.162152,0.142212,0.009782,0.016178,0.243040,546,4.22
2,10165489824,12785.268849,4050.093665,20636.897020,16378.000000,0.250000,0.500000,0.250000,0.250000,0.250000,0.000000,0.250000,0.000000,0.000000,0.000000,0.250000,249,4.40
3,10187291046,118.394949,1.007069,544.979881,20445.269360,0.451178,0.114478,0.434343,0.141414,0.259259,0.043771,0.161616,0.161616,0.006734,0.010101,0.215488,540,3.29
4,10192359162,459.328629,10.364227,2314.116256,16838.854103,0.431611,0.085106,0.483283,0.112462,0.279635,0.033435,0.164134,0.161094,0.009119,0.009119,0.231003,544,6.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4416,99938978285,30.010095,0.005036,332.645347,17703.350356,0.449190,0.101815,0.448995,0.113804,0.290538,0.036034,0.155606,0.148347,0.009786,0.014582,0.231303,546,4.50
4417,99974311662,286.284404,19.023977,1216.210828,16049.191304,0.469565,0.147826,0.382609,0.113043,0.295652,0.034783,0.121739,0.173913,0.008696,0.000000,0.252174,538,3.17
4418,99976658299,150.076506,3.078433,671.566546,17621.841037,0.443165,0.098736,0.458099,0.113524,0.287618,0.035482,0.157938,0.149885,0.009859,0.015716,0.229977,546,6.57
4419,99987905597,357.227525,57.274540,1676.478636,21472.900000,0.443750,0.075000,0.481250,0.162500,0.275000,0.037500,0.143750,0.150000,0.006250,0.012500,0.212500,534,6.82


In [34]:
train_rate = rate_data.dropna(subset=['take_rate'])
test_rate = rate_data[rate_data.take_rate.isna()]

In [35]:
X_train = train_rate.loc[:, :'time_span']
Y_train = train_rate['take_rate']
X_test = test_rate.loc[:, :'time_span']

In [36]:
from sklearn.linear_model import LinearRegression

In [37]:
linreg = LinearRegression()

In [38]:
linreg.fit(X_train, Y_train)

LinearRegression()

In [39]:
y_pred = linreg.predict(X_test)

In [40]:
test_rate['take_rate'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [41]:
test_rate.head()

Unnamed: 0,merchant_abn,dollar_mean,dollar_min,dollar_max,ncd_mean,Female_count,Undisclosed_count,Male_count,SA_count,NSW_count,TAS_count,WA_count,QLD_count,ACT_count,NT_count,VIC_count,time_span,take_rate
29,10672322914,3929.548403,791.291712,9666.91563,17160.92233,0.466019,0.135922,0.398058,0.126214,0.31068,0.038835,0.135922,0.135922,0.019417,0.009709,0.223301,545,4.32363
47,11029341140,2535.748329,281.078519,5755.891571,7239.714286,0.571429,0.0,0.428571,0.142857,0.285714,0.142857,0.142857,0.071429,0.0,0.071429,0.142857,510,4.368066
53,11137507330,114.993262,0.309387,894.480908,18284.652199,0.444318,0.099943,0.45574,0.115363,0.291262,0.035979,0.154198,0.146202,0.007424,0.011993,0.237579,546,4.491005
60,11240426404,127.984243,0.02404,1408.151712,18054.871882,0.442844,0.104082,0.453074,0.110344,0.288457,0.036933,0.160422,0.148724,0.009383,0.013806,0.231931,546,4.497794
65,11358147682,11972.309236,1785.170832,37268.557739,20327.285714,0.402597,0.103896,0.493506,0.103896,0.25974,0.012987,0.168831,0.12987,0.0,0.025974,0.298701,532,4.012999


### write to file

In [42]:
with_merchant = data.dropna(subset=['merchant_name'])
null_merchant = data[data.merchant_name.isna()]

In [43]:
rate = test_rate[['merchant_abn', 'take_rate']]

In [44]:
tag = test_tag[['merchant_abn', 'tag_int']]

In [45]:
prediction = pd.merge(rate, tag, on='merchant_abn')
prediction.head()

Unnamed: 0,merchant_abn,take_rate,tag_int
0,10672322914,4.32363,22
1,11029341140,4.368066,22
2,11137507330,4.491005,21
3,11240426404,4.497794,18
4,11358147682,4.012999,22


In [46]:
null_merchant_pred = pd.merge(null_merchant, prediction, on='merchant_abn')

In [47]:
null_merchant_pred.head()

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,take_rate_x,type,postcode,consumer_id,...,POA_CODE21,2021_popu,latitude,longitude,New cases / day,month,total,order_datetime,take_rate_y,tag_int
0,17815,10672322914,4059.498999,650b8c72-152b-4751-bf5f-05a924fd84a6,,,,,5345,309325,...,5345,1572.312894,140.451037,-34.28755,8.0,2021-03-01,30725.9,2021-03-01,4.32363,22
1,13558,10672322914,5448.248789,2894ea31-5bc6-46c2-8b54-2e8c76c8f8de,,,,,6845,1204030,...,6000,15045.030572,115.860904,-31.950215,8.0,2021-03-01,30725.9,2021-03-01,4.32363,22
2,9379,10672322914,1862.598116,a2e2c739-35a1-443f-a8ec-f65441c06aec,,,,,2486,452795,...,2486,40170.829131,153.473766,-28.217273,8.0,2021-03-01,30725.9,2021-03-02,4.32363,22
3,20111,10672322914,6875.771616,5b6189f9-9973-4808-b0de-6ba6039a94f4,,,,,4214,1261357,...,4214,39314.000023,153.365393,-27.961722,13.0,2021-03-01,30725.9,2021-03-05,4.32363,22
4,20111,10672322914,6875.771616,5b6189f9-9973-4808-b0de-6ba6039a94f4,,,,,4214,1261357,...,4214,39314.000023,153.365393,-27.961722,13.0,2021-03-01,30725.9,2021-03-05,4.32363,22


In [48]:
null_merchant_pred['tags'] = encoder.inverse_transform(null_merchant_pred['tag_int'].values)
null_merchant_pred.head()

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,take_rate_x,type,postcode,consumer_id,...,POA_CODE21,2021_popu,latitude,longitude,New cases / day,month,total,order_datetime,take_rate_y,tag_int
0,17815,10672322914,4059.498999,650b8c72-152b-4751-bf5f-05a924fd84a6,,telecom,,,5345,309325,...,5345,1572.312894,140.451037,-34.28755,8.0,2021-03-01,30725.9,2021-03-01,4.32363,22
1,13558,10672322914,5448.248789,2894ea31-5bc6-46c2-8b54-2e8c76c8f8de,,telecom,,,6845,1204030,...,6000,15045.030572,115.860904,-31.950215,8.0,2021-03-01,30725.9,2021-03-01,4.32363,22
2,9379,10672322914,1862.598116,a2e2c739-35a1-443f-a8ec-f65441c06aec,,telecom,,,2486,452795,...,2486,40170.829131,153.473766,-28.217273,8.0,2021-03-01,30725.9,2021-03-02,4.32363,22
3,20111,10672322914,6875.771616,5b6189f9-9973-4808-b0de-6ba6039a94f4,,telecom,,,4214,1261357,...,4214,39314.000023,153.365393,-27.961722,13.0,2021-03-01,30725.9,2021-03-05,4.32363,22
4,20111,10672322914,6875.771616,5b6189f9-9973-4808-b0de-6ba6039a94f4,,telecom,,,4214,1261357,...,4214,39314.000023,153.365393,-27.961722,13.0,2021-03-01,30725.9,2021-03-05,4.32363,22


In [49]:
null_merchant_pred['take_rate'] = null_merchant_pred['take_rate_y']
null_merchant_pred = null_merchant_pred.drop(['take_rate_y', 'take_rate_x', 'tag_int'], axis=1)

In [50]:
null_merchant_pred.head()

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,type,postcode,consumer_id,name,...,gender,POA_CODE21,2021_popu,latitude,longitude,New cases / day,month,total,order_datetime,take_rate
0,17815,10672322914,4059.498999,650b8c72-152b-4751-bf5f-05a924fd84a6,,telecom,,5345,309325,Thomas Ballard,...,Male,5345,1572.312894,140.451037,-34.28755,8.0,2021-03-01,30725.9,2021-03-01,4.32363
1,13558,10672322914,5448.248789,2894ea31-5bc6-46c2-8b54-2e8c76c8f8de,,telecom,,6845,1204030,Jennifer Grimes,...,Female,6000,15045.030572,115.860904,-31.950215,8.0,2021-03-01,30725.9,2021-03-01,4.32363
2,9379,10672322914,1862.598116,a2e2c739-35a1-443f-a8ec-f65441c06aec,,telecom,,2486,452795,Felicia Green,...,Female,2486,40170.829131,153.473766,-28.217273,8.0,2021-03-01,30725.9,2021-03-02,4.32363
3,20111,10672322914,6875.771616,5b6189f9-9973-4808-b0de-6ba6039a94f4,,telecom,,4214,1261357,Amy Jones,...,Female,4214,39314.000023,153.365393,-27.961722,13.0,2021-03-01,30725.9,2021-03-05,4.32363
4,20111,10672322914,6875.771616,5b6189f9-9973-4808-b0de-6ba6039a94f4,,telecom,,4214,1261357,Amy Jones,...,Female,4214,39314.000023,153.365393,-27.961722,13.0,2021-03-01,30725.9,2021-03-05,4.32363


### get the merchants rate range for `type`

In [51]:
merchants = pd.read_parquet('../data/curated/merchants_data.parquet')
merchants.groupby('type').describe()['take_rate']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
a,1602.0,6.232297,0.434689,5.5,5.85,6.23,6.6,7.0
b,1351.0,4.094056,0.589149,3.1,3.58,4.1,4.615,5.1
c,922.0,2.251204,0.495825,1.4,1.83,2.25,2.69,3.1
d,98.0,0.991224,0.261846,0.52,0.765,1.025,1.21,1.4
e,53.0,0.314717,0.103378,0.1,0.23,0.31,0.4,0.48


- a

In [52]:
null_merchant_pred[null_merchant_pred.take_rate > 5.50]

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,type,postcode,consumer_id,name,...,gender,POA_CODE21,2021_popu,latitude,longitude,New cases / day,month,total,order_datetime,take_rate


In [53]:
null_merchant_pred[null_merchant_pred.take_rate > 5.10]

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,type,postcode,consumer_id,name,...,gender,POA_CODE21,2021_popu,latitude,longitude,New cases / day,month,total,order_datetime,take_rate


- b

In [54]:
type_b = null_merchant_pred[null_merchant_pred.take_rate > 3.10]

In [55]:
type_b['type'] = 'b'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [56]:
(null_merchant_pred[null_merchant_pred.take_rate > 3.10]).index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            514055, 514056, 514057, 514058, 514059, 514060, 514061, 514062,
            514063, 514064],
           dtype='int64', length=514062)

In [57]:
rest = null_merchant_pred.drop((null_merchant_pred[null_merchant_pred.take_rate > 3.10]).index)

In [58]:
rest

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,type,postcode,consumer_id,name,...,gender,POA_CODE21,2021_popu,latitude,longitude,New cases / day,month,total,order_datetime,take_rate
350493,21393,60945400664,13570.50821,614f7e65-1c7e-4de9-8e31-09f37b74ed29,,telecom,,7109,346984,Cindy Smith,...,Female,7109,12519.825456,146.919815,-43.161115,1466.0,2021-11-01,33345.0,2021-11-13,3.081884
350494,1553,60945400664,3587.360671,aa13793f-3a98-45c5-be1e-5e98f12f4c8a,,telecom,,2648,1349610,James Shannon,...,Male,2648,3946.628728,142.065957,-33.418082,3935.0,2021-12-01,31967.9,2021-12-19,3.081884
350495,10396,60945400664,9231.418174,5b976a52-c13d-4526-88ce-758c07f14888,,telecom,,2229,533327,Donald Myers,...,Male,2229,33256.921758,151.12242,-34.044515,277619.0,2022-01-01,32492.0,2022-01-30,3.081884


- c

In [59]:
type_c = rest[rest.take_rate > 1.40]

In [60]:
type_c['type'] = 'c'

In [61]:
rest = rest.drop((rest[rest.take_rate > 1.40]).index)

In [62]:
rest

Unnamed: 0,user_id,merchant_abn,dollar_value,order_id,merchant_name,tags,type,postcode,consumer_id,name,...,gender,POA_CODE21,2021_popu,latitude,longitude,New cases / day,month,total,order_datetime,take_rate


### concat

In [63]:
len(null_merchant_pred)

514065

In [64]:
temp = pd.concat([type_b, type_c])
len(temp)

514065

In [65]:
null_merchant_pred = temp

In [66]:
new_data = pd.concat([with_merchant, null_merchant_pred])

In [67]:
new_data.to_parquet('../data/curated/filled.parquet', index = False)

### record predicted merchant information

In [68]:
prediction.head()

Unnamed: 0,merchant_abn,take_rate,tag_int
0,10672322914,4.32363,22
1,11029341140,4.368066,22
2,11137507330,4.491005,21
3,11240426404,4.497794,18
4,11358147682,4.012999,22


In [69]:
prediction['tags'] = encoder.inverse_transform(prediction['tag_int'].values)
prediction.head()

Unnamed: 0,merchant_abn,take_rate,tag_int,tags
0,10672322914,4.32363,22,telecom
1,11029341140,4.368066,22,telecom
2,11137507330,4.491005,21,"stationery, office supplies and printing and w..."
3,11240426404,4.497794,18,"music shops - musical instruments, pianos, and..."
4,11358147682,4.012999,22,telecom


In [70]:
prediction = prediction.drop('tag_int', axis=1)

In [71]:
temp = null_merchant_pred[['merchant_abn', 'type']].head()

In [72]:
prediction.head()

Unnamed: 0,merchant_abn,take_rate,tags
0,10672322914,4.32363,telecom
1,11029341140,4.368066,telecom
2,11137507330,4.491005,"stationery, office supplies and printing and w..."
3,11240426404,4.497794,"music shops - musical instruments, pianos, and..."
4,11358147682,4.012999,telecom


In [73]:
prediction = pd.merge(prediction, temp, on='merchant_abn')
prediction.head()

Unnamed: 0,merchant_abn,take_rate,tags,type
0,10672322914,4.32363,telecom,b
1,10672322914,4.32363,telecom,b
2,10672322914,4.32363,telecom,b
3,10672322914,4.32363,telecom,b
4,10672322914,4.32363,telecom,b


In [74]:
merchants_data_total = pd.concat([merchants, prediction])
merchants_data_total.head()

Unnamed: 0,merchant_name,tags,merchant_abn,take_rate,type
0,Felis Limited,"furniture, home furnishings and equipment shop...",10023283211,0.18,e
1,Arcu Ac Orci Corporation,"cable, satellite, and other pay television and...",10142254217,4.22,b
2,Nunc Sed Company,"jewelry, watch, clock, and silverware shops",10165489824,4.4,b
3,Ultricies Dignissim Lacus Foundation,"watch, clock, and jewelry repair shops",10187291046,3.29,b
4,Enim Condimentum PC,"music shops - musical instruments, pianos, and...",10192359162,6.33,a


In [76]:
merchants_data_total.to_parquet('../data/curated/merchants_data_total.parquet', index=False)