## Load Dataset

In [1]:
import pandas as pd
import numpy as np
pr = pd.read_csv('past_result.csv', "\t")
crm = pd.read_csv('crm.csv', "\t")
event = pd.read_csv('event_log.csv', "\t")
tran = pd.read_csv('transaction.csv', "\t")
pot = pd.read_csv('potential_users.csv', "\t")
crm = crm.sort_values(["customer_age", "arpu"])
crm2 = crm.groupby("customer_id").agg("last")


## Group by Customer id and Merchant Category and mean, std, median, sum of the GTV for each individual customer id

In [2]:
tt2 = tran.groupby(["customer_id", "MerchantCategory"])["GTV"].apply(lambda x: x.mean())
tt2 = tt2.reset_index()
tt2 = tt2.pivot(index = "customer_id", columns = "MerchantCategory", values = "GTV").fillna(0)
tt2.columns = [str(j) + '_mean' for j in tt2.columns]

In [3]:
tt3 = tran.groupby(["customer_id", "MerchantCategory"])["GTV"].apply(lambda x: np.std(x))
tt3 = tt3.reset_index()
tt3 = tt3.pivot(index = "customer_id", columns = "MerchantCategory", values = "GTV").fillna(0)
tt3.columns = [str(j) + '_std' for j in tt3.columns]

In [4]:
tt4 = tran.groupby(["customer_id", "MerchantCategory"])["GTV"].apply(lambda x: np.median(x))
tt4 = tt4.reset_index()
tt4 = tt4.pivot(index = "customer_id", columns = "MerchantCategory", values = "GTV").fillna(0)
tt4.columns = [str(j) + '_median' for j in tt4.columns]

In [5]:
tt5 = tran.groupby(["customer_id", "MerchantCategory"])["GTV"].apply(lambda x: np.max(x))
tt5 = tt5.reset_index()
tt5 = tt5.pivot(index = "customer_id", columns = "MerchantCategory", values = "GTV").fillna(0)
tt5.columns = [str(j) + '_max' for j in tt5.columns]

In [6]:
tt = tran.groupby(["customer_id", "MerchantCategory"])["GTV"].apply(lambda x: x.sum())
tt = tt.reset_index()

## Impute missing value of GTV with 0

In [7]:
tt = tt.pivot(index = "customer_id", columns = "MerchantCategory", values = "GTV").fillna(0)

## Merge potential User dataset with Customer Profile dataset by using left join

In [8]:
traindf = pd.merge(pr, crm2, left_on = 'customer_id', right_on = "customer_id", how = "left")
testdf = pd.merge(pot, crm2, left_on = 'customer_id', right_on = "customer_id", how = "left")


## Merge traindf dataset with transaction dataset by using left join. Then, drop Customer ID column and model column

In [9]:
traindf = pd.merge(traindf, tt5, left_on = 'customer_id', right_on = "customer_id", how = "left")
testdf = pd.merge(testdf, tt5, left_on = 'customer_id', right_on = "customer_id", how = "left")

In [10]:
traindf = pd.merge(traindf, tt4, left_on = 'customer_id', right_on = "customer_id", how = "left")
testdf = pd.merge(testdf, tt4, left_on = 'customer_id', right_on = "customer_id", how = "left")

In [11]:
traindf = pd.merge(traindf, tt3, left_on = 'customer_id', right_on = "customer_id", how = "left")
testdf = pd.merge(testdf, tt3, left_on = 'customer_id', right_on = "customer_id", how = "left")

In [12]:
traindf = pd.merge(traindf, tt2, left_on = 'customer_id', right_on = "customer_id", how = "left")
testdf = pd.merge(testdf, tt2, left_on = 'customer_id', right_on = "customer_id", how = "left")

In [13]:
traindf = pd.merge(traindf, tt, left_on = 'customer_id', right_on = "customer_id", how = "left").drop(["customer_id","model"],axis=1)
testdf = pd.merge(testdf, tt, left_on = 'customer_id', right_on = "customer_id", how = "left").drop(["customer_id","model"],axis=1)

## Impute the customer age column with mean.

In [14]:
traindf["customer_age"] = traindf["customer_age"].fillna(traindf["customer_age"].mean())
testdf["customer_age"] = testdf["customer_age"].fillna(testdf["customer_age"].mean())

## Convert the gender column to Boolean variable

In [15]:
gender_dict = dict(male = True, female = False)

In [16]:
traindf.tail()

Unnamed: 0,campaign_outcome,customer_age,arpu,gender,F&B_max,Household Goods and Groceries_max,Retail_max,Services_max,Transit & Travel_max,F&B_median,...,F&B_mean,Household Goods and Groceries_mean,Retail_mean,Services_mean,Transit & Travel_mean,F&B,Household Goods and Groceries,Retail,Services,Transit & Travel
5971,Not Interested,28.0,118.95,female,235.35,0.0,0.0,0.0,0.0,55.16,...,74.605789,0.0,0.0,0.0,0.0,1417.51,0.0,0.0,0.0,0.0
5972,Not Interested,38.0,75.0,female,880.68,564.11,71.75,67.53,0.0,30.37,...,121.5375,86.576364,52.26,67.53,0.0,1458.45,952.34,209.04,67.53,0.0
5973,Not Interested,20.0,148.78,female,149.79,88.39,139.1,159.38,51.34,40.56,...,59.533333,41.712,100.815,159.38,32.843333,535.8,208.56,201.63,159.38,295.59
5974,Neuron electric scooter,24.0,80.09,male,110.74,44.02,189.52,0.0,49.44,39.63,...,41.164545,28.27,65.304,0.0,27.605385,452.81,56.54,326.52,0.0,358.87
5975,Neuron electric scooter,27.0,105.75,male,118.83,48.87,15.28,0.0,47.77,47.47,...,51.939091,27.18,15.28,0.0,30.6075,571.33,54.36,15.28,0.0,367.29


In [17]:
testdf.head()

Unnamed: 0,customer_age,arpu,gender,F&B_max,Household Goods and Groceries_max,Retail_max,Services_max,Transit & Travel_max,F&B_median,Household Goods and Groceries_median,...,F&B_mean,Household Goods and Groceries_mean,Retail_mean,Services_mean,Transit & Travel_mean,F&B,Household Goods and Groceries,Retail,Services,Transit & Travel
0,22.0,71.95,male,128.35,0.0,182.59,592.59,0.0,42.16,0.0,...,52.582353,0.0,99.792,132.51,0.0,893.9,0.0,498.96,1060.08,0.0
1,35.117439,86.24,female,94.12,51.11,166.52,24.85,50.85,62.945,51.11,...,56.96,51.11,88.45,24.85,26.217692,227.84,51.11,353.8,24.85,340.83
2,49.0,53.36,female,134.94,115.57,165.09,139.53,0.0,51.85,35.34,...,54.035833,49.882727,69.60375,64.916667,0.0,648.43,548.71,556.83,389.5,0.0
3,49.0,79.64,male,179.73,0.0,0.0,195.51,0.0,74.04,0.0,...,70.8075,0.0,0.0,77.696667,0.0,1416.15,0.0,0.0,233.09,0.0
4,43.0,101.65,male,56.08,38.19,182.78,0.0,90.06,40.985,38.19,...,35.003333,38.19,131.24,0.0,30.722308,210.02,38.19,393.72,0.0,399.39


## Replace the Boolean variable to gender variable

In [18]:
traindf["gender"] = traindf["gender"].replace(gender_dict)
testdf["gender"] = testdf["gender"].replace(gender_dict)


## Replace the campaign outcome variable with the unique campaign outcome

In [19]:
labels = traindf['campaign_outcome'].unique()
label_replace = {val:index for index, val in enumerate(labels)}
traindf['campaign_outcome'] = traindf['campaign_outcome'].replace(label_replace)

In [20]:
X = traindf.iloc[:,1:]
y= traindf.iloc[:,0]

In [21]:
X.head()


Unnamed: 0,customer_age,arpu,gender,F&B_max,Household Goods and Groceries_max,Retail_max,Services_max,Transit & Travel_max,F&B_median,Household Goods and Groceries_median,...,F&B_mean,Household Goods and Groceries_mean,Retail_mean,Services_mean,Transit & Travel_mean,F&B,Household Goods and Groceries,Retail,Services,Transit & Travel
0,49.0,85.13,True,1158.95,0.0,0.0,0.0,0.0,48.49,0.0,...,123.646471,0.0,0.0,0.0,0.0,2101.99,0.0,0.0,0.0,0.0
1,28.0,189.2,True,906.98,77.13,162.28,74.56,0.0,34.09,47.465,...,87.544118,47.28,74.61,74.56,0.0,1488.25,189.12,373.05,74.56,0.0
2,40.0,64.08,False,733.77,0.0,0.0,0.0,0.0,57.2,0.0,...,150.496875,0.0,0.0,0.0,0.0,2407.95,0.0,0.0,0.0,0.0
3,30.0,131.87,False,141.09,197.53,177.37,98.35,0.0,28.04,33.12,...,44.22,55.174444,69.252727,48.293333,0.0,397.98,496.57,761.78,289.76,0.0
4,34.552137,117.33,True,161.6,31.35,998.43,0.0,0.0,31.47,29.29,...,50.258462,22.296667,312.748,0.0,0.0,653.36,66.89,1563.74,0.0,0.0


## Run Extreme Gradient Boosting

## Set the range of hyperparameters for XGBoost

In [22]:
import numpy as np
import xgboost as xgb

# dtrain = xgb.DMatrix(data=X, label=y)
# dtest = xgb.DMatrix(data=testdf)

gbm_param_grid = {
    'learning_rate': np.arange(0.05, 0.95, .025),
    'max_depth': np.arange(2,12, 1),
    'n_estimators': np.arange(2, 15, 1),
    'colsample_bytree': np.arange(0.7, 1, 0.02),

}

In [23]:
bst = xgb.XGBClassifier(objective='multi:softmax',
                       num_class= 3,
                       n_gpus= 0)

## Create a custom evaluation function - profit calculation from given profit matrix

In [24]:
from sklearn.metrics import make_scorer

In [25]:
def score_func(y, y_pred):
    profit_matrix = np.array([[70,-40,-10],[-110,330,-120],[-10,-30,20]])
    return np.sum([profit_matrix[(i+2)%3, (j+2)%3] for i,j in zip(y,y_pred)])

In [26]:
sf = make_scorer(score_func)

## Perform RandomizedSearchCV

In [27]:
# from sklearn.model_selection import RandomizedSearchCV
# randomized_roc_auc = RandomizedSearchCV(estimator=bst,
#                                         param_distributions=gbm_param_grid,
#                                         n_iter=5,scoring='accuracy', cv=50, verbose=2)

from sklearn.model_selection import RandomizedSearchCV
randomized_roc_auc = RandomizedSearchCV(estimator=bst,
                                        param_distributions=gbm_param_grid,
                                        n_iter=5,scoring=sf, cv=35, random_state = 123, verbose=2)

In [28]:
%%time
# Fit the estimator
baa=randomized_roc_auc.fit(X, y)

Fitting 35 folds for each of 5 candidates, totalling 175 fits
[CV] n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76, total=   1.9s
[CV] n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s


[CV]  n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76, total=   1.3s
[CV] n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76 
[CV]  n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76, total=   1.5s
[CV] n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76 
[CV]  n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76, total=   1.4s
[CV] n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76 
[CV]  n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76, total=   1.3s
[CV] n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76 
[CV]  n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0.76, total=   1.2s
[CV] n_estimators=10, max_depth=11, learning_rate=0.3500000000000001, colsample_bytree=0

[CV]  n_estimators=4, max_depth=8, learning_rate=0.9250000000000004, colsample_bytree=0.8, total=   0.4s
[CV] n_estimators=4, max_depth=8, learning_rate=0.9250000000000004, colsample_bytree=0.8 
[CV]  n_estimators=4, max_depth=8, learning_rate=0.9250000000000004, colsample_bytree=0.8, total=   0.4s
[CV] n_estimators=4, max_depth=8, learning_rate=0.9250000000000004, colsample_bytree=0.8 
[CV]  n_estimators=4, max_depth=8, learning_rate=0.9250000000000004, colsample_bytree=0.8, total=   0.4s
[CV] n_estimators=4, max_depth=8, learning_rate=0.9250000000000004, colsample_bytree=0.8 
[CV]  n_estimators=4, max_depth=8, learning_rate=0.9250000000000004, colsample_bytree=0.8, total=   0.4s
[CV] n_estimators=4, max_depth=8, learning_rate=0.9250000000000004, colsample_bytree=0.8 
[CV]  n_estimators=4, max_depth=8, learning_rate=0.9250000000000004, colsample_bytree=0.8, total=   0.4s
[CV] n_estimators=4, max_depth=8, learning_rate=0.9250000000000004, colsample_bytree=0.8 
[CV]  n_estimators=4, max

[CV]  n_estimators=13, max_depth=5, learning_rate=0.7500000000000002, colsample_bytree=0.76, total=   0.8s
[CV] n_estimators=13, max_depth=5, learning_rate=0.7500000000000002, colsample_bytree=0.76 
[CV]  n_estimators=13, max_depth=5, learning_rate=0.7500000000000002, colsample_bytree=0.76, total=   0.8s
[CV] n_estimators=13, max_depth=5, learning_rate=0.7500000000000002, colsample_bytree=0.76 
[CV]  n_estimators=13, max_depth=5, learning_rate=0.7500000000000002, colsample_bytree=0.76, total=   0.8s
[CV] n_estimators=13, max_depth=5, learning_rate=0.7500000000000002, colsample_bytree=0.76 
[CV]  n_estimators=13, max_depth=5, learning_rate=0.7500000000000002, colsample_bytree=0.76, total=   0.8s
[CV] n_estimators=13, max_depth=5, learning_rate=0.7500000000000002, colsample_bytree=0.76 
[CV]  n_estimators=13, max_depth=5, learning_rate=0.7500000000000002, colsample_bytree=0.76, total=   0.8s
[CV] n_estimators=13, max_depth=5, learning_rate=0.7500000000000002, colsample_bytree=0.76 
[CV] 

[CV]  n_estimators=7, max_depth=5, learning_rate=0.15000000000000002, colsample_bytree=0.9600000000000002, total=   0.8s
[CV] n_estimators=7, max_depth=5, learning_rate=0.15000000000000002, colsample_bytree=0.9600000000000002 
[CV]  n_estimators=7, max_depth=5, learning_rate=0.15000000000000002, colsample_bytree=0.9600000000000002, total=   1.2s
[CV] n_estimators=7, max_depth=5, learning_rate=0.15000000000000002, colsample_bytree=0.9600000000000002 
[CV]  n_estimators=7, max_depth=5, learning_rate=0.15000000000000002, colsample_bytree=0.9600000000000002, total=   0.7s
[CV] n_estimators=7, max_depth=5, learning_rate=0.15000000000000002, colsample_bytree=0.9600000000000002 
[CV]  n_estimators=7, max_depth=5, learning_rate=0.15000000000000002, colsample_bytree=0.9600000000000002, total=   0.7s
[CV] n_estimators=7, max_depth=5, learning_rate=0.15000000000000002, colsample_bytree=0.9600000000000002 
[CV]  n_estimators=7, max_depth=5, learning_rate=0.15000000000000002, colsample_bytree=0.960

[CV]  n_estimators=2, max_depth=4, learning_rate=0.5000000000000002, colsample_bytree=0.9600000000000002, total=   0.1s
[CV] n_estimators=2, max_depth=4, learning_rate=0.5000000000000002, colsample_bytree=0.9600000000000002 
[CV]  n_estimators=2, max_depth=4, learning_rate=0.5000000000000002, colsample_bytree=0.9600000000000002, total=   0.1s
[CV] n_estimators=2, max_depth=4, learning_rate=0.5000000000000002, colsample_bytree=0.9600000000000002 
[CV]  n_estimators=2, max_depth=4, learning_rate=0.5000000000000002, colsample_bytree=0.9600000000000002, total=   0.1s
[CV] n_estimators=2, max_depth=4, learning_rate=0.5000000000000002, colsample_bytree=0.9600000000000002 
[CV]  n_estimators=2, max_depth=4, learning_rate=0.5000000000000002, colsample_bytree=0.9600000000000002, total=   0.1s
[CV] n_estimators=2, max_depth=4, learning_rate=0.5000000000000002, colsample_bytree=0.9600000000000002 
[CV]  n_estimators=2, max_depth=4, learning_rate=0.5000000000000002, colsample_bytree=0.960000000000

[Parallel(n_jobs=1)]: Done 175 out of 175 | elapsed:  2.5min finished


Wall time: 2min 30s


In [29]:
baa.best_params_

{'n_estimators': 10,
 'max_depth': 11,
 'learning_rate': 0.3500000000000001,
 'colsample_bytree': 0.76}

In [30]:
baa.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.76, gamma=0,
       learning_rate=0.3500000000000001, max_delta_step=0, max_depth=11,
       min_child_weight=1, missing=None, n_estimators=10, n_gpus=0,
       n_jobs=1, nthread=None, num_class=3, objective='multi:softprob',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

## Predict testdf by using XGBoost

In [2]:
pred = baa.best_estimator_.predict(testdf)


NameError: name 'baa' is not defined

In [32]:
pot["prediction"] = pred

In [33]:
label_rev = {val:key for key, val in label_replace.items()}
pot["prediction"] = pot["prediction"].replace(label_rev)
pot.to_csv("submit_3a_xgb_01.csv", index = False)

## Check confusion matrix (train)

In [34]:
from sklearn.metrics import confusion_matrix
trpred = baa.best_estimator_.predict(X)
cm = confusion_matrix(y_true=y, y_pred = trpred)
print(cm)

[[3723    9    0]
 [ 108 1957    1]
 [  14    9  155]]


## Check total profit (train)

In [35]:
prof_matrix = np.array([[70,-40,-10],[-110,330,-120],[-10,-30,20]])

In [36]:
np.sum(prof_matrix*cm)

896750

## Check predicted classes distribution

In [37]:
pd.DataFrame(pred).iloc[:,0].value_counts() #predicted

0    1733
1     781
2      47
Name: 0, dtype: int64

In [38]:
pd.DataFrame(y).iloc[:,0].value_counts() #actual

0    3732
1    2066
2     178
Name: campaign_outcome, dtype: int64

# Attempt on using LGBM Model

In [43]:
%%time

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# Do K-Fold Stratified Cross Validation (K = 7)
kfold = 7
kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state = 123)

clf = lgb.LGBMClassifier(max_depth=12, learning_rate=0.0025, objective='multiclass',
                             random_state=88, silent=True, metric='None', 
                             n_jobs=4, n_estimators=4500, class_weight='balanced',
                             colsample_bytree =  0.91, min_child_samples = 80, num_leaves = 14, subsample = 0.96)

val_predictions_logloss = []
for train_index, val_index in kf.split(X, y):
    print("=======")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="logloss",
            early_stopping_rounds=400, verbose=200)
    val_predictions_logloss.append(clf.predict(X_val))
    
test_predictions_logloss = clf.predict_proba(testdf)
pred =test_predictions_logloss.argmax(axis = 1)


Training until validation scores don't improve for 400 rounds.
[200]	valid_0's multi_logloss: 0.784303
[400]	valid_0's multi_logloss: 0.624715
[600]	valid_0's multi_logloss: 0.538195
[800]	valid_0's multi_logloss: 0.488763
[1000]	valid_0's multi_logloss: 0.459387
[1200]	valid_0's multi_logloss: 0.439616
[1400]	valid_0's multi_logloss: 0.421921
[1600]	valid_0's multi_logloss: 0.408047
[1800]	valid_0's multi_logloss: 0.398734
[2000]	valid_0's multi_logloss: 0.392949
[2200]	valid_0's multi_logloss: 0.388066
[2400]	valid_0's multi_logloss: 0.384078
[2600]	valid_0's multi_logloss: 0.38107
[2800]	valid_0's multi_logloss: 0.379181
[3000]	valid_0's multi_logloss: 0.3776
[3200]	valid_0's multi_logloss: 0.375844
[3400]	valid_0's multi_logloss: 0.374739
[3600]	valid_0's multi_logloss: 0.374179
[3800]	valid_0's multi_logloss: 0.373246
[4000]	valid_0's multi_logloss: 0.372394
[4200]	valid_0's multi_logloss: 0.371325
[4400]	valid_0's multi_logloss: 0.370454
Did not meet early stopping. Best iteratio

In [44]:
pot["prediction"] = pred
label_rev = {val:key for key, val in label_replace.items()}
pot["prediction"] = pot["prediction"].replace(label_rev)
fname = "submit_3a_lgb_10.csv"
pot.to_csv(fname, index = False)
print("done_" + fname)

done_submit_3a_lgb_10.csv


### Check predicted classes distribution

In [45]:
pd.DataFrame(pred).iloc[:,0].value_counts() #predicted

0    1632
1     839
2      90
Name: 0, dtype: int64

In [46]:
pd.DataFrame(y).iloc[:,0].value_counts() #actual

0    3732
1    2066
2     178
Name: campaign_outcome, dtype: int64