In [35]:
import pandas as pd
import numpy as np
from statistics import mean, stdev
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn import metrics
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot
from time import time
from xgboost import plot_importance

In [36]:
#load data
train = pd.read_csv('train_1026.csv')
train

Unnamed: 0,user_id,seller_id,label,user_gender_female,user_gender_male,user_gender_unknown,user_age_0_18,user_age_18_24,user_age_25_29,user_age_30_34,...,user_seller_purchase_rt_db11,user_seller_add_to_favorite_rt_db11,user_seller_click_rt_db11_wk,user_seller_add_to_cart_rt_db11_wk,user_seller_purchase_rt_db11_wk,user_seller_add_to_favorite_rt_db11_wk,user_seller_click_rt_db11_mth,user_seller_add_to_cart_rt_db11_mth,user_seller_purchase_rt_db11_mth,user_seller_add_to_favorite_rt_db11_mth
0,34176,3906,0,1,0,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,34176,121,0,1,0,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,34176,4356,1,1,0,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,34176,2217,0,1,0,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,230784,4818,0,1,0,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260859,359807,4325,0,0,1,0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
260860,294527,3971,0,0,1,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
260861,294527,152,0,0,1,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
260862,294527,2537,0,0,1,0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [37]:
test = pd.read_csv('test_format1.csv')
test.head()

Unnamed: 0,user_id,merchant_id,prob
0,163968,4605,
1,360576,1581,
2,98688,1964,
3,98688,3645,
4,295296,3361,


Replacing np.inf with 0 allows algorithms to handle such values without causing errors or bias.

In [38]:
#fill all infinite entry with 0
train = train.replace(np.inf, 0)
test = test.replace(np.inf, 0)

In [52]:
#data split (training_set, testing_set)
data_output = train.label
data_input = train.drop('label',axis=1)
#unbalanced
data_output.sum()/data_output.count()
print('In the original data, counts of label 1:',format(sum(data_output==1)))
print('In the original data, counts of label 0:',format(sum(data_output==0)))


In the original data, counts of label 1: 15952
In the original data, counts of label 0: 244912
In the original data, counts of label 2: 0
In the original data, counts of label 3: 0


In [53]:
train.columns

Index(['user_id', 'seller_id', 'label', 'user_gender_female',
       'user_gender_male', 'user_gender_unknown', 'user_age_0_18',
       'user_age_18_24', 'user_age_25_29', 'user_age_30_34', 'user_age_35_39',
       'user_age_40_49', 'user_age_50_', 'user_age_unknown', 'user_click_cnt',
       'user_add_to_cart_cnt', 'user_purchase_cnt', 'user_add_to_favorite_cnt',
       'user_action_total_cnt', 'user_purchase_rt', 'seller_click_cnt',
       'seller_add_to_cart_cnt', 'seller_purchase_cnt',
       'seller_add_to_favorite_cnt', 'seller_action_total_cnt',
       'seller_purchase_rt', 'user_seller_click_cnt',
       'user_seller_add_to_cart_cnt', 'user_seller_purchase_cnt',
       'user_seller_add_to_favorite_cnt', 'user_seller_action_total_cnt',
       'user_seller_purchase_rt', 'user_click_days', 'user_add_to_cart_days',
       'user_purchase_days', 'user_add_to_favorite_days',
       'user_action_total_days', 'user_purchase_day_rt', 'seller_click_days',
       'seller_add_to_cart_days',

In [40]:
training_set_x, testing_set_x, training_set_y, testing_set_y = train_test_split(data_input, data_output, test_size=0.3, random_state=1)
print(training_set_x.shape, testing_set_x.shape, training_set_y.shape, testing_set_y.shape)

(182604, 84) (78260, 84) (182604,) (78260,)


True- Finite
False-Infinite

In [41]:
np.isfinite(training_set_x).all()

user_id                                     True
seller_id                                   True
user_gender_female                          True
user_gender_male                            True
user_gender_unknown                         True
                                           ...  
user_seller_add_to_favorite_rt_db11_wk     False
user_seller_click_rt_db11_mth              False
user_seller_add_to_cart_rt_db11_mth        False
user_seller_purchase_rt_db11_mth           False
user_seller_add_to_favorite_rt_db11_mth    False
Length: 84, dtype: bool

In [42]:
training_set_x_id = training_set_x[['user_id','seller_id']]
testing_set_x_id = testing_set_x[['user_id', 'seller_id']]
training_set_x = training_set_x.drop(['user_id','seller_id'], axis=1)
testing_set_x = testing_set_x.drop(['user_id','seller_id'], axis=1)

In [43]:
testing_set_x_id.head()

Unnamed: 0,user_id,seller_id
10287,189342,4385
38400,342513,1608
231446,265001,246
88252,307588,1861
56734,118311,4808


In [44]:
print(training_set_x.shape)
print(testing_set_x.shape)

(182604, 82)
(78260, 82)


The predict_proba() method is a function that outputs the predicted probabilities of the input data belonging to each class.

In [45]:
#initial model fitting
#HistGradientBoostingClassifier
model = HistGradientBoostingClassifier()
#fit on training data
model.fit(training_set_x, training_set_y)
#predictions
rf_predictions = model.predict(testing_set_x)
rf_probs = model.predict_proba(testing_set_x)[:, 1]
#check model accuracy, ROC AUC
print('Accuracy:',metrics.accuracy_score(testing_set_y, rf_predictions))
print('ROC AUC:', metrics.roc_auc_score(testing_set_y, rf_probs))

Accuracy: 0.9381165346281626
ROC AUC: 0.5484945444889355


In [54]:
rf_predictions, rf_probs

(array([0, 0, 0, ..., 0, 0, 0]),
 array([0.05272593, 0.06175827, 0.04870698, ..., 0.06506829, 0.06506829,
        0.06506829]))

In [47]:
#Gradient Boosting
model_gbm = lgb.LGBMClassifier(objective='binary', metric='binary_logloss',learning_rate=0.03)
#fit on training data
model_gbm.fit(training_set_x, training_set_y)
#predictions
gbm_predictions = model_gbm.predict(testing_set_x)
gbm_probs = model_gbm.predict_proba(testing_set_x)[:, 1]
#check model accuracy, ROC AUC
print('Accuracy:', metrics.accuracy_score(testing_set_y, gbm_predictions))
print('ROC AUC', metrics.roc_auc_score(testing_set_y, gbm_probs))

Accuracy: 0.9381165346281626
ROC AUC 0.5487704231177624


In [55]:
np.mean(gbm_predictions)
np.mean(gbm_probs)

0.06079528285407838

In [49]:
#XGB
model_xgb = XGBClassifier(objective ='binary:logistic', learning_rate=0.1)
#fit on training data
model_xgb.fit(training_set_x, training_set_y)
#predictions
xgb_predictions = model_xgb.predict(testing_set_x)
xgb_probs = model_xgb.predict_proba(testing_set_x)[:, 1]
#check model accuracy, ROC AUC
print('Accuracy:', metrics.accuracy_score(testing_set_y, xgb_predictions))
print('ROC AUC', metrics.roc_auc_score(testing_set_y, xgb_probs))

Accuracy: 0.9381165346281626
ROC AUC 0.547347812054044


In [50]:
np.mean(xgb_predictions)

0.0