In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt 
import datetime

In [18]:
from sklearn.metrics import fbeta_score,roc_auc_score,precision_score,recall_score

In [6]:
class Config:
    pass
config = pd.read_pickle('config.pkl')
data_path = config.data_path
feature_path = config.feature_path
normalized_path = config.normalized_path

In [7]:
to_stack_path = '../../kaggleData/JD_logging/to_stack/'

### build feature for tree based models

In [4]:
features =   (config.feature_dict['trade_detail_feature']+
              config.feature_dict['recent_login_detail']+
              config.feature_dict['trade_and_recent_login_comparing']+
              config.feature_dict['login_trade_hist_stats']+
              config.feature_dict['llc_user_habbit']+
             config.feature_dict['hcc_user_habbit']+
              config.feature_dict['hcc_properties']+
            config.feature_dict['hcc_target_encoding']+
                    config.feature_dict['login_detail_new_features']+
              config.feature_dict['hcc_trade_properties']+
              config.feature_dict['hcc_mult_target_encoding']+
              config.feature_dict['hcc_user_trade_habbit'])

feature_sequence_list = []
for feature in features:
    feature_sequence_list.append(pd.read_pickle(feature_path+feature+'.pkl').reshape(-1,1))
    
trade_tt_mat = np.hstack(feature_sequence_list)
trade_tt_train = trade_tt_mat[config.train_2_6_index]
trade_tt_test =  trade_tt_mat[config.test_7_index]

validation_tuple_list = config.single_module_validation_indice_set
train_labels = pd.read_pickle(data_path+'trade_train_label.pkl')[config.train_2_6_index]

In [8]:
feature_sequence_list = []
for feature in config.normalized_features:
    feature_sequence_list.append(pd.read_pickle(normalized_path+feature+'.pkl').reshape(-1,1))
    
trade_tt_mat_normal = np.hstack(feature_sequence_list)
trade_tt_train_normal = trade_tt_mat[config.train_2_6_index]
trade_tt_test_normal =  trade_tt_mat[config.test_7_index]

In [58]:
trade_tt_df = pd.DataFrame(trade_tt_mat_normal[config.train_2_6_index+config.test_7_index],columns = config.normalized_features)

In [12]:
from sklearn.model_selection import KFold

In [60]:
KF = KFold(5,shuffle = True, random_state = 233)

In [11]:
#models
from mochi import runLGBM
from mochi import runXGB
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neighbors import KNeighborsClassifier 

In [62]:
for train_index, stack_index in KF.split(trade_tt_train_normal):
    
    dev_X = trade_tt_train[train_index]
    dev_y = train_labels.iloc[train_index]
    val_X = trade_tt_train[stack_index]
    
    dev_X_n = trade_tt_train_normal[train_index]
    val_X_n = trade_tt_train_normal[stack_index]
    
    #LGBM
    preds, _ = runLGBM(dev_X, dev_y, val_X, None,feature_names=None,verbose=100,eta=0.02,
                              early_stop=None,num_rounds=390,watch_dict=None,feval = None,
                              bagging_fraction=0.75,feature_fraction=0.25,num_leaves=64)
    
    trade_tt_df.loc[stack_index,'lgbm_stack'] = preds
    
    #XGB
    preds, _ = runXGB(dev_X, dev_y, val_X, feature_names=None,verbose_eval=100,eta=0.02,
                          early_stop=None,num_rounds=365,cv_dict=None,max_depth = 6,
                      subsample = 0.75,colsample_bytree = 0.25)
    
    trade_tt_df.loc[stack_index,'xgb_stack'] = preds
    
    #LogisticRegression
    logR = LogisticRegression(class_weight = 'balanced',n_jobs=7)
    logR.fit(dev_X_n,dev_y)
    trade_tt_df.loc[stack_index,'logR_stack'] = logR.predict(val_X_n)
    
    #LinearRegression
    liR = LinearRegression(n_jobs=7)
    liR.fit(dev_X_n,dev_y)
    trade_tt_df.loc[stack_index,'liR_stack'] = liR.predict(val_X_n)
    
    #RandomForestClassifier
    classifier = RFC(200,class_weight='balanced',random_state =33,n_jobs = -1,max_depth = None,
                        max_features = 'log2')
    classifier.fit(dev_X,dev_y)
    trade_tt_df.loc[stack_index,'rf_stack'] = classifier.predict(val_X)
    
    #KNeighborsClassifier
    kn_3 = KNeighborsClassifier(3,n_jobs=7)
    kn_5 = KNeighborsClassifier(5,n_jobs=7)
    
    kn_3.fit(dev_X_n,dev_y)
    kn_5.fit(dev_X_n,dev_y)
    
    trade_tt_df.loc[stack_index,'kn3_stack'] = kn_3.predict(val_X_n)
    trade_tt_df.loc[stack_index,'kn5_stack'] = kn_5.predict(val_X_n)

  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))
  " = {}.".format(self.n_jobs))


### predicting the test stack features

In [65]:
    dev_X = trade_tt_train
    dev_y = train_labels
    val_X = trade_tt_test
    
    dev_X_n = trade_tt_train_normal
    val_X_n = trade_tt_test_normal
    
    #LGBM
    preds, _ = runLGBM(dev_X, dev_y, val_X, None,feature_names=None,verbose=100,eta=0.02,
                              early_stop=None,num_rounds=390,watch_dict=None,feval = None,
                              bagging_fraction=0.75,feature_fraction=0.25,num_leaves=64)
    
    trade_tt_df.loc[config.test_start_stacking:,'lgbm_stack'] = preds
    
    #XGB
    preds, _ = runXGB(dev_X, dev_y, val_X, feature_names=None,verbose_eval=100,eta=0.02,
                          early_stop=None,num_rounds=365,cv_dict=None,max_depth = 6,
                      subsample = 0.75,colsample_bytree = 0.25)
    
    trade_tt_df.loc[config.test_start_stacking:,'xgb_stack'] = preds
    
    #LogisticRegression
    logR = LogisticRegression(class_weight = 'balanced',n_jobs=7)
    logR.fit(dev_X_n,dev_y)
    trade_tt_df.loc[config.test_start_stacking:,'logR_stack'] = logR.predict(val_X_n)
    
    #LinearRegression
    liR = LinearRegression(n_jobs=7)
    liR.fit(dev_X_n,dev_y)
    trade_tt_df.loc[config.test_start_stacking:,'liR_stack'] = liR.predict(val_X_n)
    
    #RandomForestClassifier
    classifier = RFC(200,class_weight='balanced',random_state =33,n_jobs = -1,max_depth = None,
                        max_features = 'log2')
    classifier.fit(dev_X,dev_y)
    trade_tt_df.loc[config.test_start_stacking:,'rf_stack'] = classifier.predict(val_X)
    
    #KNeighborsClassifier
    kn_3 = KNeighborsClassifier(3,n_jobs=7)
    kn_5 = KNeighborsClassifier(5,n_jobs=7)
    
    kn_3.fit(dev_X_n,dev_y)
    kn_5.fit(dev_X_n,dev_y)
    
    trade_tt_df.loc[config.test_start_stacking:,'kn3_stack'] = kn_3.predict(val_X_n)
    trade_tt_df.loc[config.test_start_stacking:,'kn5_stack'] = kn_5.predict(val_X_n)

  " = {}.".format(self.n_jobs))


### save the meta features

In [74]:
supervised_stack_set_1 = ['lgbm_stack','xgb_stack','logR_stack','liR_stack','rf_stack','kn3_stack','kn5_stack']

In [75]:
#new_trade_tt_df
for feature in supervised_stack_set_1:
    pd.to_pickle(trade_tt_df[feature],to_stack_path+feature+'.pkl')

In [3]:
class Config:
    pass
config = pd.read_pickle('config.pkl')

NameError: name 'pd' is not defined

In [77]:
config.to_stack_feature_dict.keys()

dict_keys(['3sigma_detect', 'tukey_detect', 'unsupervised_detect'])

In [80]:
config.to_stack_feature_dict.keys()

dict_keys(['3sigma_detect', 'tukey_detect', 'unsupervised_detect', 'supervised_stack_set_1'])

In [79]:
config.to_stack_feature_dict['supervised_stack_set_1'] = supervised_stack_set_1

In [81]:
pd.to_pickle(config,'config.pkl')

### combining the meta features with unsupervised ones

In [8]:
features =   (config.to_stack_feature_dict['supervised_stack_set_1']+
              config.to_stack_feature_dict['3sigma_detect']+
              config.to_stack_feature_dict['tukey_detect']+
              config.to_stack_feature_dict['unsupervised_detect'])
feature_sequence_list = []
for feature in features:
    feature_sequence_list.append(pd.read_pickle(to_stack_path+feature+'.pkl').reshape(-1,1))
    
trade_tt_mat = np.hstack(feature_sequence_list)
#trade_tt_mat[trade_tt_mat==-10]=np.nan

#validation_tuple_list = config.single_module_validation_indice_set
train_labels = pd.read_pickle(data_path+'trade_train_label.pkl')[config.train_2_6_index]

  import sys


In [9]:
trade_train_mat = trade_tt_mat[:config.test_start_stacking]
trade_test_mat = trade_tt_mat[config.test_start_stacking:]

In [19]:
val_KF = KFold(5,shuffle = True, random_state = 459)

cv_scores = []
cv_result = []
recall_scroes = []
precision_scores = []
models = []
preds_list = []
val_list = []

i=0
         
for train_index, stack_index in val_KF.split(trade_train_mat):
    result_dict = {}
    
    dev_X = trade_train_mat[train_index]
    dev_y = train_labels.iloc[train_index]
    val_X = trade_train_mat[stack_index]
    val_y = train_labels.iloc[stack_index]
    
    preds, model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=features,verbose_eval=100,eta=0.02,
                          early_stop=20,num_rounds=5000,cv_dict=result_dict,max_depth = 4,
                      subsample = 0.75,colsample_bytree = 0.75)
    
    result_f_beta  = fbeta_score( val_y,preds > 0.5, 0.1)
        
    cv_scores.append(result_f_beta)
    recall_scroes.append(recall_score(val_y,preds > 0.5))
    precision_scores.append(precision_score(val_y,preds > 0.5))
    preds_list.append(preds)
    val_list.append(val_y)
    
    cv_result.append(result_dict)
    models.append(model)
    i+=1
    print('f_beta score for the turn '+str(i)+' is '+str(result_f_beta))
    
print('The mean of the cv_scores is:',np.mean(cv_scores))


[0]	train-auc:0.972366	test-auc:0.97637
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 20 rounds.
Stopping. Best iteration:
[25]	train-auc:0.992566	test-auc:0.992743

f_beta score for the turn 1 is 0.923888231668
[0]	train-auc:0.978622	test-auc:0.97499
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 20 rounds.
[100]	train-auc:0.993839	test-auc:0.991608
Stopping. Best iteration:
[105]	train-auc:0.993856	test-auc:0.991656

f_beta score for the turn 2 is 0.909813100684
[0]	train-auc:0.974116	test-auc:0.974984
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 20 rounds.
Stopping. Best iteration:
[47]	train-auc:0.991587	test-auc:0.994734

f_beta score for the turn 3 is 0.916276111409
[0]	train-auc:0.982446	test-auc:0.981177
Multiple eval metrics have

In [20]:
recall_scroes

[0.8897893030794165,
 0.85521885521885521,
 0.84161490683229812,
 0.83614864864864868,
 0.84868421052631582]

In [21]:
precision_scores

[0.9242424242424242,
 0.91039426523297495,
 0.91708967851099832,
 0.90000000000000002,
 0.91166077738515905]

In [22]:
cv_scores

[0.92388823166769407,
 0.90981310068447008,
 0.91627611140867704,
 0.89932004604979143,
 0.91099146972451417]

### Test other threshold

In [44]:
new_recall_scores = []
new_precision_scores = []
new_f_beta = []

thres = 0.65
for i in range(5):
    new_f_beta.append(fbeta_score(val_list[i],preds_list[i] > thres, 0.1))
    new_precision_scores.append(precision_score(val_list[i],preds_list[i] > thres))
    new_recall_scores.append(recall_score(val_list[i],preds_list[i] > thres))

In [47]:
new_f_beta

[0.95141972816579201,
 0.92976474599386294,
 0.94926219783479171,
 0.96523938911271434,
 0.93187467048278982]

### Find best iteration

In [49]:
def f_beta_01_xgb(preds, train_data, threshold = 0.65):
    labels  = train_data.get_label()
    return 'fbeta_score_01',fbeta_score(labels, preds > threshold,0.1)

In [50]:
cv_scores = []
cv_result = []
models = []
i = 0

for train_index, stack_index in val_KF.split(trade_train_mat):
    result_dict = {}
    
    dev_X = trade_train_mat[train_index]
    dev_y = train_labels.iloc[train_index]
    val_X = trade_train_mat[stack_index]
    val_y = train_labels.iloc[stack_index]
    
    
    preds, model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=features,verbose_eval=100,eta=0.02,
                          early_stop=None,num_rounds=200,cv_dict=result_dict,max_depth = 4,
                      subsample = 0.75,colsample_bytree = 0.75,feval = f_beta_01_xgb)
    """
    preds, model = runXGB(dev_X, dev_y, val_X, val_y,feature_names=top_features,verbose_eval=100,eta=0.02,
                          early_stop=None,num_rounds=500,cv_dict=result_dict,feval = f_beta_01_xgb ,max_depth =4)
     """
    
    #result_f_beta = f_beta_01(val_y.values, preds>0.5)
    result_f_beta  = fbeta_score( val_y,preds > 0.65, 0.1)
    
    cv_scores.append(result_f_beta)
    cv_result.append(result_dict)
    models.append(model)
    i+=1
    print('f_beta score for the turn '+str(i)+' is '+str(result_f_beta))

print('The mean of the cv_scores is:',np.mean(cv_scores))

[0]	train-auc:0.972366	test-auc:0.97637	train-fbeta_score_01:0	test-fbeta_score_01:0


  'precision', 'predicted', average, warn_for)


[100]	train-auc:0.993571	test-auc:0.993394	train-fbeta_score_01:0.950444	test-fbeta_score_01:0.940808
[199]	train-auc:0.994391	test-auc:0.993283	train-fbeta_score_01:0.943472	test-fbeta_score_01:0.941787
f_beta score for the turn 1 is 0.941787426792
[0]	train-auc:0.978622	test-auc:0.97499	train-fbeta_score_01:0	test-fbeta_score_01:0
[100]	train-auc:0.993839	test-auc:0.991608	train-fbeta_score_01:0.943649	test-fbeta_score_01:0.934792
[199]	train-auc:0.994347	test-auc:0.991253	train-fbeta_score_01:0.940385	test-fbeta_score_01:0.927348
f_beta score for the turn 2 is 0.927347562111
[0]	train-auc:0.974116	test-auc:0.974984	train-fbeta_score_01:0	test-fbeta_score_01:0
[100]	train-auc:0.993053	test-auc:0.995329	train-fbeta_score_01:0.941708	test-fbeta_score_01:0.93905
[199]	train-auc:0.993428	test-auc:0.995355	train-fbeta_score_01:0.938708	test-fbeta_score_01:0.934193
f_beta score for the turn 3 is 0.934192840973
[0]	train-auc:0.982446	test-auc:0.981177	train-fbeta_score_01:0	test-fbeta_score

In [51]:
#finding the best iteration
pd_list = []
for dic in cv_result:
    pd_list.append(pd.DataFrame(dic['test']))
    
for i in range(len(pd_list)):
    pd_list[i].columns = pd_list[i].columns+'_'+str(i)
validation_result = pd.concat(pd_list,axis = 1)
validation_result['auc_avg'] = validation_result.apply(lambda x : np.mean([x.auc_0,x.auc_1,x.auc_2,x.auc_3,x.auc_4]),axis = 1)

validation_result['fbeta_avg'] = validation_result.apply(lambda x : np.mean([x.fbeta_score_01_0,x.fbeta_score_01_1,
                                                                     x.fbeta_score_01_2,x.fbeta_score_01_3,
                                                                    x.fbeta_score_01_4]),axis=1)

print(validation_result['auc_avg'].idxmax())
print(validation_result['fbeta_avg'].idxmax())

122
18


### generate result

In [54]:
train_X = trade_train_mat
test_X = trade_test_mat
train_y = train_labels

preds, model = runXGB(train_X, train_y, test_X,feature_names=None,verbose_eval=100,eta=0.02,
                          early_stop=None,num_rounds=100,cv_dict=result_dict,max_depth = 4,
                      subsample = 0.75,colsample_bytree = 0.75,feval = f_beta_01_xgb)

result_path = '../../kaggleData/JD_logging/result/'
test_rowkey = pd.read_pickle(data_path+'trade_test_rowkey.pkl')
pred_label = pd.Series(preds > 0.8)
result_set = pd.DataFrame(test_rowkey)
result_set['is_risk'] = pred_label.astype(int)

print(result_set.shape)

pd.to_pickle(pred_label,result_path+'stacking_sigma.pkl')
result_set.to_csv(result_path+'stacking_sigma.csv',index=False,header=False)

(17875, 2)
