In [None]:
from functionUtils import *
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold,RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder
from feature_selector import FeatureSelector
from tqdm import tqdm_notebook, tnrange
from scipy.stats import ks_2samp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import lightgbm as lgb
import os
# Silence every Python warning for the rest of the session.
# Note that this also hides deprecation notices from numpy / pandas / lightgbm.
warnings.filterwarnings("ignore")
# Directory prefix that the loading cells below prepend to feature CSV file names.
fea_path = './datasets/'
%matplotlib inline

###  I 数据加载

In [None]:
# Load the base card table and left-join two card-level feature tables onto it.
# `first_active_month` is read as text. The builtin `str` is used instead of the
# old `np.str` alias: the alias was only another name for `str` and is no longer
# available in current NumPy releases, where it raises AttributeError.
df_data = pd.read_csv(fea_path+'df_data.csv',dtype={'first_active_month':str})
df_train_test_additional_features = pd.read_csv(fea_path+'df_train_test_features_additional.csv')
df_additional_features = pd.read_csv(fea_path+'df_additional_features.csv')

# how='left' keeps every row of df_data even when a card_id has no match
# in the feature table (the new columns are NaN for those rows).
df_data = df_data.merge(df_train_test_additional_features,on='card_id',how='left')
df_data = df_data.merge(df_additional_features,on='card_id',how='left')

# The feature tables are no longer needed once merged; release them.
del df_additional_features,df_train_test_additional_features
gc.collect()

In [None]:
# Left-join every CSV found in the feature2 directory onto df_data by card_id.
path = './datasets/feature2/'
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the merge order (and therefore the resulting column order) reproducible.
# Entries that are not CSV files (e.g. checkpoint folders or hidden files) are
# skipped because read_csv cannot parse them.
sublist = sorted(name for name in os.listdir(path) if name.lower().endswith('.csv'))

for sub in sublist:
    df = pd.read_csv(path+sub)
    df_data = df_data.merge(df,on='card_id',how='left')
    # Release each table right after its merge. Deleting inside the loop also
    # avoids a NameError when the directory contains no CSV file at all.
    del df
gc.collect()

In [None]:
# Left-join df_tsne_fea.csv and then df_cate_statics.csv onto df_data by card_id.
# Each table is read immediately before its own merge.
for _csv_file in ('./datasets/df_tsne_fea.csv', './datasets/df_cate_statics.csv'):
    _extra = pd.read_csv(_csv_file)
    df_data = df_data.merge(_extra, on='card_id', how='left')

# Drop the temporaries so the merged tables can be garbage-collected.
del _csv_file, _extra
gc.collect()

In [None]:
# Read both cate_fm feature tables first, then left-join them onto df_data by
# card_id in the same order they were read.
_fm_tables = [
    pd.read_csv(_csv_file)
    for _csv_file in (
        './datasets/cate_fm/df_hist_new_cate_merchantCate_fm_feat.csv',
        './datasets/cate_fm/df_hist_new_cate_merchant_fm_feat.csv',
    )
]
for _fm_table in _fm_tables:
    df_data = df_data.merge(_fm_table, on='card_id', how='left')

# Drop the temporaries so the merged tables can be garbage-collected.
del _fm_tables, _fm_table
gc.collect()

In [None]:
# Read the three tables (statics, vec, city), then left-join them onto df_data by
# card_id. The merge order is vec -> statics -> city, which differs from the read
# order and determines the order of the appended columns.
_merchant_statics, _merchant_vec, _city_statics = (
    pd.read_csv(fea_path + _csv_name)
    for _csv_name in (
        'df_card_merchant_statics.csv',
        'df_card_merchant_vec.csv',
        'df_card_city_statics.csv',
    )
)
for _table in (_merchant_vec, _merchant_statics, _city_statics):
    df_data = df_data.merge(_table, on='card_id', how='left')

# Drop the temporaries so the merged tables can be garbage-collected.
del _merchant_statics, _merchant_vec, _city_statics, _table
gc.collect()

In [None]:
# Read four more card-level feature tables, then left-join them onto df_data by
# card_id in the order listed.
# Two further tables (df_card_merchant_vec1.csv, df_f1_f2_f3_vec.csv) were loaded
# here at some point and are currently disabled.
_feature_tables = [
    pd.read_csv(fea_path + _csv_name)
    for _csv_name in (
        'df_nmf_card_merCate_features.csv',
        'df_nmf_card_city_features.csv',
        'df_card_merchant_features.csv',
        'df_cardid_vec.csv',
    )
]
for _feature_table in _feature_tables:
    df_data = df_data.merge(_feature_table, on='card_id', how='left')

# Drop the temporaries so the merged tables can be garbage-collected.
del _feature_tables, _feature_table
gc.collect()

In [None]:
# Split the combined table into its train (is_test == 0) and test (is_test == 1) rows.
df_train, df_test = (df_data[df_data['is_test'] == _flag] for _flag in (0, 1))

In [None]:
# Show (rows, columns) of the train and test frames as the cell output.
(df_train.shape, df_test.shape)

In [None]:
# Column(s) used as the training label by the model cells below.
label = ['target']
# Flag rows whose target is below -30. The K-fold splitters stratify on this flag.
# Builtin `int` replaces the old `np.int` alias, which was only another name for
# `int` and is no longer available in current NumPy releases.
df_train['is_outlier'] = (df_train.target<-30).astype(int)
# Identifier, label and bookkeeping columns that must never be used as features.
dropCols = ['card_id','first_active_month','is_outlier','is_test','target','purchase_date','merchant_id']
# Candidate features: every remaining column that is not of object dtype.
tr_features = [_f for _f in df_train.columns if _f not in dropCols and df_train[_f].dtype!='object']
print(len(tr_features))

### Feature Selection

In [None]:
# Run the FeatureSelector checks on the current candidate features, using the raw
# target as labels, and keep only the columns that survive all of them.
fs = FeatureSelector(
    data=df_train[tr_features],
    labels=df_train['target'].values,
)

# The checks are run in this fixed order; `remove(methods='all')` below applies
# every check that has been run.
fs.identify_missing(missing_threshold=0.12)
fs.identify_collinear(correlation_threshold=0.98)
fs.identify_zero_importance(
    task='regression',
    eval_metric='rmse',
    n_iterations=10,
    early_stopping=True,
)
fs.identify_low_importance(cumulative_importance=0.99)
fs.identify_single_unique()

tr_removed = fs.remove(methods='all')

# The surviving column names become the new feature list.
tr_features = tr_removed.columns.tolist()

len(tr_features)

### II 模型训练

#### regression model

In [None]:
%%time

def modelKFoldReg(df_train,df_test,model,features=None,target_col='target'):
    """Fit ``model`` with 5-fold stratified CV and return out-of-fold predictions.

    Folds are stratified on ``df_train['is_outlier']``. In every fold the model is
    fitted on the development rows with early stopping monitored on the
    validation rows, then used to predict both the validation rows and the whole
    test set. The test prediction is the mean over the five fold models.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Must contain ``is_outlier``, ``target_col`` and every
        feature column.
    df_test : pandas.DataFrame
        Rows to predict. Must contain every feature column.
    model : estimator
        Object whose ``fit`` accepts ``eval_set``, ``early_stopping_rounds`` and
        ``verbose`` and which exposes ``best_iteration_`` afterwards (the
        notebook passes ``lgb.LGBMRegressor``).
        NOTE(review): newer LightGBM releases may expect early stopping and
        logging to be passed as callbacks instead - confirm against the
        installed version.
    features : list of str, optional
        Feature columns. Defaults to the notebook-level ``tr_features``.
    target_col : str, optional
        Label column. It is an explicit argument so that the result no longer
        depends on the notebook-level ``label`` variable, which a later cell
        rebinds to the ranking label.

    Returns
    -------
    tuple
        ``(model, score, oof_test_pred, oof_train_pred)`` where ``model`` is the
        estimator as fitted on the last fold, ``score`` is the RMSE of the
        out-of-fold predictions against ``target_col``, ``oof_test_pred`` is the
        fold-averaged test prediction and ``oof_train_pred`` holds one
        out-of-fold prediction per training row.
    """
    if features is None:
        features = tr_features

    NFOLDS = 5
    # shuffle=False keeps the original contiguous fold assignment. random_state
    # is not passed: it has no effect without shuffling, and current
    # scikit-learn raises ValueError for that combination.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))

    # Slice the feature/label views once instead of once per fold.
    x_train = df_train[features]
    y_train = df_train[target_col]
    x_test = df_test[features]

    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        print("............第%s折..........."%(foldIndex+1))
        x_dev, y_dev = x_train.iloc[dev_index], y_train.iloc[dev_index]
        x_val, y_val = x_train.iloc[val_index], y_train.iloc[val_index]
        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],
                  early_stopping_rounds=100,verbose=100)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test,num_iteration=model.best_iteration_)
        oof_train_pred[val_index] = model.predict(x_val,num_iteration=model.best_iteration_)

    # Average the per-fold test predictions.
    oof_test_pred = oof_test_pred_skf.mean(axis=0)

    score = np.sqrt((np.sum(np.square(oof_train_pred - y_train.values.reshape(-1,)))/ntrain))
    return model,score,oof_test_pred,oof_train_pred

# Hyper-parameters passed to lgb.LGBMRegressor for the K-fold regression run.
lgb_params = dict(
    objective='regression',
    metric='rmse',
    n_estimators=2000,
    learning_rate=0.01,
    subsample=0.78,
    max_depth=8,
    top_rate=0.906,
    num_leaves=63,
    min_child_weight=41.9612,
    other_rate=0.072,
    reg_alpha=9.677,
    colsample_bytree=0.566,
    min_split_gain=8.820,
    reg_lambda=9.253,
    min_data_in_leaf=21,
    verbose=-1,
    seed=20,
    bagging_seed=42,
    device='gpu',
    gpu_platform_id=1,
    gpu_device_id=1,
)

# Sample weights (currently disabled):
# w_train = (0.05 * (df_train['target'].values < -30).astype('float32') + 1).ravel()

lgb_est = lgb.LGBMRegressor(**lgb_params)
lgb_est, score, lgb_test_pred, lgb_train_pred = modelKFoldReg(df_train, df_test, lgb_est)

# Submission export (currently disabled):
# df_test['target'] = lgb_test_pred
# df_sub = df_test[['card_id','target']]
# # df_sub.to_csv('./submission/df_lgb_sub_%.5f.csv'%score,index=None)
# print(df_sub.shape)

In [None]:
# Report the out-of-fold RMSE returned by modelKFoldReg.
print("score = %s" % (score,))

In [None]:
# Write the level-1 stacking inputs: card_id plus the LightGBM prediction, with the
# CV score embedded in both the column name and the file name. The prediction column
# is attached to a card_id-only copy, so df_train / df_test are left unchanged.
_oof_col = 'oof_lgb_pred_%.5f'%score
_train_out = df_train[['card_id']].assign(**{_oof_col: lgb_train_pred})
_test_out = df_test[['card_id']].assign(**{_oof_col: lgb_test_pred})

_train_out.to_csv('./datasets/stacking/level1/df_lgb_train_pred_%.5f.csv'%score,index=False)
_test_out.to_csv('./datasets/stacking/level1/df_lgb_test_pred_%.5f.csv'%score,index=False)

del _train_out, _test_out

In [None]:
# Plot the importance of up to 150 features of the fitted LightGBM model on a
# tall figure so the feature names stay readable.
fig, ax = plt.subplots(figsize=(12, 28))
lgb.plot_importance(lgb_est, ax=ax, height=0.8, max_num_features=150)
ax.grid(False)
# `ax` is the only axes of the figure, so setting its title directly is the same
# as titling the current axes.
ax.set_title("LGBM - Feature Importance", fontsize=10)
plt.show()

In [None]:
# Pair each training feature with its importance in the fitted model and order the
# table from most to least important.
fea_importance = lgb_est.feature_importances_
df_features = (
    pd.DataFrame({'features': tr_features, 'importance': fea_importance})
    .sort_values(by=['importance'], ascending=False)
)

In [None]:
# Display the feature-importance table (sorted by importance, highest first).
df_features

In [None]:
# Remove every feature whose importance is below 5 from the feature list and from
# the combined table.
# The cell is safe to re-run: features are filtered rather than removed one by one,
# so a second execution finds nothing left to drop instead of raising ValueError
# (list.remove) or KeyError (DataFrame.drop) for columns that are already gone.
_low_importance = set(df_features[df_features.importance<5].features.tolist())
dropimp = [col for col in tr_features if col in _low_importance]
# Slice assignment keeps the same list object, matching the in-place removal
# performed before.
tr_features[:] = [col for col in tr_features if col not in _low_importance]

df_data.drop(columns=dropimp,inplace=True,errors='ignore')
print(len(tr_features))
gc.collect()

In [None]:
# Force another garbage-collection pass before the ranker experiment.
gc.collect()

#### Ranker model

In [None]:
import xgboost  as xgb
def modelKFoldRanker(df_train,df_test,model,features=None,label_col='ranker'):
    """Fit a ranking ``model`` with 5-fold stratified CV and return its predictions.

    Folds are stratified on ``df_train['is_outlier']``. Rows are assigned to one
    of two queries depending on whether ``target > -33``. Inside every fold the
    rows are reordered so that each query is contiguous, and the query sizes are
    passed as ``group`` / ``eval_group``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Must contain ``target``, ``is_outlier``, ``label_col`` and
        every feature column.
    df_test : pandas.DataFrame
        Rows to predict. Must contain every feature column.
    model : estimator
        Ranker whose ``fit`` accepts ``group``, ``eval_set``, ``eval_group``,
        ``early_stopping_rounds`` and ``verbose`` and which exposes
        ``best_iteration_`` afterwards (the notebook builds ``lgb.LGBMRanker``).
        NOTE(review): LightGBM ranking objectives are believed to require small
        integer relevance labels - confirm that ``label_col`` satisfies this
        before relying on the function.
    features : list of str, optional
        Feature columns. Defaults to the notebook-level ``tr_features``.
    label_col : str, optional
        Ranking label column. It is an explicit argument instead of the
        notebook-level ``label`` variable.

    Returns
    -------
    tuple
        ``(oof_test_pred, oof_train_pred)``: the fold-averaged test prediction
        and one out-of-fold prediction per training row (in ``df_train`` row
        order).
    """
    if features is None:
        features = tr_features

    NFOLDS = 5
    # random_state is not passed: it has no effect with shuffle=False, and
    # current scikit-learn raises ValueError for that combination.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
    ntrain = df_train.shape[0]
    ntest = df_test.shape[0]
    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))

    x_train = df_train[features]
    y_train = df_train[label_col]
    x_test = df_test[features]
    # Query id per training row (0 or 1). Builtin int replaces the old np.int
    # alias, which is no longer available in current NumPy releases.
    query_id = (df_train['target']>-33).astype(int).values

    def _group_rows(index):
        """Reorder ``index`` so each query is contiguous; also return the query sizes."""
        index = np.asarray(index)
        # mergesort is stable, so rows keep their relative order inside a query.
        ordered = index[np.argsort(query_id[index], kind='mergesort')]
        sizes = np.bincount(query_id[ordered], minlength=2)
        return ordered, [int(size) for size in sizes if size > 0]

    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        print("............第%s折..........."%(foldIndex+1))
        # `group` must list the number of rows of each query, and the sizes must
        # add up to the number of rows passed to fit. A per-row 0/1 indicator
        # (what was passed before) does not satisfy that.
        dev_index, q_dev = _group_rows(dev_index)
        val_index, q_val = _group_rows(val_index)
        x_dev, y_dev = x_train.iloc[dev_index], y_train.iloc[dev_index]
        x_val, y_val = x_train.iloc[val_index], y_train.iloc[val_index]

        print('x_dev'+str(x_dev.shape))
        print('x_val'+str(x_val.shape))
        print('q_dev ='+str(q_dev))
        print('q_val = '+str(q_val))

        model.fit(x_dev, y_dev,group=q_dev,eval_set=[(x_val,y_val)],eval_group=[q_val],
                  early_stopping_rounds=100,verbose=100)
        oof_test_pred_skf[foldIndex,:] = model.predict(x_test,num_iteration=model.best_iteration_)
        # val_index was reordered together with x_val, so predictions still land
        # on the rows they belong to.
        oof_train_pred[val_index] = model.predict(x_val,num_iteration=model.best_iteration_)

    # Average the per-fold test predictions.
    oof_test_pred = oof_test_pred_skf.mean(axis=0)
    return oof_test_pred,oof_train_pred

# Hyper-parameters for the LightGBM ranker experiment.
ranker_params = {
    'max_depth':8,
    'learning_rate':0.01,
    'n_estimators':2000,
    # LightGBM ranking objective. The previous value 'rank:pairwise' is an
    # XGBoost objective name that LightGBM does not recognise.
    'objective':'lambdarank',
    # NOTE(review): class_weight is kept as it was; confirm it is meaningful for
    # a ranking label.
    'class_weight':'balanced',
    'subsample':0.7,
    'random_state':42,
    'min_child_weight': 41.9612,
    'other_rate': 0.072,
    'reg_alpha': 9.677,
    'colsample_bytree': 0.566,
    'min_split_gain': 8.820,
    'reg_lambda':9.253,
    'min_data_in_leaf': 21,
    'verbose': -1,
    'seed':20,
    'bagging_seed':42,
    'device': 'gpu',
    'gpu_platform_id':1,
    'gpu_device_id': 1,
}
# From here on the notebook-level label is the ranking label.
label = ['ranker']
# Ranking label: position of each row's target among all training targets
# (ties share the lowest rank).
# NOTE(review): LightGBM ranking objectives are believed to require small integer
# relevance labels; a dense rank over all rows is probably out of range - confirm
# and bin the label if needed.
df_train['ranker'] = df_train['target'].rank(method='min')

# Query id: 1 for rows with target < -33, 0 otherwise. Builtin int replaces the old
# np.int alias, which is no longer available in current NumPy releases.
df_train['qid'] = (df_train['target']<-33).astype(int)

# LightGBM needs the rows of each query to be contiguous and `group` to list the
# query sizes in that order. The previous code discarded the result of sort_values
# and hard-coded the sizes, so the sizes did not describe the row order actually
# passed to fit. A stable row order and the sizes are now both computed from the
# data; df_train itself is not reordered, so later cells are unaffected.
_rank_order = np.argsort(df_train['qid'].values, kind='mergesort')
q_dev = [int(size) for size in np.bincount(df_train['qid'].values) if size > 0]

lgb_ranker = lgb.LGBMRanker(**ranker_params)

# No early stopping here: this fit uses all training rows and has no validation
# set, and LightGBM's early stopping needs at least one evaluation set.
lgb_ranker.fit(df_train[tr_features].iloc[_rank_order],
               df_train[label[0]].iloc[_rank_order],
               group=q_dev)

# oof_test_pred,oof_train_pred = modelKFoldRanker(df_train,df_test,lgb_ranker)


In [None]:
# Show the LGBMRanker documentation. The class is reached through the `lgb` module
# alias because this notebook does not import the bare name `LGBMRanker` itself.
help(lgb.LGBMRanker)

#### 参数优化

In [None]:
import optuna

def objective(trial):
    """Optuna objective: out-of-fold RMSE of an LGBMRegressor on ``target``.

    A parameter set is sampled from ``trial``, one ``lgb.LGBMRegressor`` is built
    from it and fitted on each of 5 folds stratified on
    ``df_train['is_outlier']``. The value returned to Optuna is the RMSE of the
    out-of-fold predictions against ``df_train['target']``.

    Reads the notebook-level ``df_train`` and ``tr_features``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Out-of-fold RMSE.
    """
    NFOLDS = 5
    # random_state is not passed: it has no effect with shuffle=False, and
    # current scikit-learn raises ValueError for that combination.
    kfold = StratifiedKFold(n_splits=NFOLDS,shuffle=False)
    ntrain = df_train.shape[0]
    oof_train_pred = np.zeros((ntrain,))

    # Parameters that stay fixed for every trial. top_rate / other_rate are
    # replaced by sampled values below when the boosting type is 'goss'.
    lgb_params ={
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
        'n_estimators':2000,
        'top_rate': 0.9232941199074832,
        'num_leaves': 63,
        'other_rate': 0.057739300172860754,
        'colsample_bytree': 0.566,
        'min_split_gain': 8.820,
        'min_data_in_leaf': 21,
        'verbose': -1,
        'seed':20,
        'bagging_seed':42,
        'device': 'gpu',
        'gpu_platform_id':1,
        'gpu_device_id': 1,
    }
    if lgb_params['boosting_type'] == 'dart':
        lgb_params['drop_rate'] = trial.suggest_loguniform('drop_rate', 1e-8, 1.0)
        lgb_params['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
    if lgb_params['boosting_type'] == 'goss':
        lgb_params['top_rate'] = trial.suggest_uniform('top_rate', 0.0, 1.0)
        # Upper bound keeps top_rate + other_rate <= 1.
        lgb_params['other_rate'] = trial.suggest_uniform('other_rate', 0.0, 1.0 - lgb_params['top_rate'])
    # Parameters sampled on every trial. The fixed placeholder values that used to
    # sit in the dict above were always overwritten here, so they were removed.
    lgb_params['learning_rate'] = trial.suggest_uniform('learning_rate',0.01,0.05)
    lgb_params['subsample'] = trial.suggest_uniform('subsample',0.5,1.0)
    lgb_params['max_depth'] = trial.suggest_int('max_depth',5,12)
    lgb_params['min_child_weight'] =trial.suggest_uniform('min_child_weight',35,50)
    lgb_params['reg_alpha'] =trial.suggest_uniform('reg_alpha',5,20)
    lgb_params['reg_lambda'] =trial.suggest_uniform('reg_lambda',5,20)

    lgb_est = lgb.LGBMRegressor(**lgb_params)

    # The label is named explicitly. The notebook-level `label` variable is
    # rebound to ['ranker'] by the ranker cell that runs before this one, which
    # made the model train on, and the "rmse" be measured against, the rank
    # column instead of the real target.
    x_train = df_train[tr_features]
    y_train = df_train['target']

    for foldIndex,(dev_index,val_index) in enumerate(kfold.split(df_train,df_train['is_outlier'])):
        print("............第%s折..........."%(foldIndex+1))
        x_dev, y_dev = x_train.iloc[dev_index], y_train.iloc[dev_index]
        x_val, y_val = x_train.iloc[val_index], y_train.iloc[val_index]
        lgb_est.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],
                  early_stopping_rounds=100,verbose=100)
        oof_train_pred[val_index] = lgb_est.predict(x_val,num_iteration=lgb_est.best_iteration_)

    rmse = np.sqrt((np.sum(np.square(oof_train_pred - y_train.values.reshape(-1,)))/ntrain))

    return rmse

# Run the hyper-parameter search and print a summary of the best trial.
if __name__ == '__main__':
    # No direction is passed, so Optuna's default applies
    # (NOTE(review): presumably minimisation, which is what an RMSE objective needs - confirm).
    study = optuna.create_study()
    # Each trial runs the full 5-fold training in `objective`; only 5 trials are run.
    study.optimize(objective, n_trials=5)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    # Objective value (out-of-fold RMSE) of the best trial.
    print('  Value: {}'.format(trial.value))

    # Sampled hyper-parameters of the best trial, one per line.
    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))