In [None]:
from tqdm import tqdm_notebook, tnrange
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import BayesianRidge,LinearRegression
from h2o.automl import H2OAutoML
from h2o.estimators.xgboost import H2OXGBoostEstimator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import h2o
import os
# Directory prefix used when reading the input CSV files in the cells below.
DATA_PATH = './datasets/'
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

h2o.init()#start the h2o cluster

# IPython line magic configuring matplotlib for this session (no backend argument given).
%matplotlib

In [None]:
# Load the base frame. first_active_month is forced to string on read.
# `np.str` was only an alias of the builtin `str` and was removed in NumPy 1.24,
# so the builtin is used directly.
df_data = pd.read_csv(DATA_PATH+'df_data.csv',dtype={'first_active_month':str})
df_train_test_additional_features = pd.read_csv(DATA_PATH+'df_train_test_features_additional.csv')
df_additional_features = pd.read_csv(DATA_PATH+'df_additional_features.csv')

# Left-join the additional feature tables onto the base frame by card_id.
df_data = df_data.merge(df_train_test_additional_features,on='card_id',how='left')
df_data = df_data.merge(df_additional_features,on='card_id',how='left')

# Left-join every file found in the feature2 directory.
path = './datasets/feature2/'
# os.listdir returns entries in arbitrary order; sorting makes the resulting
# column order (and therefore the feature list) reproducible across runs.
sublist = sorted(os.listdir(path))
for sub in sublist:
    # Skip sub-directories (e.g. notebook checkpoint folders) that read_csv cannot open.
    if not os.path.isfile(path+sub):
        continue
    df = pd.read_csv(path+sub)
    df_data = df_data.merge(df,on='card_id',how='left')

In [None]:
# Read the two feature tables stored under ./datasets/cate_fm/ ...
df_cate_merchantCate_fm = pd.read_csv('./datasets/cate_fm/df_hist_new_cate_merchantCate_fm_feat.csv')
df_cate_merchant_fm = pd.read_csv('./datasets/cate_fm/df_hist_new_cate_merchant_fm_feat.csv')

# ... and left-join them onto df_data by card_id, in the same order as they were read.
for _fm_features in (df_cate_merchantCate_fm, df_cate_merchant_fm):
    df_data = df_data.merge(_fm_features, how='left', on='card_id')

In [None]:
# Read three more per-card feature tables from DATA_PATH.
df_nmf_card_merCate_features = pd.read_csv(DATA_PATH+'df_nmf_card_merCate_features.csv')
df_nmf_card_city_features = pd.read_csv(DATA_PATH+'df_nmf_card_city_features.csv')
df_card_merchant_features = pd.read_csv(DATA_PATH+'df_card_merchant_features.csv')

# Left-join each table onto df_data by card_id, preserving the original merge order.
for _card_features in (
    df_nmf_card_merCate_features,
    df_nmf_card_city_features,
    df_card_merchant_features,
):
    df_data = df_data.merge(_card_features, how='left', on='card_id')

In [None]:
# Replace missing values first, then +/- infinity, with the sentinel -999.
df_data = (
    df_data
    .fillna(-999)
    .replace([np.inf, -1*np.inf], -999)
)

In [None]:
# Split the combined frame into train and test rows using the is_test flag.
# The train slice is copied explicitly because a column is added to it below;
# assigning to a filtered view of df_data triggers SettingWithCopyWarning.
df_train = df_data[df_data.is_test==0].copy()
df_test = df_data[df_data.is_test==1]
# Flag rows whose target is below -30. `np.int` was an alias of the builtin
# `int` and was removed in NumPy 1.24, so the builtin is used directly.
df_train['is_outlier'] = (df_train.target<-30).astype(int)

label = ['target']
# Identifier, label and bookkeeping columns that must not be used as model inputs.
dropCols = ['card_id','first_active_month','is_outlier','is_test','target','purchase_date','merchant_id']
# Every remaining non-object column is a training feature.
tr_features = [_f for _f in df_train.columns if _f not in dropCols and df_train[_f].dtype!='object']
print(len(tr_features))

# Convert both pandas frames to H2OFrames; from here on df_train/df_test are H2O objects.
df_train = h2o.H2OFrame.from_python(df_train)
df_test = h2o.H2OFrame.from_python(df_test)

In [None]:
%%time
h2o_xgb_params = {
    'learn_rate':0.01,
    'ntrees':3000,
    'max_depth':8,
    'nfolds':5,
    'col_sample_rate':0.9,
    'col_sample_rate_per_tree':0.9,
    'sample_rate':0.9,
    'gamma':1.45,
    'reg_lambda':20.0,
#     'stopping_metric':'rmse',
    'stopping_rounds':50,
    'seed':40,
#     'tree_method':'hist',
#     'grow_policy':"lossguide",
    'keep_cross_validation_predictions':True,    
}

h2o_xgb = H2OXGBoostEstimator(**h2o_xgb_params)

h2o_xgb.train(tr_features, 'target', training_frame=df_train)

xgb_test_pred = h2o_xgb.predict(test_data = df_test[tr_features]).as_data_frame()['predict'].values
xgb_train_pred = h2o_xgb.cross_validation_holdout_predictions().as_data_frame()['predict'].values

In [None]:
# Show the cross-validation metrics summary of the trained model as the cell output.
h2o_xgb.cross_validation_metrics_summary()

In [None]:
# Pull the label column out of the H2O training frame as a flat numpy array.
y_target = df_train[label].as_data_frame()[label].values.reshape(-1,)
# Root-mean-squared error between the holdout predictions and the true target.
_residuals = xgb_train_pred - y_target
_sum_sq_err = np.sum(np.square(_residuals))
score = np.sqrt(_sum_sq_err/df_train.shape[0])
print("score = %s"%score)

In [None]:
# Name of the prediction column; it embeds the score computed above.
_oof_col = 'oof_h2oxgb_pred_%.5f'%score
# Both files go to the same level1 stacking directory. The test path previously
# read 'levle1', which sent the test predictions to a different directory.
_level1_dir = './datasets/stacking/level1/'
os.makedirs(_level1_dir, exist_ok=True)

# Attach the holdout (train) and test predictions to their H2O frames.
df_train[_oof_col] = h2o.H2OFrame.from_python(xgb_train_pred)
df_test[_oof_col] = h2o.H2OFrame.from_python(xgb_test_pred)

# Export card_id plus the prediction column for each split.
h2o.download_csv(data=df_train[['card_id',_oof_col]],filename=_level1_dir+'h2oxgb_train_pred_%.5f.csv'%score)
h2o.download_csv(data=df_test[['card_id',_oof_col]],filename=_level1_dir+'h2oxgb_test_pred_%.5f.csv'%score)