In [1]:
from tqdm import tqdm_notebook, tnrange
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import BayesianRidge,LinearRegression
from h2o.estimators.gbm import H2OGradientBoostingEstimator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import h2o
import os
# Directory prefix used by the data-loading cells when reading input CSV files.
DATA_PATH = './datasets/'
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Start (or connect to) the local H2O cluster.
h2o.init()

%matplotlib

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_181"; Java(TM) SE Runtime Environment (build 1.8.0_181-b13); Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)
  Starting server from /home/sjtu123/.local/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpatkdlbsm
  JVM stdout: /tmp/tmpatkdlbsm/h2o_sjtu123_started_from_python.out
  JVM stderr: /tmp/tmpatkdlbsm/h2o_sjtu123_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Asia/Shanghai
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.0.1
H2O cluster version age:,3 months and 29 days !!!
H2O cluster name:,H2O_from_python_sjtu123_bvz1yu
H2O cluster total nodes:,1
H2O cluster free memory:,10.47 Gb
H2O cluster total cores:,20
H2O cluster allowed cores:,20


Using matplotlib backend: Qt5Agg


In [2]:
# Load the base table. `first_active_month` is read as a string column.
# The builtin `str` replaces `np.str`, which was only an alias for it and
# no longer exists in NumPy >= 1.24.
df_data = pd.read_csv(DATA_PATH+'df_data.csv', dtype={'first_active_month': str})
df_train_test_additional_features = pd.read_csv(DATA_PATH+'df_train_test_features_additional.csv')
df_additional_features = pd.read_csv(DATA_PATH+'df_additional_features.csv')

# Left-join the two additional feature tables onto the base table by card_id.
df_data = df_data.merge(df_train_test_additional_features, on='card_id', how='left')
df_data = df_data.merge(df_additional_features, on='card_id', how='left')

# Left-join every feature file found in the feature2 directory.
path = './datasets/feature2/'
# os.listdir returns entries in arbitrary order; sorting keeps the resulting
# column order (and therefore seeded column sampling) reproducible across runs.
sublist = sorted(os.listdir(path))
for sub in sublist:
    # Skip sub-directories (e.g. notebook checkpoint folders) that read_csv cannot parse.
    if not os.path.isfile(path+sub):
        continue
    df_data = df_data.merge(pd.read_csv(path+sub), on='card_id', how='left')

# Release the intermediate frames. The per-file frame is never bound to a name,
# so this no longer fails when the feature2 directory is empty.
del df_additional_features, df_train_test_additional_features
gc.collect()

193

In [3]:
# Left-join two more per-card feature files onto df_data, one after the other
# (df_tsne_fea.csv first, then df_cate_statics.csv), keyed on card_id.
for _extra_csv in ('./datasets/df_tsne_fea.csv', './datasets/df_cate_statics.csv'):
    df_data = df_data.merge(pd.read_csv(_extra_csv), on='card_id', how='left')

# Drop the loop helper and reclaim the memory of the temporary frames.
del _extra_csv
gc.collect()

28

In [5]:
# Per-card feature files, listed in the order they are joined onto df_data.
_card_level_files = [
    'df_card_merchant_vec.csv',
    'df_card_merchant_statics.csv',
    'df_card_city_statics.csv',
]

# Read every file up front, then left-join each one on card_id.
_card_level_frames = [pd.read_csv(DATA_PATH + _name) for _name in _card_level_files]
for _frame in _card_level_frames:
    df_data = df_data.merge(_frame, on='card_id', how='left')

# Release the temporaries before the next memory-heavy step.
del _card_level_files, _card_level_frames, _frame
gc.collect()

42

In [8]:
# Replace missing values first, then +/- infinity, with the same sentinel value.
# Both operations modify df_data in place.
_SENTINEL = -999
df_data.fillna(value=_SENTINEL, inplace=True)
df_data.replace(to_replace=[np.inf, -np.inf], value=_SENTINEL, inplace=True)

In [9]:
# Split the combined table into train and test rows using the is_test flag.
# .copy() makes df_train an independent frame, so adding a column below is a
# plain assignment rather than a chained assignment on a filtered slice
# (the resulting SettingWithCopyWarning was hidden by the global warning filter).
df_train = df_data[df_data.is_test==0].copy()
df_test = df_data[df_data.is_test==1]

# Flag rows whose target is below -30 as outliers (1) versus the rest (0).
# The builtin `int` replaces `np.int`, which was only an alias for it and
# no longer exists in NumPy >= 1.24.
df_train['is_outlier'] = (df_train.target < -30).astype(int)

# The combined frame is no longer needed.
del df_data
gc.collect()

1525


In [None]:
# Name of the regression target, kept as a one-element list for frame indexing.
label = ['target']

# Identifier, bookkeeping and target columns that must never be used as model inputs.
dropCols = [
    'card_id',
    'first_active_month',
    'is_outlier',
    'is_test',
    'target',
    'purchase_date',
    'merchant_id',
]

# Training features: every remaining column whose dtype is not 'object',
# kept in the frame's column order.
tr_features = [
    _col
    for _col, _kind in df_train.dtypes.items()
    if _kind != 'object' and _col not in dropCols
]
print(len(tr_features))

In [10]:
# Convert both pandas frames into H2OFrames, rebinding the same names.
# After this cell df_train / df_test are H2O objects, not pandas DataFrames.
_to_h2o = h2o.H2OFrame.from_python
df_train = _to_h2o(df_train)
df_test = _to_h2o(df_test)
del _to_h2o

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |████████████████████████████████████████████████████████████████| 100%


In [11]:
%%time

# Hyper-parameters for the H2O gradient boosting regressor: 5-fold
# cross-validation with RMSE-based early stopping. The cross-validation
# predictions and fold assignment are kept so out-of-fold predictions
# can be retrieved after training.
gbm_reg_params = dict(
    learn_rate=0.01,
    max_depth=8,
    sample_rate=0.8,
    col_sample_rate=0.9,
    learn_rate_annealing=0.999,
    nfolds=5,
    ntrees=5000,
    seed=20,
    stopping_rounds=50,
    stopping_metric='rmse',
    keep_cross_validation_fold_assignment=True,
    keep_cross_validation_predictions=True,
)
h2o_gbm = H2OGradientBoostingEstimator(**gbm_reg_params)

# NOTE(review): the validation frame is the training frame itself, so validation
# metrics are not an independent estimate -- confirm this is intended.
h2o_gbm.train(
    x=tr_features,
    y='target',
    training_frame=df_train,
    validation_frame=df_train,
)

# Test-set predictions from the trained model.
_gbm_test_frame = h2o_gbm.predict(df_test[tr_features]).as_data_frame()
gbm_test_pred = _gbm_test_frame['predict'].values

# Out-of-fold predictions for the training rows.
_gbm_oof_frame = h2o_gbm.cross_validation_holdout_predictions().as_data_frame()
gbm_train_pred = _gbm_oof_frame['predict'].values

# True target values as a flat numpy array.
y_target = df_train[label].as_data_frame()[label].values.ravel()

# RMSE of the out-of-fold predictions against the true target.
_gbm_squared_error = np.square(gbm_train_pred - y_target)
score = np.sqrt(np.sum(_gbm_squared_error) / df_train.shape[0])

print("score = %s" % score)

gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
score = 3.656091680051238
CPU times: user 1min 32s, sys: 2.58 s, total: 1min 35s
Wall time: 5h 53min 57s


In [19]:
# Column name carrying the out-of-fold RMSE so the file contents are self-describing.
_gbm_col = 'oof_gbm_pred_%.5f' % score

# Both prediction files go into the same level-1 stacking directory.
# The test file previously pointed at a misspelled 'levle1' directory.
_level1_dir = './datasets/stacking/level1/'
# Create the directory up front so the CSV download cannot fail on a missing folder.
os.makedirs(_level1_dir, exist_ok=True)

# Attach the predictions to the H2O frames as new columns.
df_train[_gbm_col] = h2o.H2OFrame.from_python(gbm_train_pred)
df_test[_gbm_col] = h2o.H2OFrame.from_python(gbm_test_pred)

# Export card_id plus the prediction column for the train and test sets.
h2o.download_csv(
    data=df_train[['card_id', _gbm_col]],
    filename=os.path.join(_level1_dir, 'h2ogbm_train_pred_%.5f.csv' % score),
)
h2o.download_csv(
    data=df_test[['card_id', _gbm_col]],
    filename=os.path.join(_level1_dir, 'h2ogbm_test_pred_%.5f.csv' % score),
)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [None]:
from h2o.estimators.random_forest import H2ORandomForestEstimator

# Hyper-parameters for the H2O random forest regressor: 5-fold cross-validation,
# scoring after every tree, and RMSE-based early stopping. The cross-validation
# predictions and fold assignment are kept so out-of-fold predictions can be
# retrieved after training.
rf_reg_params = dict(
    ntrees=8000,
    max_depth=8,
    col_sample_rate_per_tree=0.9,
    sample_rate=0.9,
    score_each_iteration=True,
    stopping_metric="rmse",
    stopping_rounds=100,
    keep_cross_validation_fold_assignment=True,
    keep_cross_validation_predictions=True,
    nfolds=5,
    seed=30,
)

h2o_rf = H2ORandomForestEstimator(**rf_reg_params)

# NOTE(review): the validation frame is the training frame itself, so validation
# metrics are not an independent estimate -- confirm this is intended.
h2o_rf.train(
    x=tr_features,
    y='target',
    training_frame=df_train,
    validation_frame=df_train,
)

# Test-set predictions from the trained model.
_rf_test_frame = h2o_rf.predict(df_test[tr_features]).as_data_frame()
rf_test_pred = _rf_test_frame['predict'].values

# Out-of-fold predictions for the training rows.
_rf_oof_frame = h2o_rf.cross_validation_holdout_predictions().as_data_frame()
rf_train_pred = _rf_oof_frame['predict'].values


In [None]:
# Display the cross-validation metrics summary of the random forest model.
h2o_rf.cross_validation_metrics_summary()

In [None]:
# True target values as a flat numpy array.
y_target = df_train[label].as_data_frame()[label].values.ravel()

# RMSE of the random forest out-of-fold predictions against the true target.
_rf_squared_error = np.square(rf_train_pred - y_target)
score = np.sqrt(np.sum(_rf_squared_error) / df_train.shape[0])
print("score = %s" % score)

In [None]:
# Column name carrying the out-of-fold RMSE computed for the random forest.
_rf_col = 'oof_rf_pred_%.5f' % score

# Attach the predictions to the H2O frames as new columns.
df_train[_rf_col] = h2o.H2OFrame.from_python(rf_train_pred)
df_test[_rf_col] = h2o.H2OFrame.from_python(rf_test_pred)

# Export card_id plus the prediction column for the train and test sets.
# NOTE(review): these files go to ./datasets/stacking/ while the GBM cell writes
# under a level1 sub-directory -- confirm which location downstream code expects.
_rf_train_csv = './datasets/stacking/h2orf_train_pred_%.5f.csv' % score
_rf_test_csv = './datasets/stacking/h2orf_test_pred_%.5f.csv' % score
h2o.download_csv(data=df_train[['card_id', _rf_col]], filename=_rf_train_csv)
h2o.download_csv(data=df_test[['card_id', _rf_col]], filename=_rf_test_csv)