In [1]:
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import BayesianRidge,LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import datetime
import gc
import h2o
import os
# Root folder (relative to the notebook) that holds every input CSV read below.
DATA_PATH = './datasets/'
# NOTE(review): this silences *all* warnings for the whole session, including
# pandas/NumPy deprecation and copy warnings raised by later cells.
warnings.filterwarnings("ignore")

h2o.init()  # start (or connect to) the local H2O cluster

# No backend is named, so IPython selects its default GUI backend
# (the recorded output shows Qt5Agg on the machine that produced this run).
%matplotlib

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_181"; Java(TM) SE Runtime Environment (build 1.8.0_181-b13); Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)
  Starting server from /home/sjtu123/.local/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp2m268xre
  JVM stdout: /tmp/tmp2m268xre/h2o_sjtu123_started_from_python.out
  JVM stderr: /tmp/tmp2m268xre/h2o_sjtu123_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Asia/Shanghai
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.0.1
H2O cluster version age:,3 months and 27 days !!!
H2O cluster name:,H2O_from_python_sjtu123_auiwli
H2O cluster total nodes:,1
H2O cluster free memory:,10.47 Gb
H2O cluster total cores:,20
H2O cluster allowed cores:,20


Using matplotlib backend: Qt5Agg


In [2]:
# Load the base table and the two extra card-level feature tables.
# The builtin `str` replaces the `np.str` alias, which is deprecated and
# has been removed from recent NumPy releases; the resulting dtype is the same.
df_data = pd.read_csv(DATA_PATH+'df_data.csv',dtype={'first_active_month':str})
df_train_test_additional_features = pd.read_csv(DATA_PATH+'df_train_test_features_additional.csv')
df_additional_features = pd.read_csv(DATA_PATH+'df_additional_features.csv')

# Left joins keep every row of df_data, matched on card_id.
df_data = df_data.merge(df_train_test_additional_features,on='card_id',how='left')
df_data = df_data.merge(df_additional_features,on='card_id',how='left')

# Merge every feature file found in the feature2 folder.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the merge order (and therefore the column order of df_data) the same on
# every run and every machine. Directories (for example Jupyter's
# .ipynb_checkpoints) are skipped because read_csv cannot open them.
path = DATA_PATH+'feature2/'
sublist = sorted(name for name in os.listdir(path) if os.path.isfile(os.path.join(path, name)))
for sub in sublist:
    df = pd.read_csv(path+sub)
    df_data = df_data.merge(df,on='card_id',how='left')

# Use -999 as the single sentinel for both missing and infinite values.
# Rebinding (instead of inplace=True) keeps the cell's data flow explicit.
df_data = df_data.fillna(-999)
df_data = df_data.replace([np.inf,-np.inf],-999)

In [3]:
# Split the combined table on the is_test flag.
# df_train gets a new column below, so it is taken as an explicit copy:
# assigning to a filtered slice of df_data triggers pandas'
# SettingWithCopyWarning (hidden here by the global warnings filter).
df_train = df_data[df_data.is_test==0].copy()
df_test = df_data[df_data.is_test==1]

# Flag rows whose target is below -30 as outliers (1) versus the rest (0).
# The builtin `int` replaces the deprecated `np.int` alias; same dtype.
df_train['is_outlier'] = (df_train.target<-30).astype(int)

label = ['target']
# Identifier, label and bookkeeping columns that must not be used as features.
dropCols = ['card_id','first_active_month','is_outlier','is_test','target','purchase_date','merchant_id']
# Every remaining column whose dtype is not `object` is used as a model feature.
tr_features = [_f for _f in df_train.columns if _f not in dropCols and df_train[_f].dtype!='object']

print(len(tr_features))

1113


In [4]:
# Upload both pandas tables to the H2O cluster (train first, then test).
# The names are rebound: from this cell on df_train / df_test are H2OFrames.
df_train, df_test = [h2o.H2OFrame.from_python(_frame) for _frame in (df_train, df_test)]

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [None]:
# Base learners for the stacked ensemble built in the next cell.
# Both share the same fold count and seed and keep their cross-validation
# predictions, so one settings dict is used for the two estimators.
nfolds = 5
cv_settings = dict(
    nfolds=nfolds,
    seed=20,
    keep_cross_validation_predictions=True,
)

# Gradient boosting machine on all selected features.
my_gbm = H2OGradientBoostingEstimator(**cv_settings)
my_gbm.train(x=tr_features, y='target', training_frame=df_train)

# Random forest with the identical cross-validation setup.
my_rf = H2ORandomForestEstimator(**cv_settings)
my_rf.train(x=tr_features, y='target', training_frame=df_train)

In [16]:
# Combine the two base learners into one stacked ensemble.
# The metalearner is cross-validated with 10 folds and the same seed
# that the base models use.
base_model_ids = [my_gbm.model_id, my_rf.model_id]
stack = H2OStackedEnsembleEstimator(
    model_id="my_ensemb",
    base_models=base_model_ids,
    metalearner_nfolds=10,
    seed=20,
    training_frame=df_train,
)
stack.train(x=tr_features, y='target', training_frame=df_train)


stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [31]:
# Display the ensemble's regression metrics. Called without arguments, the
# recorded output is "Reported on train data", i.e. these are in-sample numbers.
stack.model_performance()


ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 11.16257465534262
RMSE: 3.3410439469337456
MAE: 1.488635967751284
RMSLE: NaN
R^2: 0.24713434131090073
Mean Residual Deviance: 11.16257465534262
Null degrees of freedom: 201916
Residual degrees of freedom: 201914
Null deviance: 2993779.2495507933
Residual deviance: 2253913.586682816
AIC: 1060161.8407017654




In [33]:
def _ensemble_predictions(frame):
    """Score `frame` with the stacked ensemble and return the 'predict' column as a NumPy array."""
    scored = stack.predict(test_data = frame[tr_features])
    return scored.as_data_frame()['predict'].values

# Test predictions first, then predictions on the training frame.
# NOTE(review): the training-frame predictions are made on the same data the
# ensemble was fitted on - confirm that in-sample values are what is wanted
# downstream.
h2ostack_test_pred = _ensemble_predictions(df_test)
h2ostack_train_pred = _ensemble_predictions(df_train)

stackedensemble prediction progress: |████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


In [34]:
# Root-mean-squared error of the ensemble on the training frame.
# Pull the label column out of H2O and flatten it to a 1-D array.
target_table = df_train[label].as_data_frame()
y_target = target_table[label].values.reshape(-1)

squared_errors = np.square(h2ostack_train_pred - y_target)
n_rows = df_train.shape[0]
score = np.sqrt(np.sum(squared_errors)/n_rows)
print("score = %s"%score)

score = 3.3410438389681127


In [36]:
# Attach the ensemble predictions to both H2O frames and export them, keyed by
# card_id, as level-1 stacking inputs. The training RMSE (5 decimals) is
# embedded in the column name and in both file names.
pred_col = 'h2ostack_pred%.5f'%score
out_dir = './datasets/stacking/level1/'

# Create the output folder up front: download_csv writes straight to the given
# path, so a missing folder would otherwise only fail after training finished.
os.makedirs(out_dir, exist_ok=True)

df_train[pred_col] = h2o.H2OFrame.from_python(h2ostack_train_pred)
df_test[pred_col] = h2o.H2OFrame.from_python(h2ostack_test_pred)

h2o.download_csv(data=df_train[['card_id',pred_col]],filename=out_dir+'h2ostack_train_pred_%.5f.csv'%score)
h2o.download_csv(data=df_test[['card_id',pred_col]],filename=out_dir+'h2ostack_test_pred_%.5f.csv'%score)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
