In [1]:
from tqdm import tqdm_notebook, tnrange
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import BayesianRidge,LinearRegression
from h2o.automl import H2OAutoML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import h2o
import os
# Directory prefix prepended to the CSV file names read by the data-loading cell.
DATA_PATH = './datasets/'
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Start the H2O cluster (connects to a running local instance, or launches one).
h2o.init()

# Turn on matplotlib's interactive mode; no backend is named, so IPython picks one.
%matplotlib

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_181"; Java(TM) SE Runtime Environment (build 1.8.0_181-b13); Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)
  Starting server from /home/sjtu123/.local/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpb827aauf
  JVM stdout: /tmp/tmpb827aauf/h2o_sjtu123_started_from_python.out
  JVM stderr: /tmp/tmpb827aauf/h2o_sjtu123_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Asia/Shanghai
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.0.1
H2O cluster version age:,3 months and 27 days !!!
H2O cluster name:,H2O_from_python_sjtu123_sj723t
H2O cluster total nodes:,1
H2O cluster free memory:,10.47 Gb
H2O cluster total cores:,20
H2O cluster allowed cores:,20


Using matplotlib backend: Qt5Agg


In [2]:
# Load the base table. first_active_month is read as a string column.
# The builtin `str` is used because the `np.str` alias was removed in
# NumPy 1.24 (it was only ever an alias for the builtin).
df_data = pd.read_csv(DATA_PATH+'df_data.csv', dtype={'first_active_month': str})
df_train_test_additional_features = pd.read_csv(DATA_PATH+'df_train_test_features_additional.csv')
df_additional_features = pd.read_csv(DATA_PATH+'df_additional_features.csv')

# Left-join both additional feature tables onto the base table by card_id,
# so every row of df_data is kept even when a card has no extra features.
df_data = df_data.merge(df_train_test_additional_features, on='card_id', how='left')
df_data = df_data.merge(df_additional_features, on='card_id', how='left')

# Merge every file found in the feature2 directory the same way.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the merged column order (and the feature list later derived from
# df_train.columns) identical from run to run.
path = './datasets/feature2/'
sublist = sorted(os.listdir(path))
for sub in sublist:
    df = pd.read_csv(path+sub)
    df_data = df_data.merge(df, on='card_id', how='left')

In [3]:
# Replace missing values first, then both infinities, with the same -999
# sentinel; both operations modify df_data in place.
df_data.fillna(value=-999, inplace=True)
df_data.replace(to_replace=[np.inf, -np.inf], value=-999, inplace=True)

In [4]:
# Split the combined table back into train and test rows using the is_test flag.
df_train = df_data[df_data.is_test == 0]
df_test = df_data[df_data.is_test == 1]
# Flag training rows whose target is below -30 (stored as 0/1).
# The builtin `int` is used because the `np.int` alias was removed in
# NumPy 1.24 (it was only ever an alias for the builtin).
df_train['is_outlier'] = (df_train.target < -30).astype(int)

label = ['target']
# Columns that are never used as model inputs.
dropCols = ['card_id','first_active_month','is_outlier','is_test','target','purchase_date','merchant_id']
# Every remaining column whose dtype is not `object` becomes a feature.
tr_features = [_f for _f in df_train.columns if _f not in dropCols and df_train[_f].dtype != 'object']
print(len(tr_features))

# Upload both splits to the H2O cluster; from this point on df_train and
# df_test are H2OFrames, not pandas DataFrames.
df_train = h2o.H2OFrame.from_python(df_train)
df_test = h2o.H2OFrame.from_python(df_test)

1113
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [None]:
%%time
automl_params = {
    'max_models':20,
    'nfolds':5,
#     'stopping_metric':'rmse',
    'stopping_rounds':5,
    'keep_cross_validation_predictions':True,
#     'sort_metric':'rmse',
    'seed':40,
}
aml = H2OAutoML(**automl_params)
aml.train(x=tr_features, y='target', training_frame=df_train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

In [None]:
# Predictions for the training rows. The scoring and export cells that follow
# read `aml_train_preds`, so it has to be defined here for the notebook to run
# top to bottom on a fresh kernel.
# NOTE(review): these are predictions on the same frame the models were
# trained on, i.e. in-sample values rather than out-of-fold ones -- confirm
# this is what the downstream stacking step expects.
aml_train_preds = aml.predict(df_train[tr_features]).as_data_frame()['predict'].values
# Predictions for the held-out test rows.
aml_test_preds = aml.predict(df_test[tr_features]).as_data_frame()['predict'].values

In [None]:
# Pull the target column out of the H2O frame and flatten it to a 1-D array.
y_target = df_train[label].as_data_frame()[label].values.ravel()
# Root-mean-squared error between the training predictions and the target.
squared_error = np.square(aml_train_preds - y_target)
score = np.sqrt(squared_error.sum() / df_train.shape[0])
print("score = %s" % score)

In [None]:
# Name of the prediction column; the training score is embedded with five decimals.
pred_col = 'oof_aml_pred_%.5f'%score
# Attach the predictions to the H2O frames as a new column.
df_train[pred_col] = h2o.H2OFrame.from_python(aml_train_preds)
df_test[pred_col] = h2o.H2OFrame.from_python(aml_test_preds)
# Export card_id together with the prediction column for each split.
# The columns are selected with a list of names: a two-element index such as
# frame['card_id', col] is read by H2OFrame as (rows, columns), so it would
# not select both columns.
h2o.download_csv(data=df_train[['card_id', pred_col]], filename='./datasets/stacking/h2oaml_train_pred_%s.csv'%score)
h2o.download_csv(data=df_test[['card_id', pred_col]], filename='./datasets/stacking/h2oaml_test_pred_%s.csv'%score)