In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import gc
from evaluation_metric import amex_metric
import h2o
from h2o.automl import H2OAutoML
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Initialise the H2O connection used by every H2OFrame / AutoML call below.
# The '20G' minimum memory size is hard-coded.
# NOTE(review): confirm the host actually has that much RAM available.
h2o.init(min_mem_size = '20G')

In [None]:
# Read the aggregated training table and key it by customer.
data = pd.read_parquet('Data/train_data_aggV3.parquet').set_index('customer_ID')

# Separate the label column from the feature columns; 'cid' and 'S_2' are
# dropped from the features alongside the label itself.
train_labels = data['target']
train_data = data.drop(columns=['target', 'cid', 'S_2'])

# The combined frame is no longer needed; release it before the H2O upload.
del data
gc.collect()
train_data.shape, train_labels.shape

In [None]:
# Number of rows per distinct target value (class balance check).
train_labels.value_counts()

In [None]:
# Put the label column back next to the features (column-wise concat) and hand
# the combined pandas frame to H2O as the H2OFrame `train`.
train = h2o.H2OFrame(pd.concat([train_data, train_labels], axis=1))

In [None]:
# The pandas copies are redundant now that the data lives in `train`.
del train_data, train_labels
gc.collect()

# Response column name, and the predictors: every other column of the frame.
y = 'target'
x = [column for column in train.columns if column != y]

# Convert the response to a factor (categorical) column before training.
train[y] = train[y].asfactor()
train.shape

In [None]:
# Configure the AutoML search.
#   balance_classes  - let H2O over/under-sample to balance the target classes
#   max_runtime_secs - overall time budget: 8 hours expressed in seconds
#   nfolds           - 5-fold cross-validation for every candidate model
#   stopping_*       - early stopping driven by AUCPR over 3 scoring rounds
#   sort_metric      - leaderboard is ranked by AUCPR as well
#   seed             - fixes the random streams so repeated runs are comparable.
#                      H2O documents that a time budget (rather than
#                      max_models) still prevents exact run-to-run
#                      reproducibility, so treat the seed as a best effort.
aml = H2OAutoML(
    balance_classes=True,
    max_runtime_secs=8 * 60 * 60,
    nfolds=5,
    stopping_metric='AUCPR',
    stopping_rounds=3,
    sort_metric="AUCPR",
    seed=42,
)

# Train on every predictor in `x` against the response `y`.
aml.train(x=x, y=y, training_frame=train)

In [None]:
# Display the full AutoML leaderboard: rows=lb.nrows asks head() for every row
# of the leaderboard frame instead of its default subset.
lb = aml.leaderboard
lb.head(rows=lb.nrows) 

In [None]:
# Persist the leaderboard winner, then reload it from disk so the object used
# downstream (`model`) is exactly what was saved.
best_model = aml.get_best_model()
model_path = h2o.save_model(model=best_model, path="Models/autoML_b1", force=True)

# Reload from the path save_model() returned. The saved file name embeds the
# AutoML run id/timestamp, so a hard-coded name only matches one specific run
# and would fail (or load a stale model) after any re-training.
path = model_path
model = h2o.load_model(path)

In [None]:
# Load a previously saved model from a fixed location on disk.
# NOTE(review): the file name contains what looks like an AutoML run
# id/timestamp (20220615_151032) — presumably it must be updated by hand
# whenever a new AutoML run is saved; confirm before relying on this cell.
path="Models/autoML_b1/StackedEnsemble_AllModels_1_AutoML_1_20220615_151032"
model = h2o.load_model(path)

In [None]:
# In-sample check: score `model` on `train`, the same frame that was passed to
# AutoML as training_frame, and compute the competition metric on it.
train_targets = train['target'].as_data_frame()

pred = model.predict(train)
pred_df = pred['p1'].as_data_frame()

# amex_metric is given the 'p1' probabilities under the column name 'prediction'.
amex_metric(train_targets, pred_df.rename(columns={'p1': 'prediction'}))

### Prediction

In [None]:
# Read the aggregated test table, key it by customer and discard the same
# non-feature columns ('cid', 'S_2') that were removed from the training data.
test = (
    pd.read_parquet('Data/test_data_aggV3.parquet')
    .set_index('customer_ID')
    .drop(columns=['cid', 'S_2'])
)

# Column types of the training frame (without the response), reused so the
# test frame is uploaded with matching types.
d_types = train.drop('target').types
test.shape

In [None]:
# Upload the test features to H2O, passing the column types recorded in
# d_types (taken from the training frame) rather than relying on inference.
test_h2o = h2o.H2OFrame(test, column_types = d_types)

Closing connection _sid_a843 at exit
H2O session _sid_a843 closed.


In [None]:
# Predict with `model` — the object loaded from disk and scored in the metric
# cell — rather than `best_model`, which only exists when AutoML was trained
# in this kernel session. This keeps the reported metric and the submitted
# predictions tied to the same model and lets the notebook run from a reload.
pred_test = model.predict(test_h2o)
pred_test_df = pred_test['p1'].as_data_frame()

# Re-attach the customer_ID index from the pandas test frame. This is a
# positional assignment, so it relies on H2O returning predictions in the
# same row order as the uploaded frame.
pred_test_df.index = test.index
pred_test_df

In [None]:
# Write the predictions to CSV. header=['prediction'] sets the name of the
# single value column in the file; the frame's index is written as well
# (pandas' default for to_csv).
pred_test_df.to_csv('Output/p_autoML_b1.csv', header=['prediction'])