In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import gc
from evaluation_metric import amex_metric
from utils_model_metrics import *
from utils_model_metrics import WeightedFalseNegativeLossMetric
from h2o.estimators.gbm import H2OGradientBoostingEstimator
import h2o

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Connect to (or launch) the H2O backend; min_mem_size requests a minimum
# memory allocation of 20G for it.
h2o.init(min_mem_size="20G")

In [None]:
# Load the aggregated training table, keyed by customer_ID.
raw_train = pd.read_parquet('Data/train_data_aggV3.parquet').set_index('customer_ID')

# Split into labels and features; 'cid' and 'S_2' are dropped alongside the
# target so they are not used as model inputs.
train_labels = raw_train['target']
train_data = raw_train.drop(columns=['target', 'cid', 'S_2'])

# Release the combined frame before the data is copied into H2O.
del raw_train
gc.collect()
train_data.shape, train_labels.shape

In [None]:
# Class balance check: number of rows per distinct target value.
train_labels.value_counts()

In [None]:
# Put features and target side by side and ship them to H2O as one frame.
# NOTE(review): the pandas index (customer_ID) is presumably not carried into
# the H2OFrame - confirm if the IDs are needed on the H2O side.
train = h2o.H2OFrame(
    pd.concat([train_data, train_labels], axis="columns")
)

In [None]:
# The pandas copies are no longer needed once the data lives in the H2OFrame.
del train_data, train_labels
gc.collect()

# y is the response column; x is every other column of the frame.
y = 'target'
x = [column for column in train.columns if column != y]

# Convert the response to a factor (categorical) column before training.
train[y] = train[y].asfactor()
train.shape

In [None]:
# Register the custom metric class with the H2O backend; the returned reference
# is what the estimator receives as custom_metric_func.
weighted_false_negative_loss_func = h2o.upload_custom_metric(
    WeightedFalseNegativeLossMetric,
    func_name="WeightedFalseNegativeLoss",
    func_file="weighted_false_negative_loss.py",
)

In [None]:
# Gradient boosting model with 5-fold cross-validation and early stopping on
# the uploaded custom metric.
#
# - ntrees=2000 is an upper bound; training stops early once the stopping
#   metric has not improved for `stopping_rounds` scoring rounds.
# - max_depth=0: per the H2O GBM docs this means "no depth limit"
#   (NOTE(review): confirm this is intended, unlimited depth is expensive).
# - sample_rate_per_class=[0.35, 1]: per-class row sampling rates
#   (presumably ordered as class 0, class 1, i.e. class 0 is down-sampled - verify).
# - score_each_iteration=True scores after every tree so the stopping rule
#   is checked at every iteration.
# - stopping_metric="custom" uses custom_metric_func as the stopping criterion
#   (NOTE(review): H2O also offers "custom_increasing"; "custom" presumably
#   treats lower values as better - confirm against the metric definition).
# - seed: fixed so that row sampling and fold assignment are repeatable
#   between runs; without it every Restart & Run All trains a different model.
aml = H2OGradientBoostingEstimator(
    ntrees=2000,
    nfolds=5,
    max_depth=0,
    sample_rate_per_class=[0.35, 1],
    score_each_iteration=True,
    custom_metric_func=weighted_false_negative_loss_func,
    stopping_metric="custom",
    stopping_rounds=10,
    seed=42,
)

aml.train(x=x, y=y, training_frame=train)

In [None]:
# Persist the trained model under Models/gbm_b1 (force=True allows overwriting
# an existing file) and display the path that H2O actually wrote.
model_path = h2o.save_model(model=aml, path="Models/gbm_b1", force=True)
model_path

In [None]:
# Competition metric on the training frame itself. This is an in-sample score
# (the model has seen these rows), so it is not an estimate of generalisation.
pred = aml.predict(train)
train_targets = train['target'].as_data_frame()
# 'p1' is the prediction column used as the score; amex_metric is given it
# under the column name 'prediction'.
pred_df = pred['p1'].as_data_frame()
amex_metric(train_targets, pred_df.rename(columns={'p1': 'prediction'}))

### Prediction

In [None]:
# Reload the model from the path returned by h2o.save_model earlier in this
# notebook. The model file name contains a session-specific ID, so a hard-coded
# ID from an earlier session would either fail to load or silently load a stale
# model after Restart & Run All.
aml = h2o.load_model(model_path)

In [None]:
# Load the aggregated test table, keyed by customer_ID, and drop the same
# non-feature columns ('cid', 'S_2') that were removed from the training data.
test = (
    pd.read_parquet('Data/test_data_aggV3.parquet')
    .set_index('customer_ID')
    .drop(columns=['cid', 'S_2'])
)
test.shape

In [None]:
# Remember the H2O column type of every feature (everything except 'target')
# so the test frame can be parsed with the same types.
d_types = {
    column: h2o_type
    for column, h2o_type in train.types.items()
    if column != 'target'
}

# The training frame is not used after this point.
del train
gc.collect()

In [None]:
# Upload the test features to H2O, forcing the column types recorded from the
# training frame so both frames are typed consistently.
test_h2o = h2o.H2OFrame(test, column_types=d_types)

In [None]:
# Score the test frame, keep only the 'p1' column, and label the rows with the
# index of `test` (customer_ID). This relies on H2O returning predictions in
# the same row order as the uploaded frame.
pred_test = aml.predict(test_h2o)
pred_test_df = pred_test['p1'].as_data_frame().set_index(test.index)

In [None]:
# Visual sanity check of the test predictions (single 'p1' column, indexed like `test`).
pred_test_df

In [None]:
# Write the predictions to CSV: the index is written as the first column and
# the single 'p1' column is emitted under the header 'prediction'.
pred_test_df.to_csv('Output/p_gbm_b1.csv', header=['prediction'])

Closing connection _sid_94a9 at exit
H2O session _sid_94a9 closed.
