In [None]:
import os
# Start from a clean slate: delete compiled bytecode files and the previous log.
# (Presumably this forces the local helper modules imported below to be
# recompiled -- TODO confirm intent.)
cache_dir = '__pycache__/'
# The cache directory does not exist on a fresh checkout; listing it
# unconditionally would raise FileNotFoundError.
if os.path.isdir(cache_dir):
    for entry in os.listdir(cache_dir):
        entry_path = os.path.join(cache_dir, entry)
        # os.remove cannot delete directories, so only plain files are removed.
        if os.path.isfile(entry_path):
            os.remove(entry_path)
if os.path.exists('log.txt'):
    os.remove('log.txt')

import pandas as pd
import gc
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

from custom_metric_function import AmexMetric
from evaluation_metric import amex_metric, amex_list

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.calibration import calibration_curve

import warnings
# Install a blanket "ignore" filter: warnings of every category are silenced
# from this point on, including deprecation notices from the libraries above.
warnings.filterwarnings("ignore")

In [None]:
# Connect to (or start) the H2O cluster, requesting a 20G minimum memory size;
# nthreads=-1 asks H2O to use all available CPU cores.
h2o.init(
    nthreads=-1,
    min_mem_size='20G',
)

In [None]:
# Read the aggregated training table, index it by customer_ID, discard the
# 'cid' and 'S_2' columns and shuffle the rows with a fixed seed.
data = (
    pd.read_parquet('Data/train_data_aggV3.parquet')
    .set_index('customer_ID')
    .drop(['cid', 'S_2'], axis=1)
    .sample(frac=1, random_state=42)
)
data_h2o = h2o.H2OFrame(data)

# The pandas copy is not used again once the H2OFrame has been built.
del data
gc.collect()

# Seeded 90/10 split; the full frame is removed from the cluster afterwards.
train, valid = data_h2o.split_frame(ratios=[.9], seed=42)
h2o.remove(data_h2o.frame_id)
gc.collect()

# Predictors are every column except the response.
y = 'target'
x = train.columns
x.remove(y)

# Convert the response to a factor in both splits.
for frame in (train, valid):
    frame[y] = frame[y].asfactor()

train.shape, valid.shape

In [None]:
# Pull the response column into pandas and tabulate how often each class occurs.
train_target_df = train[y].as_data_frame()
train_target_df.value_counts()

### Training

In [None]:
# Upload the AmexMetric class so the cluster can evaluate it during training;
# the returned reference is handed to the estimator below.
amex = h2o.upload_custom_metric(AmexMetric,
                                func_name = "Amex",
                                func_file = "custom_metric_function.py")

# Gradient boosting model.
#   * ntrees is only an upper bound: early stopping on the custom metric
#     ('custom_increasing' = larger is better) decides the actual tree count.
#   * max_depth=0 means "no depth limit" per the H2O GBM documentation.
#   * sample_rate_per_class gives one row-sampling rate per response class
#     (here 0.35 and 1).
#   * seed pins the row sampling and the cross-validation fold assignment so
#     that repeated runs are comparable (H2O documents reproducibility only for
#     an identical cluster configuration).
#   * calibrate_model fits calibrated probabilities (the cal_p* prediction
#     columns) on `valid`, which is therefore not used as a validation frame.
model = H2OGradientBoostingEstimator(model_id = 'GBM_b2',
                                    ntrees=9999,
                                    max_depth=0,
                                    nfolds=5,
                                    sample_rate_per_class = [0.35, 1],
                                    seed=42,
                                    custom_metric_func = amex,
                                    stopping_metric='custom_increasing',
                                    stopping_tolerance=0.001,
                                    stopping_rounds=1000,
                                    calibrate_model = True,
                                    calibration_frame = valid)

model.train(x = x, y = y, training_frame = train)

In [None]:
# Write the trained model to disk (force=True overwrites an existing file)
# and display the path that was written.
model_path = h2o.save_model(
    model=model,
    path="Models/gbm_b2",
    force=True,
)
model_path

In [None]:
# Score the training frame and compute the Amex metric from the calibrated
# positive-class probability ('cal_p1'), renamed to the 'prediction' column
# passed to amex_metric.
pred = model.predict(train)
train_targets = train['target'].as_data_frame()
pred_df = pred['cal_p1'].as_data_frame()
amex_metric(train_targets, pred_df.rename(columns={'cal_p1': 'prediction'}))

In [None]:
# Bin the calibrated training predictions into 20 buckets and compare the mean
# predicted probability with the observed positive rate in each bucket.
fraction_of_positives, mean_predicted_value = calibration_curve(
    train_targets, pred_df, n_bins=20
)

fig, ax = plt.subplots(1, figsize=(12, 6))
ax.plot(mean_predicted_value, fraction_of_positives, 's-')
# Dashed diagonal = perfectly calibrated reference.
ax.plot([0, 1], [0, 1], '--', color='gray')

# Strip spines and tick marks for a cleaner figure.
sns.despine(left=True, bottom=True)
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
ax.set_title("Reliability Curve", fontsize=20);

### Prediction

In [None]:
# Read the aggregated test table, index it by customer_ID and discard the
# 'cid' and 'S_2' columns.
test = (
    pd.read_parquet('Data/test_data_aggV3.parquet')
    .set_index('customer_ID')
    .drop(columns=['cid', 'S_2'])
)

# Reuse the training frame's column types (minus the response) so the test
# frame is parsed with the same types.
d_types = train.drop('target').types
test_h2o = h2o.H2OFrame(test, column_types=d_types)

In [None]:
# Score the test frame. The model is trained with calibrate_model=True and the
# training-set metric and reliability curve are both computed on 'cal_p1', so
# the exported test scores use the same calibrated column rather than raw 'p1'.
# NOTE(review): switch back to 'p1' if uncalibrated scores were intended.
pred_test = model.predict(test_h2o)
pred_test_df = pred_test['cal_p1'].as_data_frame()
# Re-attach the customer_ID index from the pandas test frame.
pred_test_df.index = test.index

In [None]:
# Histogram of the test-set scores.
n_bins = 100
pred_test_df.hist(bins=n_bins)

In [None]:
# DataFrame.to_csv does not create missing directories; make sure the output
# folder exists so the export cannot fail after the long training run.
os.makedirs('Output', exist_ok=True)
# Write the test scores with a single 'prediction' column header; the frame's
# index is written as the first column.
pred_test_df.to_csv('Output/p_gbm_b2.csv', header=['prediction'])