In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV
from sklearn.calibration import calibration_curve
import joblib
import os
import lightgbm as lgb

In [None]:
def amex_metric(y_true, y_pred):
    """Score predictions with a weighted gini / top-4% capture metric.

    Rows whose label equals 0 carry a weight of 20; every other row has a
    weight of 1.  The score is the mean of two terms:

    * the share of the label total found among the highest-scored rows whose
      running weight stays within ``int(0.04 * total_weight)``, and
    * the weighted gini of the ordering given by ``y_pred`` divided by the
      weighted gini obtained when rows are ordered by ``y_true`` itself.

    Parameters
    ----------
    y_true : 1-D array-like
        Labels; a value of 0 marks the heavily weighted class.
    y_pred : 1-D array-like
        Scores of the same length as ``y_true``; higher scores rank first.

    Returns
    -------
    float
        ``0.5 * (normalized_gini + top_four_capture)``.
    """
    pairs = np.array([y_true, y_pred]).T  # column 0: label, column 1: score

    def weights_for(targets):
        # Label 0 -> weight 20, anything else -> weight 1.
        return np.where(targets == 0, 20, 1)

    def weighted_gini(order_col):
        # Rank rows by the requested column, largest value first.
        ranked = pairs[pairs[:, order_col].argsort()[::-1]]
        targets = ranked[:, 0]
        w = weights_for(targets)
        cum_weight_share = np.cumsum(w / np.sum(w))
        weighted_hits = targets * w
        lorentz = np.cumsum(weighted_hits) / np.sum(weighted_hits)
        return np.sum((lorentz - cum_weight_share) * w)

    # Capture rate inside the top 4% of cumulative weight, ranked by score.
    by_score = pairs[pairs[:, 1].argsort()[::-1]]
    w = weights_for(by_score[:, 0])
    within_cutoff = np.cumsum(w) <= int(0.04 * np.sum(w))
    top_four = np.sum(by_score[within_cutoff][:, 0]) / np.sum(by_score[:, 0])

    # Gini of the score ordering, normalized by the gini of the label ordering.
    return 0.5 * (weighted_gini(1) / weighted_gini(0) + top_four)

In [None]:
# Fold number; it is substituted into the data file names and the model file
# prefix used by the loading cells below.
fold = 1

In [None]:
# Load the four pickles written for the selected fold (x/y for train and
# validation), in the same order as the names on the left-hand side.
# NOTE: read_pickle unpickles the file, so only load files you trust.
x_train, y_train, x_val, y_val = (
    pd.read_pickle(template.format(fold))
    for template in (
        'Output/x_train_fold_{}.pkl',
        'Output/y_train_fold_{}.pkl',
        'Output/x_val_fold_{}.pkl',
        'Output/y_val_fold_{}.pkl',
    )
)

In [None]:
# Locate and load the saved model for the selected fold.
# os.listdir returns entries in arbitrary order, so the candidates are sorted
# to make the choice reproducible when several files share the prefix.
# NOTE(review): a prefix such as "fold_1" also matches names like "fold_10...";
# confirm the naming scheme in Models/ if there can be ten or more folds.
path = 'Models/'
prefix = "fold_{}".format(fold)
matching = sorted(name for name in os.listdir(path) if name.startswith(prefix))
if not matching:
    # Fail here instead of later with a NameError or a stale `model`
    # left over from a previous kernel run.
    raise FileNotFoundError(
        "No model file starting with '{}' found in '{}'".format(prefix, path)
    )
for fname in matching:
    print(fname)
# When several files match, the last one in sorted order is used.
# joblib.load unpickles the file, so only load models from a trusted source.
model = joblib.load(os.path.join(path, fname))

In [None]:
# Score the validation rows with the loaded model and keep the result in a
# one-column frame ('pred') that shares the validation index.
data = pd.DataFrame(model.predict(x_val), index=x_val.index, columns=['pred'])
data.shape

In [None]:
# Split the validation predictions in two: `train` is a seeded random 50%
# sample, `val` is every row whose index label was not sampled.
train = data.sample(random_state=42, frac=0.5)
val = data.drop(index=train.index)
(train.shape, val.shape)

In [None]:
model = LogisticRegressionCV(cv=5, random_state=0)
model.fit(train, y_val.loc[train.index])

In [None]:
test_calibrated = model.predict_proba(val)
test_calibrated = pd.DataFrame(test_calibrated[:, 1], index=val.index, columns=['pred_calibrated'])

In [None]:
# Reliability curves on the held-out half, before and after calibration.
# calibration_curve returns (fraction of positives, mean predicted value) per
# bin, hence the y-then-x unpacking order.  The score columns are passed
# explicitly as 1-D data rather than as whole one-column frames.
# NOTE(review): calibration_curve expects values in [0, 1]; confirm the raw
# 'pred' scores satisfy that.
gb_y_test, gb_x_test = calibration_curve(y_val.loc[val.index], val['pred'], n_bins=10)
gb_y_test_cal, gb_x_test_cal = calibration_curve(
    y_val.loc[test_calibrated.index], test_calibrated['pred_calibrated'], n_bins=10
)

fig, ax = plt.subplots()
# The diagonal is the reference for a perfectly calibrated model.
ax.plot([0, 1], [0, 1], linestyle='--', label='Perfectly calibrated')
ax.plot(gb_x_test, gb_y_test, marker='.', lw=2, color='black', label='Original', alpha=.8)
ax.plot(gb_x_test_cal, gb_y_test_cal, marker='.', lw=2, color='r', label='Calibrated', alpha=.8)
ax.set_xlabel('Mean predicted value')
ax.set_ylabel('Fraction of positives')
ax.set_title('Calibration curve (held-out half of the validation fold)')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Metric of the uncalibrated 'pred' scores on the held-out half.
holdout_labels = y_val.loc[val.index].to_numpy()
amex_metric(holdout_labels, val['pred'].to_numpy())

In [None]:
# Metric of the calibrated scores, evaluated on the rows of `test_calibrated`.
calibrated_labels = y_val.loc[test_calibrated.index].to_numpy()
amex_metric(calibrated_labels, test_calibrated['pred_calibrated'].to_numpy())

In [None]:
# Show the raw and calibrated scores side by side, aligned on the index.
pd.concat((val, test_calibrated), axis='columns')

In [None]:
# Load the frame that is scored in the prediction cell below and show its shape.
# NOTE: read_pickle unpickles the file, so only load files you trust.
test_data = pd.read_pickle('Output/test.pkl')
test_data.shape

In [None]:
# Predict on the test frame and wrap the result in a one-column frame
# ('prediction') that shares the test index.
# NOTE(review): this relies on `model` still being the model loaded from
# Models/ -- confirm the name has not been rebound to another estimator by an
# earlier cell before trusting this output.
test_pred = model.predict(test_data)
pred_test_df = pd.DataFrame(test_pred, index=test_data.index, columns=['prediction'])

In [None]:
# Display the prediction frame for a quick visual check.
pred_test_df

In [None]:
# Write the predictions to CSV; to_csv also writes the index by default.
pred_test_df.to_csv('Output/pred_test_lgb.csv')