In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm

In [2]:
def amex_metric_numpy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return ``0.5 * (G + D)`` for binary labels and predicted scores.

    Rows are ranked by descending prediction. Each negative row (label 0)
    carries weight 20 and each positive row (label 1) carries weight 1.

    * ``D`` is the fraction of all positives found among the top-ranked rows
      whose cumulative normalised weight is at most 4%.
    * ``G`` is the weighted Gini coefficient of the ranking divided by the
      closed-form maximum for the given positive/negative counts.

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Labels, expected to be 0/1.
    y_pred : array-like of shape (n,)
        Scores; larger means "more likely positive".

    Returns
    -------
    float
        The averaged metric ``0.5 * (G + D)``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.

    Notes
    -----
    The result is not meaningful when ``y_true`` contains no positives or no
    negatives, because the normalisers ``n_pos`` / ``gini_max`` become zero.
    """
    # Work on plain positional arrays. Callers often pass pandas Series; with a
    # Series, ``y_true[indices]`` is a *label* lookup, which is slow and pairs
    # the wrong rows whenever the Series index is not exactly 0..n-1.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descending prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    # (weight is 20 for a negative row and 1 for a positive row)
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

In [3]:
# Directory holding the out-of-fold (OOF) prediction CSVs of the base models.
OOF_DIR = "../input/amex-dataforensenmble"


def _load_oof(file_name: str) -> pd.DataFrame:
    """Load one OOF prediction CSV from ``OOF_DIR`` in a canonical row order.

    The frame is sorted by ``customer_ID``, the ``"Unnamed: 0"`` column is
    dropped (presumably the index column written by ``to_csv`` -- confirm),
    and the index is renumbered 0..n-1 so that frames loaded this way can be
    combined row by row.
    """
    return (
        pd.read_csv(f"{OOF_DIR}/{file_name}")
        .sort_values("customer_ID")
        .drop(columns="Unnamed: 0")
        .reset_index(drop=True)
    )


LGBM_dartV3_OOF = _load_oof("LGBM_DART-V3_data29_seed84_OOF.csv")
LGBM_dartV4_OOF = _load_oof("LGBM_DART-V4_data32_seed84_OOF.csv")
tabnet_V1_OOF = _load_oof("TABNET_V1_data29_seed84_OOF.csv")
xgboost_V10_OOF = _load_oof("xgboost_V10_data23_seed84_OOF.csv")
ANN_V2_OOF = _load_oof("ANN_V2_data29_seed84_OOF.csv")
LGBM_V25_OOF = _load_oof("lgbm_v25_datav20_OOF.csv")

In [4]:
# Ground-truth labels in the same canonical order as the OOF frames:
# sorted by customer_ID with a fresh 0..n-1 index. Without the index reset the
# frame keeps its pre-sort row labels, and any label-based operation (Series
# indexing / arithmetic against the OOF predictions) can pair a label with the
# wrong customer's prediction.
# Only the two columns that are used are read from the parquet file.
true_y_df = (
    pd.read_parquet(
        "../input/amex-data-preparation/train_processed.parquet",
        columns=["customer_ID", "target"],
    )[["customer_ID", "target"]]
    .sort_values("customer_ID")
    .reset_index(drop=True)
)

In [5]:
# Label -> out-of-fold prediction frame, in the order the scores are reported.
OOF_dic = dict(
    LGBM_dartV3_OOF=LGBM_dartV3_OOF,
    LGBM_dartV4_OOF=LGBM_dartV4_OOF,
    tabnet_V1_OOF=tabnet_V1_OOF,
    xgboost_V10_OOF=xgboost_V10_OOF,
    ANN_V2_OOF=ANN_V2_OOF,
    LGBM_V25_OOF=LGBM_V25_OOF,
)

# Score every base model on its own against the true labels.
print("Amex scores models:")
for model_label, oof_frame in OOF_dic.items():
    model_score = amex_metric_numpy(true_y_df["target"], oof_frame["prediction"])
    print(f"{model_label}:{model_score}")

Amex scores models:
LGBM_dartV3_OOF:0.7972244432729978
LGBM_dartV4_OOF:0.7966134974634839
tabnet_V1_OOF:0.7852546758090266
xgboost_V10_OOF:0.7954647856555098
ANN_V2_OOF:0.7877552113468742
LGBM_V25_OOF:0.7944904215932194


In [6]:
# Peek at the first five rows of one OOF frame to confirm its layout.
xgboost_V10_OOF.head(n=5)

Unnamed: 0,customer_ID,prediction
0,-9223358381327749917,0.642245
1,-9223193039457028513,0.000477
2,-9223189665817919541,0.001296
3,-9223188534444851899,0.024251
4,-9223173911659837606,0.853221


In [7]:
# Score one fixed blend of four OOF models. The terms are summed in the order
# DART V3, DART V4, XGBoost V10, ANN V2.
oof_dart_v3_part = LGBM_dartV3_OOF["prediction"] * 0.2
oof_dart_v4_part = LGBM_dartV4_OOF["prediction"] * 0.325
oof_xgb_v10_part = xgboost_V10_OOF["prediction"] * 0.325
oof_ann_v2_part = ANN_V2_OOF["prediction"] * 0.15
test_pred = oof_dart_v3_part + oof_dart_v4_part + oof_xgb_v10_part + oof_ann_v2_part
amex_metric_numpy(true_y_df["target"], test_pred)

# Recorded result for weights [0.2, 0.325, 0.325, 0.15]: 0.79819828300309


0.79819828300309

In [8]:
# Grid search over blend weights for (LGBM DART V3, LGBM DART V4, XGBoost V10,
# ANN V2). Weights are enumerated in thousandths so that the "weights sum to 1"
# condition is an exact integer test rather than a float comparison.
best_amex_metric = -np.inf  # any real score beats this, so best_weight is always set
best_weight = []
weight_list = []
# Allowed values for the fourth (ANN) weight, in thousandths: 0, 15, ..., 285.
fourth_weight_grid = set(range(0, 300, 15))
for a in tqdm(range(0, 1000, 25)):
    for b in range(0, 1000, 25):
        for c in range(0, 1000, 25):
            # Once a, b and c are fixed, at most one fourth weight makes the
            # total 1000, so compute it directly instead of looping over the
            # grid. The resulting list has the same entries in the same order.
            d = 1000 - a - b - c
            if d in fourth_weight_grid:
                weight_list.append([a / 1000, b / 1000, c / 1000, d / 1000])

# The column lookups do not depend on the weights: take them once, as plain
# NumPy arrays, so each iteration is pure positional array arithmetic. Rows are
# paired by position, i.e. by the shared customer_ID sort order of the frames.
target_values = true_y_df["target"].to_numpy()
dart_v3_values = LGBM_dartV3_OOF["prediction"].to_numpy()
dart_v4_values = LGBM_dartV4_OOF["prediction"].to_numpy()
xgb_v10_values = xgboost_V10_OOF["prediction"].to_numpy()
ann_v2_values = ANN_V2_OOF["prediction"].to_numpy()

print("testing weights")
for weight in tqdm(weight_list):
    test_pred = (dart_v3_values * weight[0] + dart_v4_values * weight[1] +
                 xgb_v10_values * weight[2] + ann_v2_values * weight[3])
    amex_metric = amex_metric_numpy(target_values, test_pred)
    # Strict ">" keeps the first weight combination that reaches the best score.
    if amex_metric > best_amex_metric:
        best_amex_metric = amex_metric
        best_weight = weight
print(f"Best amex metric: {best_amex_metric}")
print(f"Best weights: {best_weight}")

100%|██████████| 40/40 [00:00<00:00, 133.87it/s]


testing weights


100%|██████████| 2757/2757 [06:21<00:00,  7.23it/s]

Best amex metric: 0.79819828300309
Best weights: [0.2, 0.325, 0.325, 0.15]





In [9]:
# weight_list

In [10]:
# 0.7978153797327203 0.6

In [11]:
# Read every base model's test-set submission file. The names on the left are
# bound, in order, to the CSV paths listed on the right.
(
    xgboost_v10_submission,
    lgbm_dart3_submission,
    lgbm_dart4_submission,
    tabnet_submission,
    ANN_V2_submission,
    LGBM_V25_submission,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/amex-datafrom04aug/submission_XGBoostV10.csv",
        "../input/amex-dataforensenmble/submission_dart_V3.csv",
        "../input/amex-dataforensenmble/submission_dart_data32_V4.csv",
        "../input/amex-dataforensenmble/tabnet_submission_seed84_V1.csv",
        "../input/amex-dataforensenmble/ANN_V2_dataV29_submission.csv",
        "../input/amex-dataforensenmble/lgbm_v25_submission.csv",
    )
)

In [12]:
# Put every submission in the same customer order AND renumber the rows.
# pandas aligns Series arithmetic on index labels, not on position, so sorting
# alone does not change which rows get added together when the frames are
# blended: each row would still be paired by its original CSV row number.
# Resetting the index makes label == position in the sorted order.
# NOTE(review): this assumes all files contain the same set of customer_IDs --
# confirm before relying on the blend.
xgboost_v10_submission = xgboost_v10_submission.sort_values("customer_ID").reset_index(drop=True)
lgbm_dart3_submission = lgbm_dart3_submission.sort_values("customer_ID").reset_index(drop=True)
lgbm_dart4_submission = lgbm_dart4_submission.sort_values("customer_ID").reset_index(drop=True)
tabnet_submission = tabnet_submission.sort_values("customer_ID").reset_index(drop=True)
ANN_V2_submission = ANN_V2_submission.sort_values("customer_ID").reset_index(drop=True)
LGBM_V25_submission = LGBM_V25_submission.sort_values("customer_ID").reset_index(drop=True)


In [13]:
# Start the final submission frame from the customer IDs of the DART V3
# submission; the frame takes over that column's index.
submission_df = pd.DataFrame({"customer_ID": lgbm_dart3_submission["customer_ID"]})

In [14]:
# submission_df

In [15]:
# Blend the test-set predictions with weights [0.2, 0.325, 0.325, 0.15] for
# (DART V3, DART V4, XGBoost V10, ANN V2), summed in that order.
sub_dart_v3_part = lgbm_dart3_submission["prediction"] * 0.2
sub_dart_v4_part = lgbm_dart4_submission["prediction"] * 0.325
sub_xgb_v10_part = xgboost_v10_submission["prediction"] * 0.325
sub_ann_v2_part = ANN_V2_submission["prediction"] * 0.15
submission_df["prediction"] = sub_dart_v3_part + sub_dart_v4_part + sub_xgb_v10_part + sub_ann_v2_part

In [16]:
# Show the blended submission frame as this cell's output for a visual check
# of the customer_ID and prediction columns before it is written to disk.
submission_df

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.023483
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.001012
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.042073
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.214319
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.890982
...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,0.012840
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,0.858027
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,0.432325
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,0.294664


In [17]:
# Write the final blend without the DataFrame index column.
submission_df.to_csv(path_or_buf="submission.csv", index=False)