In [1]:
import numpy as np 
import pandas as pd 
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
#from sklearn.inspection import permutation_importance

import time
import xgboost as xgb


import joblib

## Constants:

In [2]:
RANDOM_STATE = 84 

NAN_VALUE = -127

#k-folds:
FOLDS = 5

In [None]:
%%time
train = pd.read_parquet("data_V23_xgboostV10/train_processed.parquet")
train = train.sort_values("customer_ID").reset_index(drop=True)
#train = train.replace(np.nan, -127)
print(train.shape)
features = [col for col in train.columns if "target" not in col and "customer_ID" not in col]
gc.collect()

In [5]:
# from sklearn.model_selection import RandomizedSearchCV
# import xgboost
# from xgboost import XGBClassifier
# from sklearn.utils.fixes import loguniform

# params = {
#         "learning_rate": [0.01, 0.02],
#         "min_child_weight": [1, 5, 10],
#         "max_depth":[3, 4, 5, 6],
#         "max_delta_step":[0,1,5,10,20],
#         "subsample":[0.7, 0.8, 0.88],
#         "colsample_bytree":[0.2, 0.4, 0.6, 0.8],
#         "colsample_bylevel":[0.2, 0.4, 0.6, 0.8],
#         "reg_lambda": loguniform(1e-9,100.0),
#         "reg_alpha": loguniform(1e-9,100.0),
#         "gamma": loguniform(1e-9,0.5),
#         "scale_pos_weight": loguniform(1e-9,500.0),
#         }
# skf = StratifiedKFold(n_splits=FOLDS, shuffle = True, random_state = RANDOM_STATE)
# xgboost = XGBClassifier(n_estimators=1000, objective="binary:logistic",
#                     tree_method='gpu_hist',
#                     "eval_metric"="binary_logloss",
#                        )
# random_search = RandomizedSearchCV(xgboost, param_distributions=params, n_iter=10,
#                                    n_jobs=1, cv=skf.split(X,y), verbose=2, random_state=RANDOM_STATE )
# random_search.fit(X,y)

In [6]:
# random_search.best_params_

### AmEx metric:

In [7]:
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
# https://www.kaggle.com/code/rohanrao/amex-competition-metric-implementations
import numpy as np

def amex_metric_numpy(y_true: np.array, y_pred: np.array) -> float:

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descring prediction values
    indices = np.argsort(y_pred)[::-1]
    preds, target = y_pred[indices], y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)


def amex_metric(y_true, y_pred):
#     @MARTIN KOVACEVIC BUVINIC  ; https://www.kaggle.com/code/ragnar123/amex-lgbm-dart-cv-0-7977
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1]/gini[0] + top_four)


In [8]:
# Set model parameters:
XGB_parameters = {
            "objective":"binary:logistic",
            "max_depth": 4,
            "alpha": 0.1,
            "gamma":1.5,
            "lambda":70,
            "learning_rate": 0.02, #(maybe to 0.01)
            "subsample":0.8,
           # "scale_pos_weight": 1,
            #"min_child_weight": 1,
            "colsample_bytree": 0.2, # for dart:0.20, or 0.6 #"feature_fraction";is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.
#            "eval_metric":"auc", #binary_logloss
#              #dart param:
#             "booster": "dart",
#             "max_leaves": 100,
#             "min_child_weight": 40,
            "tree_method" : 'gpu_hist',  
            "gpu_id" :0,
            "predictor" : "gpu_predictor",
            "n_jobs" : -1,
        }              


In [9]:
# print('XGB Version',xgb.__version__)

In [10]:
gc.collect()

0

In [11]:
original_cat_col = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
cat_col = [col + "_last" for col in original_cat_col]
cat_col += [col + "_2ndlast" for col in original_cat_col]

In [12]:
for col in train[features]:
        train[col] = train[col].astype(np.float32)

In [16]:
#%%time
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
acc_list = []
y_val_full = []
y_pred_full = []
customer_id_list = []


for fold, (train_index, val_index) in enumerate(skf.split(train[features], train["target"])):
 
#     x_train_fold, x_val_fold = X.iloc[train_index], X.iloc[val_index]
#     y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
    
    x_train_fold, x_val_fold = train[features].iloc[train_index], train[features].iloc[val_index]
    y_train_fold, y_val_fold = train["target"].iloc[train_index], train["target"].iloc[val_index]
    
    #### XGBoost
    print(f"XGBoost fold #: {fold}")
        
    train_matrix = xgb.DMatrix(data=x_train_fold, label=y_train_fold)
    val_matrix = xgb.DMatrix(data=x_val_fold, label=y_val_fold)
    
    del x_train_fold,y_train_fold, x_val_fold
    gc.collect()
    xgb_clf = xgb.train(XGB_parameters, 
                dtrain=train_matrix,
                evals=[(train_matrix,"train"),(val_matrix,"valid")],
                num_boost_round=15000,
                early_stopping_rounds=1500,
                verbose_eval=1000) 
    
    
    ### load model #####
#     xgb_clf = xgb.Booster()
#     xgb_clf.load_model(f"xgboost_v10/xgb_fold_{fold}.xgb")
    ####################

    y_pred = xgb_clf.predict(val_matrix)
#     acc = amex_metric_numpy(y_val_fold.to_pandas().values, y_pred) # for GPU
    acc = amex_metric_numpy(y_val_fold.values, y_pred) # for CPU
    
    
    acc_list.append(acc)
#     y_val_full.append(y_val_fold.to_pandas().to_numpy().ravel())# for GPU
    y_val_full.append(y_val_fold.to_numpy().ravel())# for CPU
    y_pred_full.append(y_pred)
    customer_id_list.append(train["customer_ID"].iloc[val_index])

    
    print(f"{fold}, acc: {acc}")
    xgb_clf.save_model(f"xgb_fold_{fold}.xgb")
    print(f"model: {fold} saved")
    
#     if fold == 0:
#         print("starting permutation importance")
#         result = permutation_importance(
#         xgb_clf, x_test_fold, y_test_fold, n_repeats=3, random_state=42, n_jobs=-1
#         )
#         print("finished, sorting:")
#         sorted_importances_idx = result.importances_mean.argsort()
#         print("to pandas")
# #         importances = pd.DataFrame(
# #             result.importances[sorted_importances_idx].T,
# #             columns=x_test_fold.columns[sorted_importances_idx],
# #         )
#         importances = pd.DataFrame({"column":x_test_fold.columns[sorted_importances_idx],
#                                    "importance":result.importances_mean[sorted_importances_idx].T})
#         print("saving:")
#         importances.to_excel("permutationimportance_repeats3_V1.xlsx", index=False)
#         print(importances.head())


    
    
    del xgb_clf, train_matrix, val_matrix, acc, y_pred, y_val_fold
    gc.collect()
    

    
# xgboost:
print(f"mean: {np.mean(acc_list)}")

# xgboost
y_val_full_array = np.concatenate(y_val_full, axis=None)
y_pred_full_array = np.concatenate(y_pred_full, axis=None)
acc_full = amex_metric_numpy(y_val_full_array,y_pred_full_array)
print(f"Acc over full dataset: {acc_full}")

customer_id_list_conc = np.concatenate(customer_id_list, axis=None)
df_OOF = pd.DataFrame()
df_OOF["customer_ID"] = pd.Series(customer_id_list_conc)
df_OOF["prediction"] = pd.Series(y_pred_full_array)
del y_val_full_array, y_pred_full_array,acc_full
gc.collect()




XGBoost fold #: 0
0, acc: 0.7946673169778943
model: 0 saved
XGBoost fold #: 1
1, acc: 0.794828952267217
model: 1 saved
XGBoost fold #: 2
2, acc: 0.7972735649909626
model: 2 saved
XGBoost fold #: 3
3, acc: 0.7946604227252587
model: 3 saved
XGBoost fold #: 4
4, acc: 0.7962572051743686
model: 4 saved
mean: 0.7955374924271401
Acc over full dataset: 0.795464785135856


0

In [17]:
df_OOF

Unnamed: 0,customer_ID,prediction
0,-9223189665817919541,0.001296
1,-9223188534444851899,0.024251
2,-9222977106653703082,0.689006
3,-9222795947410574988,0.987188
4,-9222571608979063563,0.749123
...,...,...
458908,9222830459409282183,0.010114
458909,9222865474092465587,0.000637
458910,9222877733476602020,0.000089
458911,9223073742590486866,0.000940


In [18]:
df_OOF.to_csv(f"xgboost_V10_data23_seed{RANDOM_STATE}_OOF.csv")

In [None]:
del train
gc.collect()

# test:

In [None]:
# clear GPU memory
# torch.cuda.empty_cache()
# cuda.select_device(0)
# cuda.close()
# cuda.select_device(0)
gc.collect()

In [None]:
# %%time 
chunks = 8
test_rows = 924621

test_submission = pd.DataFrame()

for chunk in range(chunks):

    print(f"chunk:{chunk+1}")
    test = pd.read_parquet(f"../input/amex-data-preparation/test_processed_chunk{chunk}.parquet")
#     test = test.sort_values(["customer_ID","S_2"])
#     test = test.reset_index(drop=True)
    #test = test.replace(np.nan, -127)
    print(f"Loaded chunk#{chunk+1}")
    gc.collect()
    test_submission_chunk = pd.DataFrame()
    test_submission_chunk["customer_ID"] = test["customer_ID"]
    test = test.drop(columns=["customer_ID"])
    for col in test:
        test[col] = test[col].astype(np.float32)
    # XGBoost:
    #test = np.array(test)
    test = xgb.DMatrix(data=test)#, missing=-127)
    predictions = []
    print("Starting predictions:")
    for fold in range(FOLDS):
        xgb_clf = xgb.Booster()
        xgb_clf.load_model(f"./xgb_fold_{fold}.xgb")
    
        test_pred = xgb_clf.predict(test)
        predictions.append(test_pred)
        
        print(f"prediction added for fold:{fold}")
        del xgb_clf, test_pred
        gc.collect()
    mean_predictions = np.mean(np.column_stack(predictions), axis=1)
    test_submission_chunk["prediction"] = mean_predictions
    test_submission = test_submission.append(test_submission_chunk)
    del test_submission_chunk
    gc.collect()
    
    
print("Finished predicting!")

In [None]:
# np.std(np.column_stack(predictions_lgb), axis=1)

In [None]:
len(test_submission)

In [None]:
test_submission.head(5)

In [None]:
# test_submission.describe()

In [None]:
# test_submission.sort_values("prediction", ascending=False)
# test_submission[test_submission["prediction"] < 0.1]

In [None]:
test_submission.to_csv("submission.csv", index=False)