In [1]:
import numpy as np 
import pandas as pd 
import gc

from sklearn.model_selection import StratifiedKFold

from time import time
from tqdm import tqdm
import lightgbm as lgb

import joblib


# import cudf 
#import torch
#from numba import cuda
#import missingno as msno

## Constants:

In [2]:
# Number of stratified cross-validation folds (one model is trained and saved per fold).
FOLDS = 5

# Seed shared by the fold splitter and LightGBM; it is also embedded in output file names.
RANDOM_STATE = 84

# Missing-value sentinel.
# NOTE(review): not referenced by any visible cell; presumably used by the preprocessing step. Confirm.
NAN_VALUE = -127

In [4]:
%%time
# Load the preprocessed training table and order it by customer so row positions are stable.
train = (
    pd.read_parquet("data_v29/train_processed.parquet")
    .sort_values("customer_ID")
    .reset_index(drop=True)
)
print(train.shape)

# Model inputs: every column whose name mentions neither the label nor the customer key.
features = [
    col
    for col in train.columns
    if not ("target" in col or "customer_ID" in col)
]

gc.collect()

(458913, 2077)
CPU times: total: 31.1 s
Wall time: 16.7 s


0

### AmEx metric:

In [7]:
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
# https://www.kaggle.com/code/rohanrao/amex-competition-metric-implementations

def amex_metric_numpy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the AmEx competition metric: 0.5 * (normalized weighted gini + capture rate at 4%).

    Parameters
    ----------
    y_true : array-like
        Binary labels (1 = positive). Lists, pandas objects and column-shaped
        ``(n, 1)`` arrays are accepted; they are flattened and handled positionally.
    y_pred : array-like
        Predicted scores, same length as ``y_true``; higher means more likely positive.

    Returns
    -------
    float
        ``0.5 * (g + d)`` where ``g`` is the normalized weighted gini and ``d`` is the
        share of positives found within the top 4% of cumulative row weight.
        The result is NaN when ``y_true`` contains no positives or no negatives
        (division by zero).
    """
    # Coerce to flat ndarrays so that fancy indexing below is purely positional
    # (a pandas Series would otherwise be indexed by label) and so that a column
    # vector cannot broadcast into an (n, n) matrix in the gini computation.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descending prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weight (negatives weigh 20, positives weigh 1)
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient (closed form for a perfect ranking)
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)


def amex_metric(y_true, y_pred):
    """Return the AmEx metric: 0.5 * (gini of predictions / gini of a perfect ranking + top-4% capture).

    Rows are weighted 20 when the label is 0 and 1 otherwise.
    Credit: Martin Kovacevic Buvinic, https://www.kaggle.com/code/ragnar123/amex-lgbm-dart-cv-0-7977
    """

    def _ranked(sort_col):
        # (label, prediction) pairs ordered by the chosen column, largest first.
        pairs = np.array([y_true, y_pred]).T
        order = pairs[:, sort_col].argsort()[::-1]
        return pairs[order]

    def _weighted_gini(sort_col):
        # Weighted gini of the ranking induced by `sort_col` (0 = labels, 1 = predictions).
        ranked = _ranked(sort_col)
        row_weight = np.where(ranked[:, 0] == 0, 20, 1)
        random_curve = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = ranked[:, 0] * row_weight
        lorentz_curve = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz_curve - random_curve) * row_weight)

    # Share of positives captured within the first 4% of cumulative weight.
    by_pred = _ranked(1)
    row_weight = np.where(by_pred[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(row_weight))
    captured = by_pred[np.cumsum(row_weight) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    gini_pred = _weighted_gini(1)
    gini_best = _weighted_gini(0)
    return 0.5 * (gini_pred / gini_best + top_four)

def lgb_amex_metric(y_pred, y_true):
    """Custom LightGBM eval function wrapping ``amex_metric``.

    ``y_true`` is the dataset object handed over by LightGBM; its labels are
    read with ``get_label()``. Returns ``(name, value, True)``, where the final
    flag marks the metric as higher-is-better.
    """
    score = amex_metric(y_true.get_label(), y_pred)
    return "amex_metric", score, True

In [8]:
# Set model parameters:
       

# LightGBM training configuration (DART boosting, binary objective).
LGBM_parameters = dict(
    objective="binary",
    metric="auc",  # alternative that was tried: "binary_logloss"
    boosting="dart",
    seed=RANDOM_STATE,
    num_leaves=100,
    learning_rate=0.01,
    feature_fraction=0.2,
    bagging_freq=10,
    bagging_fraction=0.5,
    lambda_l2=2,
    min_data_in_leaf=40,
    # max_bin=63,
    n_jobs=-1,
    # GPU options, currently disabled:
    # device="gpu",
    # gpu_platform_id=0,
    # gpu_device_id=0,
)

In [9]:
# Force a garbage-collection pass; as the cell's last expression its return value is displayed.
gc.collect()

0

In [10]:
# Raw categorical feature names as they appear before aggregation.
original_cat_col = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]

# Aggregated categorical columns: all "_last" variants first, then all "_2ndlast" variants.
cat_col = [
    f"{name}{suffix}"
    for suffix in ("_last", "_2ndlast")
    for name in original_cat_col
]

In [11]:
# Turn the -1 placeholder in the categorical columns into NaN (overwrites those columns of `train`).
# NOTE(review): presumably -1 encodes "missing" from preprocessing; confirm.
train[cat_col] = train[cat_col].replace(-1, np.nan)

In [13]:
# def score_amex(estimator, X, y):
#     y_score = estimator.predict(X)  # You could also use the binary predict, but probabilities should give you a more realistic score.
#     return roc_auc_score(y, y_score)

In [14]:
# # Reporting util for different optimizers
# def report_perf(optimizer, X, y, title="model", callbacks=None):
#     """
#     A wrapper for measuring the runtime and performance of different optimizers
    
#     optimizer = a sklearn or a skopt optimizer
#     X = the training set 
#     y = our target
#     title = a string label for the experiment
#     """
#     start = time()
    
#     if callbacks is not None:
#         optimizer.fit(X, y, callback=callbacks)
#     else:
#         optimizer.fit(X, y)
        
#     d=pd.DataFrame(optimizer.cv_results_)
#     best_score = optimizer.best_score_
#     best_score_std = d.iloc[optimizer.best_index_].std_test_score
#     best_params = optimizer.best_params_
    
#     print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
#            + u"\u00B1"+" %.3f") % (time() - start, 
#                                    len(optimizer.cv_results_['params']),
#                                    best_score,
#                                    best_score_std))    
#     print(f"Best parameters:{best_params}")
    
#     return best_params

In [15]:
# from sklearn.model_selection import RandomizedSearchCV
# import xgboost
# import lightgbm as lgb
# from skopt import BayesSearchCV
# from skopt.callbacks import DeadlineStopper, DeltaYStopper
# from skopt.space import Real, Categorical, Integer
# from sklearn.metrics import auc
# from sklearn.metrics import make_scorer
# # the kaggle book
# #https://www.kaggle.com/code/lucamassaron/scikit-optimize-for-lightgbm


# scorer = make_scorer(auc, greater_is_better=True, needs_threshold=True)
# params = {
#         "learning_rate": [0.01],
#         "n_estimators":[2500],
#         "max_depth":Integer(1,16), #int 1-16
#         "num_leaves": Integer(2,512),#int between 2 and 2^max_depth
#         "min_data_in_leaf": Integer(0,300), # int 0-300
#     #     "min_gain_to_split": , # float 0-15; avoid unnecessary tree splits and reduce overfitting
#         "max_bin": Integer(32,512), # int between 32 and 512; larger than default 255 risk of overfitting
#         "subsample": Real(0.01, 1.0, "uniform"), # real number between 0.01 and 1.0; portion of sample be used
#         "subsample_freq":Integer(0,10), # int between 0-10
#         "feature_fraction":Real(0.01, 1.0, "uniform"), # real between 0.1 and 1.0; features to be subsampled
#     #     "subsample_for_bin": Integer(30,1000000),# int 30 and number of samples
#         "reg_lambda":Real(1e-9, 100.0, "log-uniform"), # real number 0 and 100.0, log-uniform; L2
#         "reg_alpha": Real(1e-9, 100.0, "log-uniform"), #real number 0 and 100.0, log-uniform;l1
#         "scale_pos_weight": Real(1e-9, 500.0, "log-uniform"),# real 1e-6 and 500 log uniform; weights the pos against neg cases
#         }
# skf = StratifiedKFold(n_splits=FOLDS, shuffle = True, random_state = RANDOM_STATE)
# lgbm = lgb.LGBMClassifier(boosting_type="gbdt",
#                           metric="auc",
#                           objective="binary",
#                           random_state = RANDOM_STATE,
#                           n_jobs=-1,
#                           verbose=-1
#                        )
# BayesSearch = BayesSearchCV(estimator=lgbm,
#                             search_spaces=params,
# #                             scoring=scorer,
#                               n_iter=60,
# #                             n_points= 3, #number of hyperparameters evaluated at each time
#                               n_jobs=1,
# #                             iid=False,
#                             return_train_score=False,
#                             refit=False,
#                             optimizer_kwargs={"base_estimator":"GP"},
#                               cv=skf,
#                               verbose=2, random_state=RANDOM_STATE )

# deltaY = DeltaYStopper(delta=0.0001)
# time_limit = DeadlineStopper(total_time=60*60*12)

# best_params = report_perf(BayesSearch, X, y, "lgbm", callbacks=time_limit)

In [17]:
# %%time
# Cross-validated training: one DART model per stratified fold, saved to disk,
# plus out-of-fold (OOF) predictions collected for the whole training set.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)

# Boosting rounds per fold. An unknown fold index raises KeyError instead of
# silently reusing the previous fold's value or hitting an unbound name.
BOOST_ROUNDS_BY_FOLD = {0: 10900, 1: 11100, 2: 8000, 3: 14600, 4: 10800}

target_all = train["target"]

acc_list_lgb = []       # per-fold AmEx metric
y_val_full_lgb = []     # per-fold validation labels
y_pred_full_lgb = []    # per-fold validation predictions
customer_id_list = []   # per-fold validation customer IDs

# The stratified split depends only on the number of rows and on the labels,
# so a placeholder X avoids materialising a copy of the full feature matrix.
for fold, (train_index, val_index) in enumerate(skf.split(np.zeros(len(train)), target_all)):
    # To process a single fold only, e.g.:
    #     if fold != 4:
    #         continue

    boost_rounds = BOOST_ROUNDS_BY_FOLD[fold]

    print(f"starting fold {fold}")

    # Take the fold rows first, then the feature columns: same result as
    # train[features].iloc[...], without copying every row of the frame for each slice.
    x_train_fold = train.iloc[train_index][features]
    x_val_fold = train.iloc[val_index][features]
    y_train_fold, y_val_fold = target_all.iloc[train_index], target_all.iloc[val_index]

    #### LGBM:
    train_lgb = lgb.Dataset(x_train_fold, y_train_fold, categorical_feature=cat_col)
    val_lgb = lgb.Dataset(x_val_fold, y_val_fold, categorical_feature=cat_col)

    # Drop the local references to the training slice before the long training run.
    del x_train_fold, y_train_fold
    gc.collect()

    lgb_model = lgb.train(
        params=LGBM_parameters,
        train_set=train_lgb,
        num_boost_round=boost_rounds,
        valid_sets=[train_lgb, val_lgb],
        feval=lgb_amex_metric,
        callbacks=[
            # LightGBM disables early stopping in dart mode (it only logs a warning),
            # so training always runs for `boost_rounds`; the callback is kept so it
            # takes effect if the boosting type is switched.
            lgb.early_stopping(1500),
            # Replaces the deprecated/removed `verbose_eval=50` argument.
            lgb.log_evaluation(period=50),
        ],
    )

    # Save the fold model; the test-inference cell reloads it from this path.
    del train_lgb, val_lgb
    gc.collect()
    joblib.dump(lgb_model, f"results/lgbm_fold{fold}_seed{RANDOM_STATE}.pkl")
    print("model saved")

    # To rebuild only the OOF predictions, skip training above and load the saved model:
    #     lgb_model = joblib.load(f"results/lgbm_fold{fold}_seed{RANDOM_STATE}.pkl")

    y_pred_lgb = lgb_model.predict(x_val_fold)
    acc_lgb = amex_metric_numpy(y_val_fold.values, y_pred_lgb)

    acc_list_lgb.append(acc_lgb)
    y_val_full_lgb.append(y_val_fold.to_numpy().ravel())
    y_pred_full_lgb.append(y_pred_lgb)
    customer_id_list.append(train["customer_ID"].iloc[val_index])

    print(f"lgb: fold:{fold}, acc: {acc_lgb}")

    del lgb_model, y_pred_lgb, acc_lgb, y_val_fold, x_val_fold
    gc.collect()


# lgb: mean of the per-fold scores
print(f"LGB: mean: {np.mean(acc_list_lgb)}")

# lgb: metric over the concatenated out-of-fold predictions
y_val_full_array_lgb = np.concatenate(y_val_full_lgb, axis=None)
y_pred_full_array_lgb = np.concatenate(y_pred_full_lgb, axis=None)
acc_full_lgb = amex_metric_numpy(y_val_full_array_lgb, y_pred_full_array_lgb)
print(f"LGB: Acc over full dataset: {acc_full_lgb}")

# OOF table: rows follow fold order, i.e. the order in which validation slices were appended.
customer_id_list_conc = np.concatenate(customer_id_list, axis=None)
df_OOF = pd.DataFrame(
    {
        "customer_ID": customer_id_list_conc,
        "prediction": y_pred_full_array_lgb,
    }
)
del y_val_full_array_lgb, y_pred_full_array_lgb, acc_full_lgb, acc_list_lgb, target_all
gc.collect()

starting fold 0
lgb: fold:0, acc: 0.7963036426118163
starting fold 1
lgb: fold:1, acc: 0.7968672158469023
starting fold 2
lgb: fold:2, acc: 0.7983153352077621
starting fold 3
lgb: fold:3, acc: 0.7965792770116737
starting fold 4
lgb: fold:4, acc: 0.7978565809797978
LGB: mean: 0.7971844103315904
LGB: Acc over full dataset: 0.7971333793398161


0

In [18]:
# Display the out-of-fold table (customer_ID and prediction for every validation row).
df_OOF

Unnamed: 0,customer_ID,prediction
0,-9223189665817919541,0.001138
1,-9223188534444851899,0.018665
2,-9222977106653703082,0.648444
3,-9222795947410574988,0.990211
4,-9222571608979063563,0.759488
...,...,...
458908,9222830459409282183,0.006317
458909,9222865474092465587,0.000369
458910,9222877733476602020,0.000066
458911,9223073742590486866,0.000684


In [19]:
# Persist the OOF predictions. With pandas' default `index=True` the row index is written
# as an extra unnamed first column.
df_OOF.to_csv(f"LGBM_DART-V3_data29_seed{RANDOM_STATE}_OOF.csv")

In [None]:
#on dataV29: DART-V3
#fold 0; n_esti: 10900; best amex: 0.796504
#fold 1; n_esti: 11100; best amex: 0.798133
#fold 2; n_esti: 8000; best amex: 0.798726 
#fold 3; n_esti: 14600; best amex: 0.797246 
#fold 4; n_esti: 10800; best amex: 0.799269
# mean: 0.7979756 OOF: 0.7972244432729978
######################
# on data 32: DART-V4
# 15000: [0.7943137890316092,0.7964981155279881]
#fold 0; n_esti:13900 ; best amex: 0.795729
#fold 1; n_esti: 8150; best amex: 0.798049
#fold 2; n_esti:13500 ; best amex: 0.798215
#fold 3; n_esti:8450 ; best amex:  0.795595 
#fold 4; n_esti: 7650; best amex: 0.798923 
# mean:0.7973020241502867 oof: 0.7966134974634839



In [None]:
# Drop the training frame before test inference; cells that read `train` cannot be re-run after this.
del train
gc.collect()

# test:

In [None]:
# clear GPU memory (disabled: the torch / numba imports are commented out in the import cell)
# torch.cuda.empty_cache()
# cuda.select_device(0)
# cuda.close()
# cuda.select_device(0)
# Only a host-side garbage-collection pass runs here.
gc.collect()

In [None]:
# %%time 
# Test inference: score each preprocessed test chunk with every saved fold model
# and average the fold predictions.
chunks = 8
test_rows = 924621  # NOTE(review): not used in this cell; presumably the expected total row count. Confirm.

pseudo_test_df = pd.DataFrame()  # NOTE(review): never filled in the visible cells.

# Per-chunk results are collected in a list and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame on each call.
chunk_submissions = []
for chunk in range(chunks):

    print(f"chunk:{chunk+1}")
    test = pd.read_parquet(f"data_v29/test_processed_chunk{chunk}.parquet")
    print(f"Loaded chunk#{chunk+1}")

    gc.collect()
    test_submission_chunk = pd.DataFrame()
    test_submission_chunk["customer_ID"] = test["customer_ID"]
    test = test.drop(columns=["customer_ID"])
    # Same -1 -> NaN replacement that is applied to the training categoricals.
    test[cat_col] = test[cat_col].replace(-1, np.nan)
    # NOTE(review): the remaining columns are passed to predict() as-is; this presumably
    # relies on the chunk files having the training feature order. Confirm.

    # LGBM:
    predictions_lgb = []
    print("Starting predictions LGBM:")
    for fold in range(FOLDS):
        # Models are loaded one at a time and released again to limit peak memory.
        # joblib.load unpickles the file, so only load model files produced by this notebook.
        lgb_model_loaded = joblib.load(f"results/lgbm_fold{fold}_seed{RANDOM_STATE}.pkl")

        test_pred_lgb = lgb_model_loaded.predict(test)
        predictions_lgb.append(test_pred_lgb)

        print(f"prediction added for fold:{fold}")
        del lgb_model_loaded, test_pred_lgb
        gc.collect()

    # One column per fold; stacked once and reused for both statistics.
    fold_predictions = np.column_stack(predictions_lgb)
    mean_predictions_lgb = np.mean(fold_predictions, axis=1)
    std_predictions_lgb = np.std(fold_predictions, axis=1)
    test_submission_chunk["prediction"] = mean_predictions_lgb
    test_submission_chunk["std"] = std_predictions_lgb
    test_submission_chunk["chunk"] = chunk
    chunk_submissions.append(test_submission_chunk)
    del test_submission_chunk, fold_predictions

    gc.collect()

# Row indices of the individual chunks are kept, as DataFrame.append did.
test_submission = pd.concat(chunk_submissions) if chunk_submissions else pd.DataFrame()
del chunk_submissions

print("Finished predicting!")

In [None]:
# Force a garbage-collection pass after inference.
gc.collect()

In [None]:
# np.std(np.column_stack(predictions_lgb), axis=1)

In [None]:
# Number of prediction rows accumulated across all chunks.
# NOTE(review): presumably expected to equal `test_rows`; confirm.
len(test_submission)

In [None]:
# Display the combined predictions (still including the `std` and `chunk` helper columns).
test_submission

In [None]:
# Summary statistics of the numeric columns (prediction, std, chunk).
test_submission.describe()

In [None]:
# Keep only the submission columns. errors="ignore" makes the cell re-runnable:
# without it a second execution raises KeyError because the columns are already gone.
test_submission = test_submission.drop(columns=["std", "chunk"], errors="ignore")

In [25]:
# Write the submission file without the index column.
test_submission.to_csv("submission_dart_V3.csv", index=False)