In [None]:
import warnings
warnings.filterwarnings("ignore")
import cudf
cudf.set_allocator("managed")
import pandas as pd
import matplotlib.pylab as plt
import cupy
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from xgboost import plot_importance
import numpy as np
from tqdm import tqdm
import pickle
import gc

### Feature Engineering

In [None]:
def get_not_used():
    """Return the bookkeeping columns that are excluded from feature handling.

    - ``row_id``: positional index added by ``preprocess`` to restore row order.
    - ``customer_ID`` / ``cid``: the customer key and its integer (factorized) code.
    - ``target``: the training label.
    - ``S_2``: the column ``preprocess`` converts to a datetime.

    A fresh list is returned on every call, so callers may mutate it freely.
    """
    bookkeeping = ('row_id', 'customer_ID', 'target', 'cid', 'S_2')
    return list(bookkeeping)
    
def preprocess(df):
    """Clean a statement-level frame and compute per-customer statistics.

    The frame passed in is modified in place: a ``row_id`` column is added,
    non-categorical feature columns are rounded to two decimals, difference
    ("after pay") columns are appended, ``S_2`` is parsed as a datetime and a
    ``cid`` column holding the factorized ``customer_ID`` is added.

    Returns:
        (df, dgs): ``df`` sorted back into its original row order with
        ``row_id`` dropped, and ``dgs``, the list of per-customer aggregate
        frames produced by ``add_stats_step``.
    """
    df['row_id'] = cupy.arange(df.shape[0])
    excluded = get_not_used()
    categorical = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
                   'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    non_numeric = set(excluded) | set(categorical)

    # Round every remaining (numeric) feature column to two decimals.
    for name in [c for c in df.columns if c not in non_numeric]:
        df[name] = df[name].round(2)

    # compute "after pay" features: <column> minus each of P_2 and P_3
    minuends = ([f'B_{i}' for i in (11, 14, 17)]
                + ['D_39', 'D_131']
                + [f'S_{i}' for i in (16, 23)])
    for minuend in minuends:
        if minuend not in df.columns:
            continue
        for payment in ('P_2', 'P_3'):
            df[f'{minuend}-{payment}'] = df[minuend] - df[payment]

    df['S_2'] = cudf.to_datetime(df['S_2'])
    codes, _ = df['customer_ID'].factorize()
    df['cid'] = codes

    # Everything that is neither categorical nor bookkeeping gets aggregated,
    # including the difference columns created above.
    numeric = [c for c in df.columns if c not in non_numeric]
    dgs = add_stats_step(df, numeric)

    # Restore the original row order by sorting on row_id, then discard it.
    # NOTE(review): the original comment blamed cudf merges for reordering rows,
    # but no merge happens in this function - confirm the sort is still needed.
    df = df.sort_values('row_id').drop(['row_id'], axis=1)
    return df, dgs

def add_stats_step(df, cols):
    """Aggregate ``cols`` per customer in batches of 50 columns.

    Each batch is handed to ``add_stats_one_shot``; the resulting frames are
    returned as a list in batch order (presumably batching keeps each groupby
    small enough for GPU memory - TODO confirm).
    """
    batch_size = 50
    return [
        add_stats_one_shot(df, cols[lo:lo + batch_size])
        for lo in range(0, len(cols), batch_size)
    ]

def add_stats_one_shot(df, cols):
    """Compute the per-customer mean and std of every column in ``cols``.

    Returns a frame with one row per ``customer_ID`` (as a regular column) and,
    for each input column ``c``, the columns ``c_mean`` and ``c_std`` in that
    order.
    """
    stat_names = ['mean', 'std']
    agg_spec = {name: stat_names for name in cols}
    per_customer = df.groupby('customer_ID').agg(agg_spec)
    # Flatten the (column, statistic) result columns into "<column>_<statistic>".
    per_customer.columns = [f'{name}_{stat}' for name in cols for stat in stat_names]
    return per_customer.reset_index()

def load_test_iter(chunks=4):
    """Yield the processed test set in roughly ``chunks`` pieces.

    Only ``customer_ID`` and ``S_2`` are read first so that ``get_segment`` can
    attach to every row the cumulative row count up to the end of its customer.
    Each chunk boundary is then moved to such a cumulative count, so a chunk
    always ends on the last row of a customer.

    NOTE(review): this presumably assumes the rows of ``Data/test.parquet`` are
    stored in the same order ``get_segment`` sorts them into - confirm.
    """
    test_rows = 11363762
    chunk_rows = test_rows // chunks

    test = get_segment(
        cudf.read_parquet('Data/test.parquet',
                          columns=['customer_ID', 'S_2'],
                          num_rows=test_rows)
    )
    total_rows = test.shape[0]
    start = 0
    while start < total_rows:
        probe = start + chunk_rows
        # Past the end of the data: fall back to the final cumulative count.
        boundary = probe if probe < total_rows else -1
        end = int(test['cus_count'].values[boundary])
        df = cudf.read_parquet('Data/test.parquet',
                               num_rows=end - start, skiprows=start)
        start = end
        yield process_data(df)
    

def load_data(data, type):
    """Run ``process_data`` on ``data`` and, for training data, attach labels.

    When ``type`` equals ``'train'`` the labels in ``Data/train_labels.csv`` are
    left-joined on ``customer_ID``; for any other value the processed frame is
    returned unchanged.
    """
    processed = process_data(data)
    if type != 'train':
        return processed
    labels = cudf.read_csv('Data/train_labels.csv')
    return processed.merge(labels, on='customer_ID', how='left')

def process_data(df):
    """Collapse a statement-level frame to one feature row per customer.

    ``preprocess`` cleans the frame and computes the aggregate frames; the last
    row of each ``customer_ID`` is kept, every aggregate frame is left-joined
    onto it, and any column whose name ends in ``_diff`` is dropped.
    """
    df, stat_frames = preprocess(df)
    per_customer = df.drop_duplicates('customer_ID', keep='last')
    for stats in stat_frames:
        per_customer = per_customer.merge(stats, on='customer_ID', how='left')
    unwanted = [name for name in per_customer.columns if name.endswith('_diff')]
    return per_customer.drop(unwanted, axis=1)

def get_segment(test):
    """Annotate each row with its customer code and a cumulative row count.

    Adds two columns and returns the frame sorted by ``['cid', 'S_2']``:

    - ``cid``: integer code of ``customer_ID`` (factorized over the grouped keys).
    - ``cus_count``: running total of non-null ``S_2`` values over customers in
      ``cid`` order, i.e. the number of rows up to and including that customer.

    Raises ``AssertionError`` if the final cumulative count does not equal the
    number of rows.
    """
    counts = test.groupby('customer_ID').agg({'S_2': 'count'})
    counts.columns = ['cus_count']
    counts = counts.reset_index()

    codes, _ = counts['customer_ID'].factorize()
    counts['cid'] = codes
    counts = counts.sort_values('cid')
    counts['cus_count'] = counts['cus_count'].cumsum()

    segmented = (test.merge(counts, on='customer_ID', how='left')
                     .sort_values(['cid', 'S_2']))
    assert segmented['cus_count'].values[-1] == segmented.shape[0]
    return segmented

### XGB Params and utility functions

In [None]:
def xgb_train(x, y, xt, yt, seed):
    """Train one XGBoost binary classifier with early stopping on a validation set.

    Args:
        x, y: training features and labels.
        xt, yt: validation features and labels (same number of columns as ``x``).
        seed: value passed to XGBoost as ``random_state``.

    Returns:
        (pred, bst): validation predictions made with the trees up to and
        including the best boosting round, and the trained ``Booster``.

    Raises:
        AssertionError: if ``x`` and ``xt`` have a different number of columns.
    """
    print("# of features:", x.shape[1])
    assert x.shape[1] == xt.shape[1]
    dtrain = xgb.DMatrix(data=x, label=y)
    dvalid = xgb.DMatrix(data=xt, label=yt)
    params = {
            'objective': 'binary:logistic',
            'tree_method': 'hist',
            'max_depth': 7,
            'subsample':0.88,
            'colsample_bytree': 0.5,
            'gamma':1.5,
            'min_child_weight':8,
            'lambda':70,
            'eta':0.03,
            'random_state':seed
    }
    watchlist = [(dvalid, 'eval')]
    # NOTE(review): `feval` is deprecated in newer XGBoost releases in favour of
    # `custom_metric`; kept here so the call still works on the version in use.
    bst = xgb.train(params, dtrain=dtrain,
                num_boost_round=9999,evals=watchlist,
                early_stopping_rounds=1000, feval=xgb_amex, maximize=True,
                verbose_eval=200)
    # `best_iteration` is the 0-based index of the best round, and
    # `iteration_range` is a half-open range of boosting rounds, so the number of
    # rounds to use is best_iteration + 1. The former `best_ntree_limit`
    # attribute is a tree count and no longer exists in XGBoost 2.0+.
    best_rounds = bst.best_iteration + 1
    print('best ntree_limit:', best_rounds)
    print('best score:', bst.best_score)
    pred = bst.predict(dvalid, iteration_range=(0, best_rounds))

    return pred, bst

#### Metrics

In [None]:
def xgb_amex(y_pred, y_true):
    """Custom XGBoost evaluation function returning ``('amex', score)``.

    ``y_true`` is the evaluation ``DMatrix``; its labels are extracted with
    ``get_label()`` and scored against ``y_pred`` with ``amex_metric_np``.
    """
    labels = y_true.get_label()
    score = amex_metric_np(y_pred, labels)
    return 'amex', score

# Created by https://www.kaggle.com/yunchonggan
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
def amex_metric_np(preds: np.ndarray, target: np.ndarray) -> float:
    """Fast NumPy version of the Amex metric: ``0.5 * (G + D)``.

    ``D`` is the share of positives found among the highest-ranked rows that
    make up at most 4% of the total weight, and ``G`` is the weighted Gini
    normalised by its closed-form maximum. Negatives (target 0) weigh 20,
    positives (target 1) weigh 1.

    Args:
        preds: predicted scores, higher meaning more likely positive.
        target: 0/1 labels aligned with ``preds``.
    """
    # Rank rows from highest to lowest prediction.
    order = np.argsort(preds)[::-1]
    ranked_target = target[order]

    weight = 20.0 - ranked_target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    # D: positives captured within the top 4% of cumulative weight.
    n_pos = np.sum(ranked_target)
    in_top_four_pct = cum_norm_weight <= 0.04
    d = np.sum(ranked_target[in_top_four_pct]) / n_pos

    # G: weighted Gini of this ranking ...
    weighted_target = ranked_target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # ... divided by the Gini of a perfect ranking (closed form).
    n_neg = ranked_target.shape[0] - n_pos
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)

    return 0.5 * (gini / gini_max + d)

# The official metric is kept as the reference: the faster NumPy version above can differ
# slightly from it (for example, it does not truncate the 4% weight cutoff to an integer).
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Pandas implementation of the Amex metric: ``0.5 * (G + D)``.

    Args:
        y_true: frame with a ``target`` column of 0/1 labels.
        y_pred: frame with a ``prediction`` column, index-aligned with ``y_true``.

    Returns:
        The mean of the normalised weighted Gini ``G`` and the top-4% capture
        rate ``D``.
    """

    def _ranked_with_weights(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> pd.DataFrame:
        # Join labels and predictions, rank by prediction (highest first) and
        # weight negatives 20x relative to positives.
        ranked = (pd.concat([y_true, y_pred], axis='columns')
                  .sort_values('prediction', ascending=False))
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked_with_weights(y_true, y_pred)
        four_pct_cutoff = int(0.04 * ranked['weight'].sum())
        within_cutoff = ranked['weight'].cumsum() <= four_pct_cutoff
        captured = (ranked.loc[within_cutoff, 'target'] == 1).sum()
        return captured / (ranked['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked_with_weights(y_true, y_pred)
        weights = ranked['weight']
        random = (weights / weights.sum()).cumsum()
        weighted_pos = ranked['target'] * weights
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weights).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # The best achievable Gini is obtained by ranking on the labels themselves.
        ideal_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, ideal_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

### Load data and add feature

In [None]:
# Load the training frame from parquet onto the GPU and show its shape.
# NOTE(review): the file name suggests features were already aggregated per
# customer by an earlier step - confirm which step produces it.
train = cudf.read_parquet('Data/train_data_aggV3.parquet')
train.shape

### Train XGB in K-folds

In [None]:
%%time

# Columns that must not be fed to the model, limited to those actually present.
not_used = get_not_used()
not_used = [i for i in not_used if i in train.columns]
msgs = {}
folds = 10
seeds = 4
score = 0
# Trained boosters keyed by "<fold>-<seed>"; later cells pickle and reuse this dict.
models = {}
# (fold, seed, amex score) tuples, one per trained model.
df_scores = []

for i in range(folds):
    # Fold assignment: customers whose integer code is congruent to i modulo
    # `folds` form the validation split, everyone else the training split.
    mask = train['cid']%folds == i
    tr,va = train[~mask], train[mask]
    
    x, y = tr.drop(not_used, axis=1), tr['target']
    xt, yt = va.drop(not_used, axis=1), va['target']
    # Train several models per fold that differ only in their random seed.
    for seed in range(seeds):
        key = str(i)+'-'+str(seed)
        yp, bst = xgb_train(x, y, xt, yt, seed)
        models[key] = bst
        # Score with the official pandas metric; `.values.get()` presumably copies
        # the GPU-resident labels to a host NumPy array - TODO confirm.
        amex_score = amex_metric(pd.DataFrame({'target':yt.values.get()}), 
                                        pd.DataFrame({'prediction':yp}))
        df_scores.append((i, seed, amex_score))
        msg = f"Fold {i}-Seed {seed}:  amex {amex_score:.4f}"

        print(msg)

In [None]:
# Persist all trained boosters; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("Models/xgboost_b1.pkl", "wb") as model_file:
    pickle.dump(models, model_file)

In [None]:
# Plot the 50 most important features of one trained model (fold 1, seed 1).
fig, ax = plt.subplots(1,1,figsize=(14, 20))
plot_importance(models['1-1'],max_num_features=50, ax = ax)
# Render the figure; the previous bare `plt.plot()` drew nothing and only
# echoed an empty list as the cell output.
plt.show()

In [None]:
# Drop the references to the training frame and the last fold's splits so
# their memory can be reclaimed before prediction.
del train, tr, va

### Prediction

In [None]:
# Reload the boosters saved by the training section; the context manager
# closes the file handle once loading is done.
# NOTE: unpickling executes arbitrary code - only load model files you created
# yourself or otherwise trust.
with open("Models/xgboost_b1.pkl", "rb") as model_file:
    models = pickle.load(model_file)

In [None]:
# Reload the training frame (it was deleted after the K-fold training section).
train = cudf.read_parquet('Data/train_data_aggV3.parquet')
# Labels re-ordered to follow the customer_ID order of `train`.
# NOTE(review): the whole frame is copied to pandas just to obtain its index.
train_labels = pd.read_pickle('Data/train_labels.pkl').loc[train.set_index('customer_ID').to_pandas().index]
# Drop the bookkeeping columns; NOTE(review): `axis=1` looks redundant next to `columns=`.
train_features = train.drop(columns=['customer_ID', 'target', 'cid', 'S_2'], axis=1)
# `train_features` is rebound here from the feature frame to an XGBoost DMatrix.
train_features = xgb.DMatrix(data=train_features)

In [None]:
# Predict the full training set with every trained model (one column per model).
prediction_list = []
for key, model in models.items():
    print(key, end=',')
    # `iteration_range` is half-open and `best_iteration` is 0-based, so the
    # upper bound must be best_iteration + 1 to include the best round - the
    # same cut-off used for the validation predictions during training.
    prediction_list.append(
        model.predict(train_features, iteration_range=(0, model.best_iteration + 1)))

train_prediction_df = pd.DataFrame(prediction_list).T
train_prediction_df.columns = models.keys()
train_prediction_df.index = train.set_index('customer_ID').to_pandas().index
train_prediction_df.to_pickle('Output/p_train_xgb_b1.pkl')

In [None]:
# Load the test frame from parquet onto the GPU.
test = cudf.read_parquet('Data/test_data_aggV3.parquet')

In [None]:
# Drop the bookkeeping columns (the test set has no `target`) ...
test_features = test.drop(columns=['customer_ID', 'cid', 'S_2'], axis=1)
# ... and rebind `test_features` to an XGBoost DMatrix built from the remaining columns.
test_features = xgb.DMatrix(data=test_features)

In [None]:
# Predict the test set with every trained model (one column per model).
prediction_list = []
for key, model in models.items():
    print(key, end=',')
    # Use `iteration_range` (the deprecated `ntree_limit` argument is gone in
    # XGBoost 2.0+). The range is half-open and `best_iteration` is 0-based, so
    # the upper bound is best_iteration + 1 to include the best round.
    prediction_list.append(
        model.predict(test_features, iteration_range=(0, model.best_iteration + 1)))

test_prediction_df = pd.DataFrame(prediction_list).T
test_prediction_df.columns = models.keys()
test_prediction_df.index = test.set_index('customer_ID').to_pandas().index

In [None]:
# Persist the per-model test predictions (rows: customers, columns: "<fold>-<seed>" keys).
test_prediction_df.to_pickle('Output/p_test_xgb_b1.pkl')

In [None]:
# Display the predictions of one hand-picked model per fold.
# NOTE(review): presumably the best-scoring seed of each fold - confirm against df_scores.
test_prediction_df[['0-3','1-1','2-2', '3-1', '4-2', '5-0', '6-1', '7-3', '8-1', '9-1']]

In [None]:
# Average the selected per-fold models row-wise and write the blend as a
# single-column CSV headed 'prediction'.
(
    test_prediction_df[['0-3','1-1','2-2', '3-1', '4-2', '5-0', '6-1', '7-3', '8-1', '9-1']]
    .mean(axis=1)
    .to_csv('Output/p_xgb_b1_best.csv', header=['prediction'])
)