In [None]:
import numpy as np
import pandas as pd
import random as rnd
import json
import copy

import xgboost as xgb
SEED = 358

#list every file that Kaggle mounted under the input directory
import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
##define scoring function
#taken from https://www.kaggle.com/code/inversion/amex-competition-metric-python
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Compute the AMEX competition score: the mean of two rank-based terms.

    Parameters
    ----------
    y_true : DataFrame with a 'target' column (0/1 labels).
    y_pred : DataFrame with a 'prediction' column, sharing y_true's index.

    Returns
    -------
    0.5 * (G + D), where G is the normalized weighted Gini coefficient and
    D is the share of positives captured in the top 4% (by weight) of rows
    ranked by prediction. Rows with target 0 carry weight 20, all others 1.
    """

    def _ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        #join labels and scores, rank by descending score, attach row weights
        ranked = pd.concat([truth, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked(y_true, y_pred)
        four_pct_cutoff = int(0.04 * ranked['weight'].sum())
        within_cutoff = ranked['weight'].cumsum() <= four_pct_cutoff
        is_positive = ranked['target'] == 1
        #positives found inside the cutoff, relative to all positives
        return (is_positive & within_cutoff).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked(y_true, y_pred)
        weight = ranked['weight']
        random_share = (weight / weight.sum()).cumsum()
        weighted_pos = ranked['target'] * weight
        total_pos = weighted_pos.sum()
        lorentz = weighted_pos.cumsum() / total_pos
        return ((lorentz - random_share) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        #the best attainable Gini comes from ranking by the labels themselves
        perfect_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect_pred)

    gini_term = normalized_weighted_gini(y_true, y_pred)
    capture_term = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_term + capture_term)

## Getting train data 

In [None]:
##function for preparing the data
def data_prep(path_data = '', path_target = ''):
    """Load statement data, attach the target file's columns, keep each customer's latest statement.

    Parameters
    ----------
    path_data : path to a parquet file with 'customer_ID' and 'S_2' (statement date) columns.
    path_target : path to a csv file with a 'customer_ID' column; its remaining
        columns are merged onto the data (inner join on customer).

    Returns
    -------
    (data, cust_id_map)
        data : DataFrame indexed by integer customer ID, holding only the rows
            whose S_2 equals that customer's maximum S_2.
        cust_id_map : dict {original customer_ID: integer ID}, numbered in
            order of first appearance in the data file.
    """
    #load data
    data = pd.read_parquet(path_data).set_index('customer_ID')

    #create a mapping dictionary {original ID: new integer ID}
    cust_id_map = {cust_id: new_id for new_id, cust_id in enumerate(data.index.unique())}

    #setting integer ID as index
    data.index = data.index.map(cust_id_map)

    #converting to datetime
    data['S_2'] = pd.to_datetime(data['S_2'])
    print('Shape of dataset', data.shape)

    #merge with target
    target = pd.read_csv(path_target).set_index('customer_ID')
    target.index = target.index.map(cust_id_map) #again, map index to integers
    data = data.merge(target, left_index=True, right_index=True) #merge the target column to the data

    #Keeping only the latest observation for each customer (row with max date)
    #group on the index level itself so this does not depend on the index name,
    #and use the 'max' alias instead of passing the Python builtin
    is_latest = data.groupby(level=0)['S_2'].transform('max') == data['S_2']
    data = data[is_latest]
    print('Shape of dataset after aggregating dates', data.shape)

    return (data, cust_id_map)
    

In [None]:
##define paths
#the two Kaggle input datasets used by this notebook
_PARQUET_DIR = '/kaggle/input/amex-data-integer-dtypes-parquet-format'
_COMPETITION_DIR = '/kaggle/input/amex-default-prediction'

#training features and their labels
train_path = f'{_PARQUET_DIR}/train.parquet'
labels_path = f'{_COMPETITION_DIR}/train_labels.csv'

#features to predict on and the submission template
test_path = f'{_PARQUET_DIR}/test.parquet'
sample_sub_path = f'{_COMPETITION_DIR}/sample_submission.csv'

In [None]:
##load and prepare data
#data_prep returns (data, id_map); only the prepared frame is needed for training
train = data_prep(train_path, labels_path)[0]

## XGBoost model

In [None]:
##get train DMatrix
#separate X and y: features are every column except the first and the last,
#the label is the last column (the one merged in by data_prep)
X, y = train.iloc[:, 1:-1], train.iloc[:, -1]

# CLEAN RAM
#drops only the name `train`; X and y stay available
del train

In [None]:
##split data on training and testing
#following this tutorial (from: https://www.datacamp.com/tutorial/xgboost-in-python)
from sklearn.model_selection import train_test_split

#hold out 15% of the rows for scoring; random_state=SEED keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=SEED)


In [None]:
##create DMatrices
#only the training DMatrix is built here (from the training part of the split);
#the test DMatrix is created later, in the cell that scores the models
train_dmatrix = xgb.DMatrix(data = X_train, label = y_train)

#training uses train_dmatrix from here on, so the pandas objects are released
del X_train, y_train

In [None]:
%%time
##fit model
#hand-picked baseline configuration
parameters = {
    #tree shape and learning rate
    'max_depth': 6,              #integer
    'eta': 0.7,                  #[0,1]
    'min_child_weight': 20,      #integer
    #row and column subsampling
    'subsample': 0.9,            #[0,1]
    'colsample_bytree': 0.6,     #[0,1]
    'colsample_bylevel': 0.6,    #[0,1]
    'colsample_bynode': 0.6,     #[0,1]
    #regularization
    'lambda': 5,                 #integer
    'alpha': 5,                  #integer
    'objective': 'binary:logistic',
}
num_round = 10
model = xgb.train(params=parameters, dtrain=train_dmatrix, num_boost_round=num_round)


### Tune the model
To do next: 
- explore the possibility of xgb.cv cross validation
- most important: number of boosting rounds, number of trees
- select 10-15 randomly specified parameter settings, for each find optimal n-rounds (or n_trees)
- important parameters: tree size (leaves, branches) and learning rate (`eta`, `max_depth`, `min_child_weight`); sampling (`subsample` and the column-subsampling family `colsample_bytree`, `colsample_bylevel`, `colsample_bynode`); regularization (`alpha` and `lambda`)

### Random sets of parameters

In [None]:
##create list with dictionaries of parameters
#each parameter is generated randomly within prespecified interval, the interval size was tested by hand
def _draw_params(seed):
    """Return one randomly drawn XGBoost parameter set, reproducible for `seed`."""
    #np.arange yields numpy integers; random.seed() only accepts built-in
    #types on Python >= 3.11 (TypeError otherwise), so convert to a plain int
    rnd.seed(int(seed))
    #the draws happen in key order, so the key order below fixes the sampled values
    return {'max_depth':rnd.randint(2,10), #integer
            'eta':rnd.uniform(0,1), #[0,1]
            'min_child_weight':rnd.randint(1,25), #integer
            'subsample':rnd.uniform(0.5,1), #[0,1]
            'colsample_bytree':rnd.uniform(0.5,1), #[0,1]
            'colsample_bylevel':rnd.uniform(0.5,1), #[0,1]
            'colsample_bynode':rnd.uniform(0.5,1), #[0,1]
            'lambda': rnd.randint(1,5), #integer
            'alpha': rnd.randint(1,5), #integer
            'objective':'binary:logistic'}

#one parameter set per seed: 0, 10, ..., 140
seed_list = np.arange(0, 150, 10, dtype=int)
par_list = [_draw_params(seed) for seed in seed_list]

In [None]:
#inspect the first sampled parameter set as a sanity check
par_list[0]

### Cross Validation

In [None]:
##Building custom metric function, based on amex_metric()
def amex_metric_cv(predt: np.ndarray, dtrain: xgb.DMatrix):
    """Adapter exposing amex_metric() through XGBoost's custom-metric signature.

    predt : array of predictions for the rows in `dtrain`.
    dtrain : DMatrix whose labels are the true values.

    Returns the pair ('AMEX_score', score) with the score as a plain float.
    """
    #amex_metric() accepts two dataframes with columns named 'target' and 'prediction',
    #so they are built here from the input of the custom metric function
    df_y_test = pd.DataFrame(dtrain.get_label()).rename(columns={0: 'target'})
    df_test_preds = pd.DataFrame(predt).rename(columns={0: 'prediction'})

    return 'AMEX_score', float(amex_metric(df_y_test, df_test_preds))
    

#### Cross-Val and Random parameters -- automated

In [None]:
%%time
##find optimal number of boosting rounds for each set of random parameters, by cross validation
#settings shared by every cross-validation run
cv_settings = dict(num_boost_round=500,
                   nfold=3,
                   seed=SEED,
                   custom_metric=amex_metric_cv,
                   verbose_eval=True,
                   maximize=True,
                   early_stopping_rounds=50)

optimal_nrounds = []
for parm in par_list:
    cv_df = xgb.cv(parm, train_dmatrix, **cv_settings)
    #number of rows in the returned history is taken as the number of rounds to train
    optimal_nrounds.append((len(cv_df), parm))
    

In [None]:
%%time
##estimate model for each set of parameters and their optimal n_round
models_ls = []

for n_rounds, params in optimal_nrounds:
    model = xgb.train(params=params, dtrain=train_dmatrix, num_boost_round=n_rounds)
    models_ls.append(model)

In [None]:
%%time
##get the scores of models
test_dmatrix  = xgb.DMatrix(data = X_test)

#the true labels are the same for every model, so wrap them once
df_y_test = pd.DataFrame(y_test)

scores_ls = []
for model in models_ls:
    #predict with testing sample
    test_preds = model.predict(test_dmatrix)

    #shape the predictions the way the scoring function expects:
    #a 'prediction' column sharing the labels' index
    df_test_preds = pd.DataFrame({'prediction': test_preds}, index=df_y_test.index)

    #compute score (we want to maximize it)
    score = amex_metric(df_y_test, df_test_preds)
    scores_ls.append(score)


In [None]:
##finding the best model
#first model that reaches the highest test score
best_score = max(scores_ls)
selected_model = models_ls[scores_ls.index(best_score)]

### Tweak eta of the selected model

In [None]:
%%time
##recover the configuration selected by cross-validation
#models_ls, scores_ls and optimal_nrounds are index-aligned, so the winning
#position gives back the exact parameter dict and number of rounds.
#The parameters are NOT read from save_config(): the tree 'train_param'
#section holds no 'objective', so models retrained from it would silently
#use XGBoost's default objective instead of binary:logistic, and the path to
#that section depends on the tree method / XGBoost version.
best_idx = scores_ls.index(max(scores_ls))
chosen_n_rounds, chosen_param = optimal_nrounds[best_idx]

#full saved configuration of the selected model, kept for inspection only
selected_config = json.loads(selected_model.save_config())

##loop through etas to find the best one
etas_list = np.arange(0.01, 1, 0.02) #list of etas to loop through
opt_models_etas = [] #empty lists for models and their scores
etas_scores = []

#the true labels do not change between iterations
df_y_test = pd.DataFrame(y_test)

for eta in etas_list:
    #substitute iteration-specific eta to parameters (new dict, chosen_param stays untouched)
    iter_parms = dict(chosen_param, eta=float(eta))

    #train model
    model = xgb.train(iter_parms, train_dmatrix, chosen_n_rounds)
    opt_models_etas.append(model)

    #make predictions with model with iteration-specific eta and save score
    test_preds = model.predict(test_dmatrix) #predict with testing sample

    #tweak saved predictions to be compatible with scoring function
    df_test_preds = pd.DataFrame({'prediction': test_preds}, index=df_y_test.index)

    #compute score (we want to maximize it)
    score = amex_metric(df_y_test, df_test_preds)
    etas_scores.append(score)
    print(f'Eta: {eta}, Score: {score}')

In [None]:
#save the selected_model with new eta
#first model of the eta sweep that reaches the highest test score
best_eta_score = max(etas_scores)
selected_model = opt_models_etas[etas_scores.index(best_eta_score)]

## Predict the result

In [None]:
##get test DMatrix
#data_prep returns the prepared frame together with the {customer_ID: integer} map
test, id_map = data_prep(path_data = test_path, path_target = sample_sub_path)

#get only explanatory variables (every column except the first and the last)
X_pred = test.iloc[:, 1:-1]

#create DMatrix
pred_dmatrix = xgb.DMatrix(data = X_pred)

# CLEAN RAM
del test

In [None]:
##make predictions
#score the prepared prediction rows with the model chosen by the eta sweep
preds = selected_model.predict(pred_dmatrix)

In [None]:
#wrap the raw prediction array in a DataFrame for a quick look
df_preds = pd.DataFrame(preds)
df_preds.head()
#the predictions are matched with customer IDs in the following cell

In [None]:
##switching the key and value in id_map: {integer ID: original customer_ID}
id_map_rev = {new_id: cust_id for cust_id, new_id in id_map.items()}

##creating dataframe to be saved in .csv
#preds[i] belongs to row i of X_pred, so the customer IDs are taken from
#X_pred's own index instead of assuming the integer IDs are exactly 0..n-1
#in row order (which breaks if any customer is missing or duplicated)
assert len(preds) == len(X_pred), 'one prediction per scored row expected'
predictions = pd.DataFrame({'customer_ID': X_pred.index.map(id_map_rev),
                            'prediction': preds})

In [None]:
#preview the submission frame before writing it out
predictions.head()

In [None]:
##save csv
#index=False: write only the customer_ID and prediction columns
predictions.to_csv('submission.csv',index=False)

---
Old code — kept for reference only; every cell below is disabled with `%%script false --no-raise-error`.

### Get the score

In [None]:
%%script false --no-raise-error
##predict with testing sample
#(disabled) single-model version of the scoring loop used earlier in the notebook
test_dmatrix  = xgb.DMatrix(data = X_test)
#here I could del X_test if needed

test_preds = model.predict(test_dmatrix)

In [None]:
%%script false --no-raise-error
##tweak saved predictions to be compatible with scoring function
#(disabled) amex_metric needs a 'target' frame and a 'prediction' frame sharing one index
df_y_test = pd.DataFrame(y_test)
df_test_preds = pd.DataFrame(test_preds)
df_test_preds.index = df_y_test.index
df_test_preds = df_test_preds.rename(columns={df_test_preds.columns[0]: 'prediction'})

##compute score (we want to maximize it)
amex_metric(df_y_test, df_test_preds)

In [None]:
%%script false --no-raise-error
##get test DMatrix
#(disabled) NOTE: data_prep returns a (data, cust_id_map) tuple, so `test` is a tuple
#here and the `test.iloc` call would fail if this cell were re-enabled as written
test = data_prep(path_data = test_path, path_target = sample_sub_path)

#get only explanatory variables
X_pred = test.iloc[:,1:-1]

#create DMatrix
pred_dmatrix = xgb.DMatrix(data = X_pred)

# CLEAN RAM
del test

In [None]:
%%script false --no-raise-error
##make predictions
#(disabled) predicts with whatever `model` currently refers to, not with selected_model
preds = model.predict(pred_dmatrix)

In [None]:
%%script false --no-raise-error
#(disabled) only the last expression of a cell is displayed, so this bare `preds` shows nothing
preds

#quick inspection of predictions
df_preds = pd.DataFrame(preds)
df_preds.describe()


In [None]:
%%script false --no-raise-error
#(disabled) inline precursor of the loading / ID-mapping steps of data_prep()
#load data
train = pd.read_parquet('/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet').set_index('customer_ID')

#create a mapping dictionary {cust_id: integer}
cust_id_map = {}
for (i, j) in list(enumerate(train.index.unique())):
    cust_id_map.update({j: i}) # getting dictionary {original ID: new ID}
    
#setting integer ID as index
train.index = train.index.map(cust_id_map)    

#converting to datetime
train.S_2 = pd.to_datetime(train.S_2)
print(train.shape)

In [None]:
%%script false --no-raise-error
#(disabled) preview of the loaded training frame
train.head()

In [None]:
%%script false --no-raise-error
#(disabled) inline precursor of the merge step of data_prep()
#merge with target
labels = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv').set_index('customer_ID')
labels.index = labels.index.map(cust_id_map) #again, map index to integers
train = train.merge(labels, left_index=True, right_index=True) #merge the target column to train df

In [None]:
%%script false --no-raise-error
##taken from documentation snippet: https://xgboost.readthedocs.io/en/stable/get_started.html
#(disabled) minimal smoke-test model: depth-2 trees, 2 boosting rounds
# specify parameters via map
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
num_round = 2
model = xgb.train(param, train_dmatrix, num_round)
# make prediction
#preds = bst.predict(dtest)

In [None]:
%%script false --no-raise-error
#(disabled) inline precursor of the "latest statement" filter of data_prep()
#Keeping only the latest observation for each customer (row with max date)
idx = train.groupby(['customer_ID'])['S_2'].transform(max) == train['S_2']
train = train[idx]
print(train.shape)
train.head()

In [None]:
%%script false --no-raise-error
%%time
#(disabled) single cross-validation trial on one parameter set (par_list[2]) with a
#small round budget; the automated loop over all parameter sets replaced it
cv_result = xgb.cv(par_list[2], 
                   train_dmatrix, 
                   num_boost_round=10, 
                   nfold=3, 
                   seed = SEED, 
                   custom_metric=amex_metric_cv, 
                   verbose_eval=True,
                   maximize = True,
                   early_stopping_rounds = 5)

print('Best number of trees = {}'.format(cv_result.shape[0]))
#cv_result.shape[0]