In [1]:
import numpy as np
#import scipy
import pandas as pd
import gc
import os, psutil
import pickle

from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report

import optuna
from optuna import Trial
from optuna.samplers import TPESampler

from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

import joblib

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
pd.set_option('mode.chained_assignment', None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_dir =  'kaggle/input/amex-default-prediction/'
filename = base_dir + 'train_data.csv'
filename_label = base_dir + 'train_labels.csv'
filename_test = base_dir + 'test_data.csv'

model_save_path = 'model\\'

random_state = 42

## Functions for amex comp 

In [3]:
def memory_usage(metric='MB', places=1,print_out=True):
    metric_mapping = {'B':0, 'KB': 2, 'MB': 2, 'GB': 3}
    multiplier= metric_mapping[metric]
    
    mem_used = psutil.Process(os.getpid()).memory_info().rss / 1024 ** multiplier
    mem_used_total = psutil.virtual_memory()[3] /1024**multiplier
    if print_out:
        print(f'Memory used Process: {mem_used:.{places}F}{metric} Memory used Total: {mem_used_total:.{places}F}{metric}')

    return mem_used
memory_usage("GB",1)

Memory used Process: 0.2GB Memory used Total: 18.3GB


0.18233108520507812

In [4]:
def amex_metric_numpy(y_true: np.array, y_pred: np.array) -> float:
    #from here https://www.kaggle.com/code/rohanrao/amex-competition-metric-implementations
    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descring prediction values
    indices = np.argsort(y_pred)[::-1]
    preds, target = y_pred[indices], y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

def eval_model_classification_report(x_test, y_test, models):
    actual = np.array(y_test).squeeze()
    
    if isinstance(models, list):
        preds = np.zeros(len(x_test))
        for model in models: 
            preds += model.predict_proba(x_test)[:,1]
        #weight each one equally
        preds /= len(models)
        y_preds = (preds >0.5).astype(np.int8)
        
        
    else:
        preds = models.predict_proba(x_test)[:,1]
        y_pred = models.predict(x_test)
    
    amex_score = amex_metric_numpy(actual,preds)

    
    print(f'Validation Results - Amex metric: {amex_score:.3f} \n\nClassification Report\n')
    print(classification_report(y_test, y_pred, target_names=['No Default','Default']))
    print('\n\nConfusion Matrix\n')
#     print(confusion_matrix(y_test, y_pred))
    cmtx = pd.DataFrame(
        confusion_matrix(y_test, y_pred), 
        index=['true:yes', 'true:no'], 
        columns=['pred:yes', 'pred:no']
    )
    print(cmtx)

    

## Helper class from prep notebook for scoring

In [24]:
#custom aggregate functions
def lm_diff(series):
    if len(series)>1:
        return series.iloc[-1] - series.iloc[-2]
    else:
        return 0

def squared_mean(series):
    return (series**2).mean()

def missing_values(series):
    return series.isna().sum()

def missing_last_value(series):
    return series.isna().sum()

def missing_last_value(series):
    return series.isna().iloc[-1].astype(int)

class amex_helper():
      
    def __init__(self, chunksize= 500000):
        self.previous_chunk_data= None
        self.chunksize = chunksize
        self.key_columns = ['customer_ID','S_2']
        self.categorical_columns = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
        self.non_numeric_columns = self.key_columns + self.categorical_columns
        
        self.numeric_dtype = np.float32
        
        #Values gotten from for converting to inshttps://www.kaggle.com/code/raddar/amex-data-int-types-train/notebook
        self.int_scaling = [
            ('B_4',0,78), ('B_16',0,12), ('B_19',0,1), ('B_20',0,17), ('B_22',0,2), ('B_31',0,1), ('B_32',0,1), ('B_33',0,1), ('B_41',0,1), 
            ('D_39',0,34), ('D_44',0,8), ('D_49',0,71), ('D_51',0,3), ('D_59',48/5,48), ('D_65',0,38), ('D_70',0,4), 
            ('D_72',0,3), ('D_74',0,14), ('D_75',0,15), ('D_78',0,2), ('D_79',0,2), ('D_80',0,5), ('D_81',0,1), ('D_82',0,2), 
            ('D_83',0,1), ('D_84',0,2), ('D_86',0,1), ('D_87',0,1), ('D_89',0,9), ('D_91',0,2), ('D_92',0,1), ('D_93',0,1), 
            ('D_94',0,1), ('D_96',0,1), ('D_103',0,1), ('D_106',0,23), ('D_107',0,3), ('D_108',0,1), ('D_109',0,1),('D_111',0,2),
            ('D_113',0,5), ('D_122',0,7), ('D_123',0,1), ('D_124',22,22), ('D_125',0,1), ('D_127',0,1), ('D_129',0,1), ('D_135',0,1),
            ('D_136',0,4), ('D_137',0,1), ('D_138',0,2), ('D_139',0,1), ('D_140',0,1), ('D_143',0,1), ('D_145',0,11), ('R_2',0,1), 
            ('R_3',0,10), ('R_4',0,1), ('R_5',0,2),('R_8',0,1), ('R_9',0,6), ('R_10',0,1),('R_11',0,2), ('R_13',0,31), 
            ('R_15',0,1), ('R_16',0,2), ('R_17',0,35), ('R_18',0,31),('R_19',0,1), ('R_20',0,1), ('R_21',0,1),('R_22',0,1), 
            ('R_23',0,1), ('R_24',0,1), ('R_25',0,1), ('R_26',0,28), ('R_28',0,1), ('S_6',0,1), ('S_11',5,25), ('S_15',10/3,10),
            ('S_18',0,1), ('S_20',0,1),
            ('S_8',0,100), ('S_13',0,100) # these have overlap and should be treated differently
              ]
    
    def read_data(self, filename, update_columns=True, first_chunk_only=False, raw=False):

        print(f'Reading Data ')
        if update_columns:
            #read all columns names and update lists
            self.all_columns = pd.read_csv(filename, index_col=False, nrows=0).columns.tolist()
            self.numeric_columns = [x for x in self.all_columns if x not in self.non_numeric_columns]
            #Types of features
            # self.delinquency_features = [col for col in self.all_columns if col.startswith('D_')]
            # self.spend_features = [col for col in self.all_columns if col.startswith('S_')]
            # self.payment_features = [col for col in self.all_columns if col.startswith('P_')]
            # self.balance_features = [col for col in self.all_columns if col.startswith('B_')]
            # self.risk_features = [col for col in self.all_columns if col.startswith('R_')]
            
            print(f'Calculating data sets aggregate statistics')
            aggregate_final = None
            final_column_na = None
            final_count_values = []
            drop_cuttoff = 0.35
            
            with pd.read_csv(filename, chunksize=400000) as reader: #self.chunksize
                for i, chunk in enumerate(reader):
                    
                    #calculate chunk aggregates
                    aggregate = chunk[self.numeric_columns].agg(['mean', 'count', 'min', 'max',squared_mean]).transpose()
                    #coutn number of na in chunk
                    chunk_column_na = chunk.agg(['count',missing_values]).transpose() 
                    #get count of each value in categorical columns
                    chunk_count_values = dict()
                    for column in self.categorical_columns:
                        chunk_count_values[column] = chunk[column].value_counts().to_dict()
                    final_count_values.append(pd.DataFrame(chunk_count_values))
                    
                    del chunk
                    gc.collect()
                    
                    #update aggregates
                    if aggregate_final is not None:
                        aggregate_final['mean'] = (aggregate_final['mean']*aggregate_final['count']  + aggregate['mean']*aggregate['count'] ) / ( aggregate_final['count'] + aggregate['count'] )
                        aggregate_final['squared_mean'] = (aggregate_final['squared_mean']*aggregate_final['count']  + aggregate['squared_mean']*aggregate['count'] ) / ( aggregate_final['count'] + aggregate['count'] )
                        aggregate_final['count'] = aggregate_final['count'] + aggregate['count']
                        aggregate_final['min'] = pd.concat([aggregate_final['min'],  aggregate['min']], axis=1).min(axis=1)
                        aggregate_final['max'] = pd.concat([aggregate_final['max'],  aggregate['max']], axis=1).max(axis=1)
                    else:
                        aggregate_final = aggregate
                        
                    #update NA values
                    if final_column_na is not None:
                        final_column_na['count'] = final_column_na['count'] + chunk_column_na['count']
                        final_column_na['missing_values'] = final_column_na['missing_values'] + chunk_column_na['missing_values']
                        break
                    else:
                        final_column_na = chunk_column_na
                        
            #Na Values final 
            final_column_na['percent_missing'] = final_column_na['missing_values'] / (final_column_na['missing_values'] +final_column_na['count'])
            self.columns_drop = list(final_column_na[final_column_na['percent_missing']> drop_cuttoff].reset_index()['index'])
            #Add redundant features to drop list 
            self.columns_drop = list(set(self.columns_drop + ['D_103','D_139'])) #tied to D_107 and D_145 respectively
            
            #update meta data list for columns to remove columns that will be dropped
            self.categorical_columns = list(set(self.categorical_columns) - set(self.columns_drop ) )
            self.non_numeric_columns = list(set(self.non_numeric_columns) - set(self.columns_drop))
            self.all_columns = list(set(self.all_columns) - set(self.columns_drop))
            self.numeric_columns = list(set(self.numeric_columns ) - set(self.columns_drop))
            self.int_scaling = [x for x in self.int_scaling if x[0] not in self.columns_drop]
            
            #aggregates values final
            aggregate_final['var'] = aggregate_final['squared_mean'] - aggregate_final['mean']**2      
            aggregate_final['std'] = aggregate_final['var']**(1/2)
            aggregate_final = aggregate_final[~aggregate_final.index.isin(self.columns_drop)] #remove columns that will be dropped
            self.aggregate_stats  = aggregate_final
                       
            #merge all value counts
            final_count_values = pd.concat(final_count_values).groupby(level=0).sum()
            #remove columns that will be dropped
            final_count_values = final_count_values[~final_count_values.index.isin(self.columns_drop)]
            self.categorical_mode = final_count_values.idxmax().to_dict()
            #get dict of values to impute categorical values
            categorical_values = dict()
            for column in final_count_values.columns:
                mapping_dict = final_count_values[final_count_values[column] != 0].reset_index()['index'].to_dict()
                categorical_values[column] = { mapping_dict[x]:x for x in mapping_dict}
            self.categorical_encode = categorical_values
        print(f'Reading {filename} in chunks')
        if raw:
            return pd.read_csv(filename)
        else:
            output = []
            with pd.read_csv(filename, chunksize=self.chunksize) as reader:
                for i, chunk in enumerate(reader):
                    print(f'Reading chunk: {i+1}')
                    output.append(self.process_chunk(chunk))
                    memory_usage("GB",1)
                    
                    if first_chunk_only:
                        break
                    gc.collect()
                    
            print(f'finished reading all chunks')
            gc.collect()
            self.previous_chunk_data = None
            print(f'combining all chunks')
            data = pd.concat(output, copy=False).sort_index().reset_index(drop=True)
            print(f'minimising data types')
            self.print_memory_usage(data, label='before')
            self.update_dtypes = self.compress(data)
            data = data.astype(self.update_dtypes)
            self.print_memory_usage(data, label='compressed')
            
            return data



    def process_chunk(self, chunk):

        ##Pre Processing of dataframe chunk to make sure customer ID is all processed at same time##

        #Test to see if it is not last chunk and take last id from the DF incase it is over 2 chunks
        if len(chunk) >= self.chunksize:
            last_id_in_chunk = chunk['customer_ID'].iloc[-1]
            last_id_in_chunk_data = chunk[chunk['customer_ID']==last_id_in_chunk].copy()
            chunk = chunk.loc[chunk['customer_ID']!=last_id_in_chunk]
        else: 
            last_id_in_chunk_data = None

        #Check if any previous chunk data
        if self.previous_chunk_data is not None: 
            chunk = pd.concat([self.previous_chunk_data,chunk])
        

        self.previous_chunk_data = last_id_in_chunk_data
        
        #calculate NA aggregates
        x_na_aggregate = chunk.groupby("customer_ID")[self.numeric_columns].agg([missing_values, missing_last_value])
        x_na_aggregate.columns = ['_'.join(x) for x in x_na_aggregate.columns]

        #Drop columns with too many na
        chunk.drop(self.columns_drop, axis=1, inplace=True)
        
        #fill NA
        numeric_means = self.aggregate_stats['mean'].to_dict()
        numeric_means = { x:numeric_means[x] for x in numeric_means if  x.startswith('R_') or x.startswith('D_')}
        print('filling na')
        chunk.fillna(numeric_means, inplace=True)
        chunk.fillna(self.categorical_mode, inplace=True)
        print('filling 0')
        chunk.fillna(0, inplace=True) #fill remaining with 0

        #encode categorical values
        chunk.replace(self.categorical_encode, inplace=True)
        
        #set numeric datatype
        update_dtypes_numeric = {x: self.numeric_dtype for x in self.numeric_columns}
        chunk = chunk.astype(update_dtypes_numeric)
        
        update_dtypes_categorical = {x: np.int16 for x in self.categorical_columns}
        chunk = chunk.astype(update_dtypes_categorical)
                
        #clip outliers to max/min values
        n_deviations_limit = 10
        upper_limit = list(self.aggregate_stats['mean'] + self.aggregate_stats['std'] * n_deviations_limit)
        lower_limit = list(self.aggregate_stats['mean'] - self.aggregate_stats['std'] * n_deviations_limit)
        chunk[self.numeric_columns] = chunk[self.numeric_columns].clip(lower_limit,upper_limit, axis=1)
        
        #Remove noise from float columns ie turn float -> int
        for column_conversion in self.int_scaling: #column_conversion -> (column_name, add_value, multiply_value)
            chunk[column_conversion[0]] = ((chunk[column_conversion[0]] +  column_conversion[1])*column_conversion[2]).round(0).astype(np.int16)
#             if column_conversion[0] =='S_13':
#                 print(chunk[column_conversion[0]].unique())

        
        #create aggregates for each customer_id numeric columns
        print('Doing Aggregates')
        x_aggregate = chunk.groupby("customer_ID")[self.numeric_columns].agg(['first', 'mean', 'std', 'min', 'max', 'last', lm_diff])
        x_aggregate.columns = ['_'.join(x) for x in x_aggregate.columns]
        #fill std columns with 0 incase of only one value as it results in nan
        std_fill_na_columns = {x:0 for x in x_aggregate.columns if '_std' in x}
        x_aggregate.fillna(std_fill_na_columns, inplace=True)
        
        #create aggregates for each customer_id categorical columns
        x_aggregate_category = chunk.groupby("customer_ID")[self.categorical_columns].agg(['first', 'last']) #,'nunique''count',  removed because dont seem to add much value
        x_aggregate_category.columns = ['_'.join(x) for x in x_aggregate_category.columns]
        
        #Get the number of months a record has been active
        chunk['S_2'] = pd.to_datetime(chunk['S_2'])
        x_date = chunk.groupby("customer_ID")['S_2'].agg(['first','last'])
        x_aggregate['months'] = (x_date['last'].dt.year  - x_date['first'].dt.year)*12 + (x_date['last'].dt.month  - x_date['first'].dt.month)+1
        
        del chunk, x_date
        gc.collect()

        #Features for how metrics have changed over time 
        for column in  x_aggregate:
            if 'first' in column:
                column_first = column
                column = column.replace('_first', '')
                column_last = column +'_last'

                x_aggregate[column+'_change_sub'] = (x_aggregate[column_first] - x_aggregate[column_last])/ x_aggregate['months']
                x_aggregate[column+'_change_div'] = ((x_aggregate[column_first] / x_aggregate[column_last])/ x_aggregate['months']).replace([np.inf, -np.inf], np.nan).fillna(0)
        
        
        # #create ratio features between payment and spending
        # payment_features = [col.replace('_first', '') for col in x_aggregate if col.startswith('P_') and col.endswith('_first')]
        # spend_features = [col.replace('_first', '') for col in x_aggregate if col.startswith('S_') and col.endswith('_first')]
        
        # column_suffix = ['_mean', '_last']
        # for p_column in payment_features:
        #     for s_column in spend_features:
        #         column_name = f'{p_column}_{s_column}'
        #         for suffix in column_suffix:
        #             x_aggregate[column_name + '_ratio' + suffix] = (x_aggregate[p_column+suffix] / x_aggregate[s_column+suffix]).replace([np.inf, -np.inf], np.nan).fillna(0)
        #             x_aggregate[column_name + '_sub' + suffix] = (x_aggregate[p_column+suffix] - x_aggregate[s_column+suffix])
                

        #create combination features
        features = ['B_3','B_1','B_37','B_9','B_2','B_7','B_18','D_48','D_44','D_39','P_2','P_3','P_4','R_1','R_2','R_3','S_3','S_23','S_7']
        column_suffix = '_last'
        for p_column in features:
            #print(p_column)
            for s_column in features:
                column_name = f'{p_column}_{s_column}'
                x_aggregate[column_name + '_multi' + column_suffix] = (x_aggregate[p_column+column_suffix] * x_aggregate[s_column+column_suffix])
                if p_column != s_column:
                    x_aggregate[column_name + '_ratio' + column_suffix] = (x_aggregate[p_column+column_suffix] / x_aggregate[s_column+column_suffix]).replace([np.inf, -np.inf], np.nan).fillna(0)
                    x_aggregate[column_name + '_sub' + column_suffix] = (x_aggregate[p_column+column_suffix] - x_aggregate[s_column+column_suffix])


        update_cols = {x: np.float32 for x in x_aggregate.select_dtypes(include=[float]).columns}
        x_aggregate = x_aggregate.astype(update_cols)
        
        #feature for has the category change over time
        for column in  x_aggregate_category:
            if 'first' in column:
                column_first = column
                column = column.replace('_first', '')
                column_last = column +'_last'

                x_aggregate_category[column+'_change'] = (x_aggregate_category[column_first] == x_aggregate_category[column_last]).astype(np.int16)
                
        chunk = pd.concat([x_aggregate, x_aggregate_category, x_na_aggregate], axis=1)
        
        #drop first aggregate
        columns_drop = [x for x in chunk.columns if '_first' in x]
        chunk.drop(columns_drop, axis=1, inplace=True)

        return chunk
    
    def print_memory_usage(self, data, metric='MB', label=''):
        metric_mapping = {'B':0, 'KB': 2, 'MB': 2, 'GB': 3}
        multiplier= metric_mapping[metric]
        memory = data.memory_usage().sum() / (1024**multiplier)
        print(f'Memory usage {label}: {memory:.2f}{metric}')

    
    def compress(self, data):
        INT8_MIN    = np.iinfo(np.int8).min
        INT8_MAX    = np.iinfo(np.int8).max
        INT16_MIN   = np.iinfo(np.int16).min
        INT16_MAX   = np.iinfo(np.int16).max
        INT32_MIN   = np.iinfo(np.int32).min
        INT32_MAX   = np.iinfo(np.int32).max

        FLOAT16_MIN = np.finfo(np.float16).min
        FLOAT16_MAX = np.finfo(np.float16).max
        FLOAT32_MIN = np.finfo(np.float32).min
        FLOAT32_MAX = np.finfo(np.float32).max
        column_dtypes = {}
        for col in data.columns:
            col_dtype = data[col][:100].dtype

            if col_dtype != 'object':
                col_series = data[col]
                col_min = col_series.min()
                col_max = col_series.max()

                if col_dtype == 'float64':
#                     if (col_min > FLOAT16_MIN) and (col_max < FLOAT16_MAX):
#                         column_dtypes[col] = np.float16
#                     elif
                    if (col_min > FLOAT32_MIN) and (col_max < FLOAT32_MAX):
                        column_dtypes[col] = np.float32
                    else:
                        pass


                if col_dtype == 'int64':
                    if (col_min > INT8_MIN/2) and (col_max < INT8_MAX/2):
                        column_dtypes[col] = np.int8
                    elif (col_min > INT16_MIN) and (col_max < INT16_MAX):
                        column_dtypes[col] = np.int16
                    elif (col_min > INT32_MIN) and (col_max < INT32_MAX):
                        column_dtypes[col] = np.int32
                    else:
                        pass
        return column_dtypes
    
    


## Import Data in batches

In [6]:
# x_input = a_amex_helper.read_data(filename, first_chunk_only=True)
# y_input = pd.read_csv(filename_label)
# y_input = y_input.loc[x_input.index, :]
# gc.collect()

In [25]:
with open('helper_object.pickle', 'rb') as handle:
    a_amex_helper = pickle.load(handle)

In [8]:
memory_usage("GB",1)
x_input = pd.read_parquet('x_input.parquet')
y_input = pd.read_parquet('y_input.parquet')
y_input = y_input['target']
gc.collect()
memory_usage("GB",1)

Memory used Process: 0.2GB Memory used Total: 18.5GB
Memory used Process: 8.0GB Memory used Total: 26.3GB


8.025642395019531

## Modelling Functions

In [9]:
def custom_metric(actual, preds):
    return 1 - amex_metric_numpy(actual,preds)

def custom_metric_lgbm(actual, preds):
    return 'amex_metric', amex_metric_numpy(actual,preds), True

def save_models(models, name):
    for i, model in enumerate(models):
        print(f'saving model_{name}_{i}')
        joblib.dump(model, model_save_path+f'model_{name}_fold_{i}')

def kfold_training(model, x_input, y_input, print_messages=True, fit_params ={},  splits= 5, first_fold_only=False):
    skf = StratifiedKFold(n_splits=splits, random_state=random_state, shuffle=True)
    models = []
    scores = []
    if print_messages:
        print('starting cross validation training')
    for fold, (train_index, test_index)in enumerate(skf.split(x_input, y_input)):
        if print_messages:
            print('*'*100)
            print(' '*20 + f'Fold: {fold}' )
            print('*'*100)
        x_train, y_train = x_input.iloc[train_index], y_input.iloc[train_index]
        x_test, y_test = x_input.iloc[test_index], y_input.iloc[test_index]
        
        
        model_iter = clone(model)
        model_iter.fit(x_train, y_train, eval_set=[(x_test, y_test)], **fit_params)
        preds = model_iter.predict_proba(x_test)[:,1]

        score = amex_metric_numpy(y_test.to_numpy(), preds)
        scores.append(score)
        models.append(model_iter)
        joblib.dump(model_iter, model_save_path+f'model_temp_training_fold_{fold}')
        
        if print_messages:
            print(f'fold: {fold}')
            eval_model_classification_report( x_test, y_test, model_iter)
            print('\n'*3)

        
        if first_fold_only:
            break
    
    overall_score = np.array(scores).mean()
    
    
    if print_messages:
        print(f'Cross Validation score: {overall_score}')
    
    
    
    return overall_score, models

def get_feature_importance_lgbm(model):
    importance_df = (
        pd.DataFrame({
            'feature_name': model.booster_.feature_name(),
            'importance_gain': model.booster_.feature_importance(importance_type='gain'),
            'importance_split': model.booster_.feature_importance(importance_type='split'),
        })
        .sort_values('importance_gain', ascending=False)
        .reset_index(drop=True)
    )
    return importance_df

#for testing kfold training
# model = XGBClassifier()
# fit_params ={'verbose':False}
# kfold_training(model,x_input.iloc[:1000],y_input.iloc[:1000], fit_params=fit_params)

## **XGB Modeling**

In [10]:


# def objective_tune(trial: Trial, x_train, y_train, x_test, y_test) -> float:

#     param = {
#                 "n_estimators" : trial.suggest_int('n_estimators', 100, 2600, log=True),
#                 'tree_method': 'gpu_hist',
#                 'max_delta_step' : trial.suggest_int('max_delta_step', 1, 10),
#                 'max_depth':trial.suggest_int('max_depth', 3, 10),
#                 'min_child_weight':trial.suggest_int('min_child_weight', 0, 5),
#                 'gamma':trial.suggest_discrete_uniform('gamma', 0, 5, 0.5),
#                 'learning_rate':trial.suggest_loguniform('learning_rate',0.05,0.3),
#                 'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.5,0.9,.1),
#                 'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
#                 'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
#                 'eta':trial.suggest_loguniform('eta', 1e-8, 1.0),
#                 'n_jobs' : -1,
#                 'eval_metric' : custom_metric,
#                 'objective' : 'binary:logistic',
#                 'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 5),
#                 'random_state': 2022
                 
#             }
#     model = XGBClassifier(**param)

#     model.fit(x_train,y_train)
    
#     preds = model.predict_proba(x_test)[:,1]
#     actual = np.array(y_test).squeeze()
    
#     return a_amex_helper.amex_metric_numpy(actual,preds)


# print('Parameter Tuning is starting now...')
# study = optuna.create_study(direction='maximize',sampler=TPESampler(),)
# study.optimize(lambda trial : objective_tune(trial,x_train, y_train, x_test, y_test),n_trials= 50)
# print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

In [11]:
fixed_params = {
             #'n_jobs' : -1,
             #'objective':'aucpr',
             #'eval_metric' : 'aucpr',
             'eval_metric' : custom_metric,
             'early_stopping_rounds' : 500,
             'objective' : 'binary:logistic',
            'random_state': random_state,
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            }

search_params = {
    'n_estimators': 4000, 
    'max_delta_step': 4, 
    'max_depth': 4, 
    'min_child_weight': 3, 
    'gamma': 3.0, 
    'learning_rate': 0.059, 
    'colsample_bytree': 0.5, 
    'lambda': 0.332, 
    'alpha': 0.018, 
    'scale_pos_weight': 1}

# params = {
#         'max_depth': 7,
#         'eta': 0.03,
#         'subsample': 0.88,
#         'colsample_bytree': 0.5,
#         'gamma': 1.5,
#         'min_child_weight': 8,
#         'lambda': 70,
#     }
#model = XGBClassifier(**fixed_params, **search_params)


# model.fit(x_train, y_train, eval_set=eval_set, verbose=True)
# eval_model_classification_report( x_test, y_test, model)
# joblib.dump(model, model_save_path+'xgb')


# model = XGBClassifier(**fixed_params, **search_params)
# fit_params ={'verbose':False}
# score, models = kfold_training(model,x_input,y_input, fit_params=fit_params)
# save_models(models, 'xgb')

In [12]:
# model = XGBClassifier(**fixed_params)
# fit_params ={'verbose':False}
# score, models = kfold_training(model,x_input,y_input, fit_params=fit_params)


In [13]:
# frames = []
# for model in models:
#     gain = model.get_booster().get_score(importance_type='gain')
#     gain = pd.DataFrame.from_dict(gain, orient='index', columns=['gain'])
#     frames.append(gain)
# importance = pd.concat(frames)
# importance = importance.groupby(level=0).sum()/len(models)
# importance = importance.sort_values('gain',ascending=False)
# #importance.agg(['min','max','mean','std'])
# mean = importance.mean()[0]
# print(mean)
# model_select = importance[importance['gain']>mean]
# top_features= list(model_select.reset_index()['index'])

In [14]:
# model = XGBClassifier(**fixed_params, **search_params)
# fit_params ={'verbose':False}
# score, models = kfold_training(model, x_input[top_features], y_input, fit_params=fit_params)
# save_models(models, 'xgb')

In [15]:
#Dart Testing
# param = {
#     'booster': 'dart',
#     'tree_method': 'gpu_hist',
#     'n_estimators': 1056,
#     'max_depth': 5, 
#     'learning_rate': 0.1,
#     'objective': 'binary:logistic',
#     'sample_type': 'uniform',
#     'normalize_type': 'tree',
#     'rate_drop': 0.1,
#     'skip_drop': 0.5}

# fit_params ={'verbose':False}
# model = XGBClassifier(**param)
# score, models = kfold_training(model,x_input,y_input, fit_params=fit_params)
#eval_model_classification_report(x_input,y_input, models)

## LGB Modelling

In [16]:
model = LGBMClassifier(n_estimators = 500)
model.fit(x_input.iloc[:-100000],y_input.iloc[:-100000])
eval_model_classification_report( x_input.iloc[-99999:], y_input.iloc[-99999:], model)

Validation Results - Amex metric: 0.786 

Classification Report

              precision    recall  f1-score   support

  No Default       0.93      0.93      0.93     74282
     Default       0.81      0.81      0.81     25717

    accuracy                           0.90     99999
   macro avg       0.87      0.87      0.87     99999
weighted avg       0.90      0.90      0.90     99999



Confusion Matrix

          pred:yes  pred:no
true:yes     69430     4852
true:no       4962    20755


In [17]:

# compute importances
def get_feature_importance_lgbm(model):
    importance_df = (
        pd.DataFrame({
            'feature_name': model.booster_.feature_name(),
            'importance_gain': model.booster_.feature_importance(importance_type='gain'),
            'importance_split': model.booster_.feature_importance(importance_type='split'),
        })
        .sort_values('importance_gain', ascending=False)
        .reset_index(drop=True)
    )
    return importance_df


def get_feature_importance_lgbm_models(models):
    feature_importance = []
    for i, model in enumerate(models):
        feature_importance_fold = get_feature_importance_lgbm(model)
        feature_importance_fold.to_csv(f'Tableau\\feature_importance_{i}.csv')
        feature_importance.append(feature_importance_fold)
    feature_importance = pd.concat(feature_importance)
    feature_importance = feature_importance.groupby('feature_name').sum().sort_values('importance_gain', ascending=False)
    feature_importance['folds'] = len(models)
    feature_importance['max_gain'] = feature_importance['importance_gain'].max()
    feature_importance['total_gain'] = feature_importance['importance_gain'].sum()
    feature_importance['percent_of_max'] = feature_importance['importance_gain'] / feature_importance['max_gain']
    feature_importance['percent_of_total'] = feature_importance['importance_gain'] / feature_importance['total_gain']
    feature_importance['percent_of_total_cumulative'] = feature_importance['percent_of_total'].cumsum()
    
    return feature_importance

In [18]:
models_single = [model]
feature_importance = get_feature_importance_lgbm_models(models_single)
feature_importance

Unnamed: 0_level_0,importance_gain,importance_split,folds,max_gain,total_gain,percent_of_max,percent_of_total,percent_of_total_cumulative
feature_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
B_1_D_44_multi_last,269946.110946,12,1,269946.110946,1.540923e+06,1.000000,0.175185,0.175185
D_44_B_2_sub_last,115787.386146,8,1,269946.110946,1.540923e+06,0.428928,0.075142,0.250326
D_44_B_2_ratio_last,103434.938438,12,1,269946.110946,1.540923e+06,0.383169,0.067125,0.317452
B_1_P_2_sub_last,95621.583405,5,1,269946.110946,1.540923e+06,0.354225,0.062055,0.379506
P_2_B_1_sub_last,59811.817690,8,1,269946.110946,1.540923e+06,0.221569,0.038816,0.418322
...,...,...,...,...,...,...,...,...
P_3_B_18_multi_last,0.000000,0,1,269946.110946,1.540923e+06,0.000000,0.000000,1.000000
P_3_B_1_multi_last,0.000000,0,1,269946.110946,1.540923e+06,0.000000,0.000000,1.000000
P_3_B_1_sub_last,0.000000,0,1,269946.110946,1.540923e+06,0.000000,0.000000,1.000000
P_3_B_2_multi_last,0.000000,0,1,269946.110946,1.540923e+06,0.000000,0.000000,1.000000


In [None]:
fixed_params ={
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'dart',
    #'device': 'gpu',
    'histogram_pool_size' : 2000
}

search_params ={
    'num_leaves': 100,
    'learning_rate': 0.01,
    'colsample_bytree': 0.20,
    'subsample_freq': 10,
    'subsample': 0.50,
    'reg_lambda': 2,
    'min_child_samples': 40,
    'n_estimators': 500
}

cut_offs = [0.8, 0.85, 0.9, 9.5]
scores =[]
for cut_off in cut_offs:
    features = list(feature_importance[feature_importance['percent_of_total_cumulative']<cut_off].reset_index()['feature_name'])

    model = LGBMClassifier(**fixed_params, **search_params)
    fit_params = {'callbacks': [log_evaluation(period=1000)], 'eval_metric': custom_metric_lgbm}

    score, _ = kfold_training(model, x_input[features], y_input, fit_params=fit_params)
    scores.append(score)

In [None]:
scores
#0.7625

[0.7623336149608031,
 0.7624564944340936,
 0.7626718636848588,
 0.7624094427285708]

In [None]:
#I think the memory consuming here is used for the histogram cache. it needs about num_leaves * 20Bytes * num_features * num_bins.

def objective_tune(trial: Trial) -> float:
    search_params = { 
        'learning_rate' : trial.suggest_loguniform('learning_rate', .01, 0.3),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 100, 10000, step=100),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 15),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1.0),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 15),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000, log=True),
    }

    fixed_params={
        'objective': 'binary',
        'metric': 'None',
        #'eval_metric': custom_metric,
        'boosting_type' : 'dart',
        'force_row_wise' : True,
        #'device': 'gpu',
        'max_bin': 255,
        'random_state' : random_state,
        'extra_trees' : True,
        'feature_pre_filter': False,
        'histogram_pool_size' : 2000,
        'early_stopping_round': 5
    }
    model = LGBMClassifier(**fixed_params, **search_params)

    y_train_numpy = np.array(y_train).squeeze()
    y_test_numpy = np.array(y_test).squeeze()
    
    model.fit(x_train,y_train_numpy, eval_set=[(x_test, y_test_numpy)], eval_metric=custom_metric_lgbm, callbacks =[log_evaluation(period=0)])
    
    preds = model.predict_proba(x_test)[:,1]
    
    gc.collect()
    return amex_metric_numpy(y_test_numpy,preds)


# print('Parameter Tuning is starting now...')
# study = optuna.create_study(direction='maximize',sampler=TPESampler())
# study.optimize(lambda trial : objective_tune(trial),n_trials= 70)
# print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

In [20]:
fixed_params ={
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'dart',
    #'device': 'gpu',
    'histogram_pool_size' : 2000
}

search_params ={
    'num_leaves': 100,
    'learning_rate': 0.01,
    'colsample_bytree': 0.20,
    'subsample_freq': 10,
    'subsample': 0.50,
    'reg_lambda': 2,
    'min_child_samples': 40,
    'n_estimators': 10500
}



model = LGBMClassifier(**fixed_params, **search_params)
fit_params = {'callbacks': [log_evaluation(period=1000)], 'eval_metric': custom_metric_lgbm}

score, models = kfold_training(model, x_input, y_input, fit_params=fit_params)
# model.fit(x_train,y_train, eval_set=[(x_test, y_test)], eval_metric=custom_metric_lgbm, callbacks =[log_evaluation(period=0)], verbose=10)
#eval_model_classification_report( x_input, y_input, model)


starting cross validation training
****************************************************************************************************
                    Fold: 0
****************************************************************************************************
[1000]	valid_0's binary_logloss: 0.248942	valid_0's amex_metric: 0.778506
[2000]	valid_0's binary_logloss: 0.226258	valid_0's amex_metric: 0.787849
[3000]	valid_0's binary_logloss: 0.220707	valid_0's amex_metric: 0.792286
[4000]	valid_0's binary_logloss: 0.21847	valid_0's amex_metric: 0.793615
[5000]	valid_0's binary_logloss: 0.217641	valid_0's amex_metric: 0.79406
[6000]	valid_0's binary_logloss: 0.21692	valid_0's amex_metric: 0.795695
[7000]	valid_0's binary_logloss: 0.216486	valid_0's amex_metric: 0.795777
[8000]	valid_0's binary_logloss: 0.216227	valid_0's amex_metric: 0.796283
[9000]	valid_0's binary_logloss: 0.216098	valid_0's amex_metric: 0.7965
[10000]	valid_0's binary_logloss: 0.216019	valid_0's amex_metric: 0.796267

In [26]:
feature_importance = get_feature_importance_lgbm_models(models)

In [27]:
del x_input, y_input
gc.collect()

44247

In [22]:
save_models(models, 'lgb_10k_2')

saving model_lgb_10k_2_0
saving model_lgb_10k_2_1
saving model_lgb_10k_2_2
saving model_lgb_10k_2_3
saving model_lgb_10k_2_4


In [28]:
predictions = []
predictions_single = []
first_chunk_only= False
a_amex_helper.chunksize = 2000000
with pd.read_csv(filename_test, chunksize=a_amex_helper.chunksize) as reader:
    for i, chunk in enumerate(reader):
        print(f'Reading chunk: {i+1}')
        
        
        x_score = a_amex_helper.process_chunk(chunk)
        x_score = x_score.reset_index()
        output = pd.DataFrame(x_score['customer_ID'].copy())
        output_single = pd.DataFrame(x_score['customer_ID'].copy())
        x_score.drop('customer_ID', axis=1, inplace=True)
        
        
        #score each of models from cross validation
        best_iter = 0
        score = np.zeros(len(x_score))
        for i, model in enumerate(models): 
            score += model.predict_proba(x_score)[:,1]
            if i == best_iter:
                output_single["prediction"] = model.predict_proba(x_score)[:,1]

        #weight each one equally
        score /= len(models)
        output["prediction"] = score
       
        #x_score = column_transformer.transform(x_score)
        
        
        predictions.append(output)
        predictions_single.append(output_single)

        if first_chunk_only:
            break
        
predictions_single=pd.concat(predictions_single).sort_index().reset_index(drop=True)        
predictions=pd.concat(predictions).sort_index().reset_index(drop=True)

Reading chunk: 1
filling na
filling 0
Doing Aggregates
Reading chunk: 2
filling na
filling 0
Doing Aggregates
Reading chunk: 3
filling na
filling 0
Doing Aggregates
Reading chunk: 4
filling na
filling 0
Doing Aggregates
Reading chunk: 5
filling na
filling 0
Doing Aggregates
Reading chunk: 6
filling na
filling 0
Doing Aggregates


In [None]:
#924621
len(predictions)

In [None]:
predictions.to_csv("submission.csv", index=False)
predictions_single.to_csv("submission_best_iter.csv", index=False)