In [1]:
# safe downcast
def sd(col, max_loss_limit=0.001, avg_loss_limit=0.001, na_loss_limit=0, n_uniq_loss_limit=0, fillna=0):
    """
    max_loss_limit - don't allow any float to lose precision more than this value. Any values are ok for GBT algorithms as long as you don't unique values.
                     See https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Precision_limitations_on_decimal_values_in_[0,_1]
    avg_loss_limit - same but calculates avg throughout the series.
    na_loss_limit - not really useful.
    n_uniq_loss_limit - very important parameter. If you have a float field with very high cardinality you can set this value to something like n_records * 0.01 in order to allow some field relaxing.
    """
    is_float = str(col.dtypes)[:5] == 'float'
    na_count = col.isna().sum()
    n_uniq = col.nunique(dropna=False)
    try_types = ['float16', 'float32']

    if na_count <= na_loss_limit:
        try_types = ['int8', 'int16', 'float16', 'int32', 'float32']

    for type in try_types:
        col_tmp = col

        # float to int conversion => try to round to minimize casting error
        if is_float and (str(type)[:3] == 'int'):
            col_tmp = col_tmp.copy().fillna(fillna).round()

        col_tmp = col_tmp.astype(type)
        max_loss = (col_tmp - col).abs().max()
        avg_loss = (col_tmp - col).abs().mean()
        na_loss = np.abs(na_count - col_tmp.isna().sum())
        n_uniq_loss = np.abs(n_uniq - col_tmp.nunique(dropna=False))

        if max_loss <= max_loss_limit and avg_loss <= avg_loss_limit and na_loss <= na_loss_limit and n_uniq_loss <= n_uniq_loss_limit:
            return col_tmp

    # field can't be converted
    return col


def reduce_mem_usage(df, deep=True, verbose=False, obj_to_cat=False):
    numerics = ['int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=deep).sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes

        # collect stats
        na_count = df[col].isna().sum()
        n_uniq = df[col].nunique(dropna=False)
        
        # numerics
        if col_type in numerics:
            df[col] = sd(df[col])

        # strings
        if (col_type == 'object') and obj_to_cat:
            df[col] = df[col].astype('category')
        
        if verbose:
            print(f'Column {col}: {col_type} -> {df[col].dtypes}, na_count={na_count}, n_uniq={n_uniq}')
        new_na_count = df[col].isna().sum()
        if (na_count != new_na_count):
            print(f'Warning: column {col}, {col_type} -> {df[col].dtypes} lost na values. Before: {na_count}, after: {new_na_count}')
        new_n_uniq = df[col].nunique(dropna=False)
        if (n_uniq != new_n_uniq):
            print(f'Warning: column {col}, {col_type} -> {df[col].dtypes} lost unique values. Before: {n_uniq}, after: {new_n_uniq}')

    end_mem = df.memory_usage(deep=deep).sum() / 1024 ** 2
    percent = 100 * (start_mem - end_mem) / start_mem
    if verbose:
        print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, percent))
    return df

 From notebooks of
 https://www.kaggle.com/xhlulu/ieee-fraud-xgboost-with-gpu-fit-in-40s  
 https://www.kaggle.com/davidcairuz/feature-engineering-lightgbm
https://www.kaggle.com/kyakovlev/ieee-fe-with-some-eda

In [2]:
print('loading libs...')
import warnings
warnings.filterwarnings("ignore")
import os
import gc
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import datetime
import time

import pickle
def save_to_disk(obj, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

import matplotlib.pyplot as plt

%matplotlib inline

loading libs...


In [3]:
%%time
print('loading data...')
train = pd.read_pickle('../input/ieee-fe-with-some-eda/train_df.pkl')
test = pd.read_pickle('../input/ieee-fe-with-some-eda/test_df.pkl')
remove_features = pd.read_pickle('../input/ieee-fe-with-some-eda/remove_features.pkl')
sample_submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
print('done')


loading data...
done
CPU times: user 2.98 s, sys: 12.9 s, total: 15.9 s
Wall time: 16.6 s


In [4]:
%%time
print('dropping target...')
y_train = train['isFraud'].copy()
X_train = train.drop('isFraud', axis=1)
X_test = test.copy()
train_cols = list(train.columns)
del train, test
gc.collect()
print('selecting features...')
remove_features = list(remove_features['features_to_remove'].values)
features_columns = [col for col in train_cols if col not in set(remove_features) - set(['TransactionDT'])]
X_train = X_train[features_columns]
X_test = X_test[features_columns]
print('Done')

dropping target...
selecting features...
Done
CPU times: user 2.93 s, sys: 15.7 s, total: 18.6 s
Wall time: 11.4 s


In [5]:
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

In [6]:
params = {
          'objective':'binary',
          'boosting_type':'gbdt',
          'metric':'auc',
          'n_jobs':-1,
          'max_depth':-1,
          'tree_learner':'serial',
          'min_data_in_leaf':30,
          'n_estimators':1800,
          'max_bin':255,
          'verbose':-1,
          'seed': 122,
          'learning_rate': 0.01,
          'early_stopping_rounds':200,
          'colsample_bytree': 0.5,          
          'num_leaves': 256, 
          'reg_alpha': 0.35, 
         }

In [7]:
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')
TransactionDT1 = X_train['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
dt_m = TransactionDT1.dt.date.astype('str').str[:7]
X_train = X_train.drop(['TransactionDT'], axis=1)
X_test = X_test.drop(['TransactionDT'], axis=1)
X_train.shape, X_test.shape

((590540, 772), (506691, 772))

In [8]:
%%time

y_preds = np.zeros(X_test.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = X_train.columns

NFOLDS = len(np.unique(dt_m)) # 6
  
for fold_n, month in enumerate(np.unique(dt_m)):
    X_tr, X_val = X_train[~dt_m.isin([month])], X_train[dt_m.isin([month])]
    y_tr, y_val = y_train[~dt_m.isin([month])], y_train[dt_m.isin([month])]  
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dvalid = lgb.Dataset(X_val, label=y_val)
    clf = lgb.train(params, dtrain,  valid_sets = [dtrain, dvalid], verbose_eval=500)      
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_val)
    save_to_disk(y_pred_valid, 'y_pred_valid_fold{}.pkl'.format(fold_n))
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_val, y_pred_valid)}")   
    
    score += roc_auc_score(y_val, y_pred_valid) / NFOLDS
    
    y_pred_test = clf.predict(X_test)
    save_to_disk(y_pred_test, 'y_pred_test_fold{}.pkl'.format(fold_n))
    y_preds += y_pred_test / NFOLDS

    del X_tr, X_val, y_tr, y_val
    gc.collect()

print(f"\nMean AUC = {score}")

Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.996253	valid_1's auc: 0.908897
[1000]	training's auc: 0.999898	valid_1's auc: 0.9163
[1500]	training's auc: 0.999998	valid_1's auc: 0.917166
Early stopping, best iteration is:
[1389]	training's auc: 0.999995	valid_1's auc: 0.917532
Fold 1 | AUC: 0.9174466202015826
Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.996012	valid_1's auc: 0.940146
[1000]	training's auc: 0.999877	valid_1's auc: 0.94411
[1500]	training's auc: 0.999997	valid_1's auc: 0.944319
Early stopping, best iteration is:
[1347]	training's auc: 0.999989	valid_1's auc: 0.944538
Fold 2 | AUC: 0.9445456671173191
Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.995733	valid_1's auc: 0.94875
[1000]	training's auc: 0.999856	valid_1's auc: 0.950996
[1500]	training's auc: 0.999997	valid_1's auc: 0.951074
Early stopping, best iteration is:
[1321]	training's auc: 0.999986

# Submission

In [9]:
sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
sub['isFraud'] = y_preds
sub.to_csv('submission.csv', index=False)

In [10]:
feature_importances['average'] = feature_importances[[f'fold_{fold_n + 1}' for fold_n in range(NFOLDS)]].mean(axis=1)
feature_importances.to_csv('feature_importances.csv')

plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(50), x='average', y='feature');
plt.title('50 TOP feature importance over {} folds average'.format(NFOLDS));

NameError: name 'sns' is not defined

<Figure size 1152x1152 with 0 Axes>