In [1]:
from functools import partial

import datetime
import lightgbm as lgb
import numpy as np
import os
import pandas as pd
import pickle
import random
import time

from copy import deepcopy
from scipy.stats import rankdata
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

In [2]:
def seed_everything(seed=13):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
    
def read_from_disk(path, filename):
    with open(os.path.join(path, filename), 'rb') as handle:
        return pickle.load(handle)
    
    
def save_to_disk(obj, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def timedelta(seconds):
    return str(datetime.timedelta(seconds=seconds)).split('.')[0]

# Targets

In [3]:
os.listdir('../input')

['fraud-lgb-kfold-newfeatures-seed-44',
 'fraud-fs-fe9477-no-day',
 'fraud-dima-944193',
 'fraud-artem-new-val-no-day',
 'fraud-kirill-predictions-9438',
 'fraud-dima-942747',
 'ieee-fraud-detection',
 'fraud-lgb-kfold-newfeatures-seed-111',
 'fraud-dima-937684',
 'fraud-artem-new-val-seed-99-no-day',
 'fraud-dima-cb-hope-v1-944414',
 'fraud-dima-944053',
 'fraud-lgb-kfold-newfeatures',
 'fraud-fs-fe9477-no-day-seed-144',
 'fraud-misha-9436',
 'fraud-fs-fe9477-no-day-seed-133',
 'fraud-catboost-dima-943978',
 'fraud-dima-cb-hope-v2-943496',
 'fraud-dima-940490',
 'fraud-mix-lgbm-seed-101-no-day',
 'fraud-validation-new',
 'fraud-dima-943354',
 'fraud-mix-lgbm-no-day',
 'fraud-lgb-kfold-newfeatures-seed-122',
 'fraud-dima-943304',
 'fraud-fs-fe9477-no-day-seed-88',
 'fraud-kirill-predictions-9459',
 'fraud-fs-fe9477-catboost-no-cat']

In [4]:
TARGET_PATH = '../input/fraud-validation-new'
NFOLDS = 6

y_val_array = []
for i in range(NFOLDS):
    y_val = read_from_disk(TARGET_PATH, 'y_val_fold{}.pkl'.format(i))
    print('Fold {}:'.format(i), y_val.shape)
    y_val_array.append(y_val.values)

Fold 0: (137321,)
Fold 1: (92585,)
Fold 2: (86021,)
Fold 3: (101632,)
Fold 4: (83655,)
Fold 5: (89326,)


# Model predictions

In [5]:
NUM_MODELS = 26
MODEL_DIRS = [
    '../input/fraud-lgb-kfold-newfeatures',
    '../input/fraud-fs-fe9477-no-day-seed-144',
    '../input/fraud-lgb-kfold-newfeatures-seed-122',
    '../input/fraud-lgb-kfold-newfeatures-seed-44',
    '../input/fraud-lgb-kfold-newfeatures-seed-111',
    '../input/fraud-dima-944193',
    '../input/fraud-fs-fe9477-no-day-seed-88',
    '../input/fraud-dima-944053',
    '../input/fraud-fs-fe9477-no-day-seed-133',
    '../input/fraud-catboost-dima-943978',
    '../input/fraud-fs-fe9477-no-day',
    '../input/fraud-misha-9436',
    '../input/fraud-dima-943354',
    '../input/fraud-dima-943304',
    '../input/fraud-artem-new-val-seed-99-no-day',
    '../input/fraud-artem-new-val-no-day',
    '../input/fraud-dima-942747',
    '../input/fraud-mix-lgbm-no-day',
    '../input/fraud-mix-lgbm-seed-101-no-day',
    '../input/fraud-dima-940490',
    '../input/fraud-catboost-dima-943978',
    '../input/fraud-fs-fe9477-catboost-no-cat',
    '../input/fraud-kirill-predictions-9459',
    '../input/fraud-dima-cb-hope-v1-944414',
    '../input/fraud-kirill-predictions-9438',
    '../input/fraud-dima-cb-hope-v2-943496'
]
cv_scores = [
    0.94450504,
    0.9444475,
    0.9444014,
    0.94436288,
    0.9441978,
    0.944193,
    0.94408984,
    0.944053,
    0.9440478,
    0.943978,
    0.94397184,
    0.9436,
    0.943354,
    0.943304,
    0.94317371,
    0.94314358,
    0.942747,
    0.9426038,
    0.9424485,
    0.940490,
    0.937684,
    0.9325177,
    0.9459,
    0.944414,
    0.9438,
    0.943496
]
MODEL_NAMES = ['model_{}'.format(str(x).split('.')[1]) for x in cv_scores]

val_preds = []
test_preds = []
for i in range(NFOLDS):
    val_preds_fold = pd.DataFrame()
    test_preds_fold = pd.DataFrame()
    print('Fold', i)
    for j in range(NUM_MODELS):
        if 'misha' in MODEL_DIRS[j]:
            y_pred_val = read_from_disk(MODEL_DIRS[j], 'y_val_pred_fold{}.pkl'.format(i))
            y_pred_test = read_from_disk(MODEL_DIRS[j], 'y_test_pred_fold{}.pkl'.format(i))
        else:
            y_pred_val = read_from_disk(MODEL_DIRS[j], 'y_pred_valid_fold{}.pkl'.format(i))
            y_pred_test = read_from_disk(MODEL_DIRS[j], 'y_pred_test_fold{}.pkl'.format(i))

        val_preds_fold['{}_fold{}'.format(MODEL_NAMES[j], i)] = y_pred_val
        test_preds_fold['{}_fold{}'.format(MODEL_NAMES[j], i)] = y_pred_test
        
        print(MODEL_NAMES[j], roc_auc_score(y_val_array[i], y_pred_val))
    val_preds.append(val_preds_fold)
    test_preds.append(test_preds_fold)

Fold 0
model_94450504 0.9192044150287948
model_9444475 0.9229167125109011
model_9444014 0.9174466202015826
model_94436288 0.9182588617651293
model_9441978 0.9182213496872573
model_944193 0.9225717062615205
model_94408984 0.921800849275633
model_944053 0.9225550496691793
model_9440478 0.9218366725308682
model_943978 0.9181286371148675
model_94397184 0.9220750829065565
model_9436 0.9172470316889038
model_943354 0.9194929762772012
model_943304 0.918634742724612
model_94317371 0.9184197947701459
model_94314358 0.9172053775734671
model_942747 0.9203007388809614
model_9426038 0.921639236108881
model_9424485 0.9217879409430095
model_94049 0.9181943980152754
model_937684 0.9181286371148675
model_9325177 0.9133559064202742
model_9459 0.9116326650726736
model_944414 0.9241696672924647
model_9438 0.9135601718345446
model_943496 0.922756312264148
Fold 1
model_94450504 0.944781470049839
model_9444475 0.9455159331722646
model_9444014 0.9445456671173191
model_94436288 0.9444888891723182
model_9441978

# Stacking

In [6]:
def train_results(params, skf, val_preds_fold, y_val):
    cv_scores = []
    models = []
    y_clf_pred = np.zeros_like(y_val)
    for train_index, test_index in skf.split(np.array(val_preds_fold), y_val):
        X_train, X_test = val_preds_fold.iloc[train_index], val_preds_fold.iloc[test_index]
        y_train, y_test = y_val[train_index], y_val[test_index]
        dtrain = lgb.Dataset(X_train, label=y_train)
        dvalid = lgb.Dataset(X_test, label=y_test)
        clf = lgb.train(params, dtrain, valid_sets = [dtrain, dvalid], verbose_eval=500)
        y_test_pred = clf.predict(X_test)
        y_clf_pred[test_index] = y_test_pred
        cv_scores.append(roc_auc_score(y_test, y_test_pred))
        models.append(clf)
    cv_scores = np.array(cv_scores)
#     model = model.fit(val_preds_fold, y_val)
    return cv_scores, models, y_clf_pred

params = {
        'objective':'binary',
        'boosting_type':'gbdt',
        'metric':'auc',
        'n_jobs':-1,
        'max_depth':-1,
        'tree_learner':'serial',
        'min_data_in_leaf':30,
        'n_estimators':1800,
        'max_bin':255,
        'verbose':-1,
        'seed': 1229,
        'learning_rate': 0.01,
        'early_stopping_rounds':200,
        'colsample_bytree': 0.5,          
        'num_leaves': 256, 
        'reg_alpha': 0.35
         }

def models_tuning(
    val_preds_fold,
    y_val,
    n_splits=5
):
    best = {'models': None, 'score_mean': 0, 'score_std': 0}
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=13)
    cv_scores, models, y_clf_pred = train_results(params, skf, val_preds_fold, y_val)
    best['models'] = models
    best['score_mean'] = cv_scores.mean()
    best['score_std'] = cv_scores.std()
    return best, y_clf_pred

In [7]:
tuning_results = []
cv_score = 0
start_time = time.time()
for i in range(NFOLDS):
    start_time_fold = time.time()
    print('#################')
    print('Fold', i)
    best, y_clf_pred = models_tuning(val_preds[i], y_val_array[i])
    tuning_results.append(best)
    cv_score += best['score_mean'] / NFOLDS
    print('Best CV mean:', best['score_mean'])
    print('Best CV std:', best['score_std'])
    print('Time: fold {} | total {}'.format(timedelta(time.time() - start_time_fold), timedelta(time.time() - start_time)))
    save_to_disk(y_clf_pred, 'y_clf_pred_fold{}.pkl'.format(i))
save_to_disk(tuning_results, 'tuning_results.pkl')

#################
Fold 0
Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.999325	valid_1's auc: 0.92964
Early stopping, best iteration is:
[749]	training's auc: 0.999721	valid_1's auc: 0.930214
Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.999248	valid_1's auc: 0.948785
Early stopping, best iteration is:
[475]	training's auc: 0.999186	valid_1's auc: 0.949027
Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.999279	valid_1's auc: 0.952919
Early stopping, best iteration is:
[779]	training's auc: 0.999725	valid_1's auc: 0.953069
Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.999263	valid_1's auc: 0.94754
Early stopping, best iteration is:
[711]	training's auc: 0.99965	valid_1's auc: 0.947907
Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.999281	valid_1's auc: 0.938217
Early stopping, best iterati

In [8]:
print('Final CV score:')
print(cv_score)

Final CV score:
0.9600328079657329


# Submission

In [9]:
sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
y_preds = np.zeros(len(sub))
for i in range(NFOLDS):
    models = tuning_results[i]['models']
    for j in range(len(models)):
        y_preds += models[j].predict(test_preds[i]) / NFOLDS / len(models)
sub['isFraud'] = y_preds
sub.to_csv('submission.csv', index=False)