In [1]:
import numpy as np 
import gc
import pandas as pd
pd.set_option("display.max_columns", 999)
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import multiprocessing

from scipy import linalg,stats

from sklearn.preprocessing import StandardScaler, normalize
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.model_selection import KFold, ShuffleSplit, StratifiedShuffleSplit, train_test_split, StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from time import time, ctime

import xgboost
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import f1_score, roc_auc_score

RANDOM_STATE = 12061985
np.random.seed(RANDOM_STATE)

# HPO
from skopt.space import Integer, Categorical, Real
from skopt.utils import use_named_args
from skopt import gp_minimize, gbrt_minimize, forest_minimize
from skopt.plots import plot_convergence
from skopt.callbacks import DeltaXStopper, DeadlineStopper, DeltaYStopper

from skopt.callbacks import EarlyStopper



## Custom methods

In [2]:
def simple_FS(threshold, train, test):
    """Remove features that are highly correlated with an earlier feature.

    The absolute pairwise correlation matrix of ``train`` is computed and
    only its strict upper triangle is inspected, so for every pair of
    columns whose absolute correlation exceeds ``threshold`` the column
    that appears later in ``train`` is the one flagged for removal.  The
    flagged columns are dropped from both ``train`` and ``test``.

    Parameters
    ----------
    threshold : float
        Absolute correlation above which a column is dropped.
    train : pandas.DataFrame
        Frame used to compute the correlations.
    test : pandas.DataFrame
        Frame that must contain every column dropped from ``train``.

    Returns
    -------
    list
        ``[train, test, to_drop]`` - the two reduced frames (new objects,
        the inputs are not modified) and the list of removed column names.
    """
    corr_matrix = train.corr().abs()

    # Builtin ``bool`` rather than ``np.bool``: the alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # NaN entries (masked lower triangle / undefined correlations) compare
    # False, so they never trigger a drop.
    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
    print('\nThere are %d columns to remove.' % (len(to_drop)))

    train = train.drop(columns=to_drop)
    test = test.drop(columns=to_drop)
    print(f'After dropping {train.shape[1]}' + ' features remain')
    return [train, test, to_drop]

def get_params_SKopt(model, X, Y, space, cv_search, opt_method = 'gbrt_minimize', verbose = True,  multi = False, scoring = 'neg_mean_squared_error', n_best = 50, total_time = 7200):
    """Search hyper-parameters of ``model`` with a scikit-optimize minimiser.

    The objective sets the candidate parameters on ``model`` (the estimator
    is mutated in place), runs ``cross_val_score`` and returns the negated
    mean score, so the minimiser maximises ``scoring``.

    Parameters
    ----------
    model : estimator
        Object exposing ``set_params`` and usable by ``cross_val_score``.
    X, Y : array-like
        Features and target passed to ``cross_val_score``.
    space : list
        skopt dimensions; each dimension's ``name`` is the parameter name.
    cv_search : cross-validation splitter
        Passed as ``cv`` to ``cross_val_score``.
    opt_method : {'gbrt_minimize', 'forest_minimize', 'gp_minimize'}
        Which skopt minimiser to run.
    verbose : bool
        Forwarded to the minimiser.
    multi : bool
        If true, dimension names are expected to look like
        ``'<prefix>__<param>'`` and only ``<param>`` is used as the key of
        the returned dict.
    scoring : str
        Scorer name for ``cross_val_score``.
    n_best : int
        Patience forwarded to ``RepeatedMinStopper``.
    total_time : number
        Budget forwarded to ``DeadlineStopper``.

    Returns
    -------
    list
        ``[TUNED_PARAMS, reg_gp]`` - dict of best parameter values keyed by
        name, and the raw result object returned by the minimiser.

    Raises
    ------
    ValueError
        If ``opt_method`` is not one of the supported names.  Previously an
        unknown name fell through every branch and failed later with an
        UnboundLocalError on ``reg_gp``.
    """
    supported_methods = ('gbrt_minimize', 'forest_minimize', 'gp_minimize')
    if opt_method not in supported_methods:
        raise ValueError('Unsupported opt_method %r; expected one of %s'
                         % (opt_method, ', '.join(supported_methods)))

    @use_named_args(space)
    def objective(**params):
        model.set_params(**params)
        return -np.mean(cross_val_score(model,
                                        X, Y,
                                        cv=cv_search,
                                        scoring=scoring))

    # Stoppers shared by every optimiser: patience on the running minimum
    # and an overall deadline.
    callbacks = [RepeatedMinStopper(n_best = n_best), DeadlineStopper(total_time = total_time)]

    if opt_method == 'gbrt_minimize':
        minimizer, acq_func = gbrt_minimize, 'EI'
        # Only the GBRT search additionally stops once the 5 best values
        # lie within 0.01 of each other; it stays first in the list as in
        # the original configuration.
        callbacks.insert(0, DeltaYStopper(delta = 0.01, n_best = 5))
    elif opt_method == 'forest_minimize':
        minimizer, acq_func = forest_minimize, 'EI'
    else:  # 'gp_minimize'
        minimizer, acq_func = gp_minimize, 'gp_hedge'

    HPO_PARAMS = {'n_calls': 1000,
                  'n_random_starts': 20,
                  'acq_func': acq_func,}

    reg_gp = minimizer(objective,
                       space,
                       n_jobs = -1,
                       verbose = verbose,
                       callback = callbacks,
                       **HPO_PARAMS,
                       random_state = RANDOM_STATE)

    # reg_gp.x holds the best point, ordered like ``space``.
    TUNED_PARAMS = {}
    for dimension, best_value in zip(space, reg_gp.x):
        key = dimension.name.split('__')[1] if multi else dimension.name
        TUNED_PARAMS[key] = best_value

    return [TUNED_PARAMS,reg_gp]

class RepeatedMinStopper(EarlyStopper):
    """Stop the optimization when there is no improvement in the minimum.

    The stopper remembers the lowest ``result.fun`` it has seen and counts
    consecutive calls in which ``result.fun`` is exactly equal to that
    value.  Once the counter reaches ``n_best`` the criterion returns True.

    Parameters
    ----------
    n_best : int, default 50
        Number of consecutive non-improving calls after which to stop.
    """
    def __init__(self, n_best=50):
        # Zero-argument super() resolves to the parent of *this* class; the
        # previous ``super(EarlyStopper, self)`` skipped EarlyStopper itself.
        super().__init__()
        self.n_best = n_best
        self.count = 0
        # Builtin ``float`` rather than ``np.float``: the alias was removed
        # in NumPy 1.24 and raises AttributeError there.
        self.minimum = np.finfo(float).max

    def _criterion(self, result):
        """Return True when ``result.fun`` has repeated ``n_best`` times in a row."""
        if result.fun < self.minimum:
            # New best value: remember it and restart the patience counter.
            self.minimum = result.fun
            self.count = 0
        elif result.fun > self.minimum:
            # NOTE(review): presumably unreachable when ``result.fun`` is the
            # optimiser's running minimum - confirm; kept so behaviour is
            # unchanged if a worse value is ever reported.
            self.count = 0
        else:
            self.count += 1

        return self.count >= self.n_best

In [3]:
import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
from time import time
import datetime
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
warnings.simplefilter('ignore')
sns.set()
%matplotlib inline

## Data loading 

In [4]:
# Directory holding the competition CSV files (relative to the notebook).
folder_path = 'data/'

# Transaction tables first, then the optional identity tables and the
# submission template.
train_transaction = pd.read_csv(f'{folder_path}train_transaction.csv')
test_transaction = pd.read_csv(f'{folder_path}test_transaction.csv')
train_identity = pd.read_csv(f'{folder_path}train_identity.csv')
test_identity = pd.read_csv(f'{folder_path}test_identity.csv')
sub = pd.read_csv(f'{folder_path}sample_submission.csv')

# Left-join identity onto transactions so every transaction row is kept,
# whether or not it has a matching identity record.
train = train_transaction.merge(train_identity, on='TransactionID', how='left')
test = test_transaction.merge(test_identity, on='TransactionID', how='left')

## Feature engineering

In [5]:
# Whitelist of raw input columns kept for modelling.  Every other column of
# the merged frames (apart from isFraud, TransactionID and TransactionDT,
# which are handled separately) is dropped in the following cells.
useful_features = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1',
                   'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13',
                   'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3',
                   'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V17',
                   'V19', 'V20', 'V29', 'V30', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V44', 'V45', 'V46', 'V47', 'V48',
                   'V49', 'V51', 'V52', 'V53', 'V54', 'V56', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V69', 'V70', 'V71',
                   'V72', 'V73', 'V74', 'V75', 'V76', 'V78', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V87', 'V90', 'V91', 'V92',
                   'V93', 'V94', 'V95', 'V96', 'V97', 'V99', 'V100', 'V126', 'V127', 'V128', 'V130', 'V131', 'V138', 'V139', 'V140',
                   'V143', 'V145', 'V146', 'V147', 'V149', 'V150', 'V151', 'V152', 'V154', 'V156', 'V158', 'V159', 'V160', 'V161',
                   'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V169', 'V170', 'V171', 'V172', 'V173', 'V175', 'V176', 'V177',
                   'V178', 'V180', 'V182', 'V184', 'V187', 'V188', 'V189', 'V195', 'V197', 'V200', 'V201', 'V202', 'V203', 'V204',
                   'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V219', 'V220',
                   'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V231', 'V233', 'V234', 'V238', 'V239',
                   'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V249', 'V251', 'V253', 'V256', 'V257', 'V258', 'V259', 'V261',
                   'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276',
                   'V277', 'V278', 'V279', 'V280', 'V282', 'V283', 'V285', 'V287', 'V288', 'V289', 'V291', 'V292', 'V294', 'V303',
                   'V304', 'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314', 'V315', 'V317', 'V322', 'V323', 'V324', 'V326',
                   'V329', 'V331', 'V332', 'V333', 'V335', 'V336', 'V338', 'id_01', 'id_02', 'id_03', 'id_05', 'id_06', 'id_09',
                   'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_17', 'id_19', 'id_20', 'id_30', 'id_31', 'id_32', 'id_33',
                   'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']

In [6]:
# Start from every training column that is not on the feature whitelist ...
cols_to_drop = [name for name in train.columns if name not in useful_features]

# ... then take the target, the row id and the timestamp back out of the
# drop list: they are not model features but are still needed later.
for protected in ('isFraud', 'TransactionID', 'TransactionDT'):
    cols_to_drop.remove(protected)

In [7]:
# Report how many columns are about to go, then remove them from both frames.
print('{} features are going to be dropped for being useless'.format(len(cols_to_drop)))

train = train.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)

152 features are going to be dropped for being useless


In [8]:
# New feature - decimal part of the transaction amount, scaled by 1000 and
# truncated to an integer (so it is subject to floating-point truncation).
for frame in (train, test):
    frame['TransactionAmt_decimal'] = ((frame['TransactionAmt'] - frame['TransactionAmt'].astype(int)) * 1000).astype(int)

# Count encoding for card1 feature.
# Explained in this kernel: https://www.kaggle.com/nroman/eda-for-cis-fraud-detection
# The frequency table over train+test is built once and reused for both
# frames instead of being recomputed for each of them.
card1_counts = pd.concat([train['card1'], test['card1']], ignore_index=True).value_counts(dropna=False)
train['card1_count_full'] = train['card1'].map(card1_counts)
test['card1_count_full'] = test['card1'].map(card1_counts)

# https://www.kaggle.com/fchmiel/day-and-time-powerful-predictive-feature
# TransactionDT is treated as a count of seconds: 86400 per day, 3600 per hour.
for frame in (train, test):
    frame['Transaction_day_of_week'] = np.floor((frame['TransactionDT'] / (3600 * 24) - 1) % 7)
    frame['Transaction_hour'] = np.floor(frame['TransactionDT'] / 3600) % 24

# Some arbitrary features interaction
for feature in ['id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2', 
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']:

    f1, f2 = feature.split('__')
    # Concatenate the string forms of both parents; missing values become
    # the literal 'nan', so the result is always a plain string.
    train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
    test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)

    # Fit the encoder on the union of train and test values so that both
    # frames share a single integer code table.  The columns are already
    # strings here, so no further astype(str) is needed.
    train_values = list(train[feature].values)
    test_values = list(test[feature].values)
    le = LabelEncoder()
    le.fit(train_values + test_values)
    train[feature] = le.transform(train_values)
    test[feature] = le.transform(test_values)

for feature in ['id_34', 'id_36']:
    if feature in useful_features:
        # Count encoded for both train and test (one shared frequency table)
        combined_counts = pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False)
        train[feature + '_count_full'] = train[feature].map(combined_counts)
        test[feature + '_count_full'] = test[feature].map(combined_counts)

for feature in ['id_01', 'id_31', 'id_33', 'id_35', 'id_36']:
    if feature in useful_features:
        # Count encoded separately for train and test
        train[feature + '_count_dist'] = train[feature].map(train[feature].value_counts(dropna=False))
        test[feature + '_count_dist'] = test[feature].map(test[feature].value_counts(dropna=False))

In [9]:
# Integer-encode every remaining string (object) column.
for col in tqdm_notebook(train.columns):
    if train[col].dtype != 'object':
        continue

    # Stringify once per frame; NaN becomes the literal 'nan' and therefore
    # gets its own code.
    train_values = list(train[col].astype(str).values)
    test_values = list(test[col].astype(str).values)

    # One encoder per column, fitted on train and test together so the
    # codes agree across both frames.
    le = LabelEncoder().fit(train_values + test_values)
    train[col] = le.transform(train_values)
    test[col] = le.transform(test_values)

HBox(children=(IntProgress(value=0, max=301), HTML(value='')))




In [10]:
# Order the training rows chronologically once and derive both the feature
# matrix and the target from that single sorted frame (the original sorted
# the full frame twice).  The temporary name is deleted so it does not keep
# the sorted copy alive.
_train_sorted = train.sort_values('TransactionDT')
X = _train_sorted.drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = _train_sorted['isFraud']
del _train_sorted

# Stable sort for the test frame: rows sharing a TransactionDT keep their
# original relative order.  The default quicksort gives no such guarantee,
# and the submission frame `sub` is never re-sorted, so needlessly
# permuting tied rows must be avoided.
test = test.sort_values('TransactionDT', kind='mergesort').drop(['TransactionDT', 'TransactionID'], axis=1)

In [12]:
# X and y have been derived from the merged training frame, so release the
# name and run a garbage-collection pass to reclaim memory.
del train
gc.collect()

312

In [13]:
# Display the (rows, columns) shapes of the training and test feature frames;
# the two column counts are expected to match.
X.shape, test.shape

((590540, 298), (506691, 298))

In [42]:
# Replace every missing value with the sentinel -1985, modifying both
# feature frames in place.
for frame in (X, test):
    frame.fillna(-1985, inplace=True)

In [50]:
def calc_meta_feature_class(X_t, X_TEST_t, y_t):
    """Tune an XGBoost classifier, cross-validate it and predict the test set.

    1. Hyper-parameters are searched with ``get_params_SKopt``
       (``forest_minimize``, ROC AUC scoring, 5-fold stratified CV).
    2. A classifier with the tuned parameters and up to 1000 estimators is
       refitted in each of 5 stratified folds with early stopping on the
       fold's validation AUC; accuracy, AUC and F1 are reported per run.
    3. The positive-class probabilities predicted for ``X_TEST_t`` are
       averaged over the folds.

    Parameters
    ----------
    X_t : pandas.DataFrame
        Training features.
    X_TEST_t : pandas.DataFrame
        Test features with the same columns as ``X_t``.
    y_t : pandas.Series
        Binary target aligned with ``X_t``; both labels 0 and 1 must occur.

    Returns
    -------
    pandas.Series
        Averaged positive-class probability per test row, named
        ``'prediction'`` and indexed like ``X_TEST_t`` so that assigning it
        to another frame aligns on the original row labels instead of on
        position.
    """
    n_fold = 5

    # NOTE(review): 'metric' is not a documented XGBClassifier parameter (the
    # evaluation metric is passed as eval_metric in fit() below); it is left
    # unchanged here - confirm whether it is intended.
    STATIC_PARAMS = {'metric': 'auc',
                    'n_estimators': 100,
                    'objective' : 'binary:logistic',
                    'random_state' : RANDOM_STATE,
                    'n_jobs': -1,
                }

    space_SKopt = [Integer(2, 100, name='max_depth'),
                   Integer(2, 50, name='min_child_weight'),
                   Real(0.001, .25, name='learning_rate'),            
                   Real(0.001, 1, name='subsample'),
                   Real(0.001, 5, name='reg_lambda'),
                   Real(0.001, 5, name='reg_alpha'),
                   Real(0.001, 1, name='colsample_bylevel'),
                   Real(0.001, 1, name='colsample_bytree'),
                   Real(0.001, 1, name='colsample_bynode'),
                  ]

    # Ratio of negatives to positives, used to re-weight the minority class.
    class_counts = y_t.value_counts()
    pos_weight = class_counts[0] / class_counts[1]

    # ---- hyper-parameter search -------------------------------------------
    start_time = time()
    cv_tune = StratifiedKFold(n_splits=n_fold, random_state=RANDOM_STATE, shuffle=True)

    [TUNED_PARAMS,reg_gp] = get_params_SKopt(XGBClassifier(**STATIC_PARAMS,
                                                           scale_pos_weight = pos_weight),
                                             X_t, y_t,
                                             space_SKopt,
                                             cv_tune,
                                             opt_method = 'forest_minimize',
                                             verbose = True,
                                             multi = False,
                                             scoring = 'roc_auc',
                                             n_best = 10,
                                             total_time = 7200)

    print('\nTime for tuning: {0:.2f} minutes'.format((time() - start_time)/60))
    NEW_PARAMS = {**STATIC_PARAMS, **TUNED_PARAMS}
    best_model = XGBClassifier(**NEW_PARAMS)

    # reg_gp.fun is the minimised objective, i.e. the negated mean CV score.
    print ('Best score', reg_gp.fun)
    print ('Best iterations', len(reg_gp.x_iters))

    # Large tree budget for the final fits; early stopping trims it per fold.
    best_model.n_estimators = 1000
    best_model.scale_pos_weight = pos_weight
    print(best_model)

    # ---- cross-validated refit and test prediction ------------------------
    cv = StratifiedKFold(n_splits=n_fold, random_state=RANDOM_STATE, shuffle=True)

    acc, auc, F1 = [], [], []
    oof = np.zeros(len(X_t))                # out-of-fold probabilities
    test_proba = np.zeros(len(X_TEST_t))    # running sum over folds

    for fold_n, (train_index, valid_index) in enumerate(cv.split(X_t, y_t)):
        print('\nFold', fold_n, 'started at', ctime())

        X_train = X_t.iloc[train_index,:]
        X_valid = X_t.iloc[valid_index,:]

        y_train = y_t.iloc[train_index]
        y_valid = y_t.iloc[valid_index]      

        best_model.fit(X_train, y_train, 
                        eval_metric=['auc'],
                        eval_set = [(X_valid, y_valid)],
                        verbose = False,
                        early_stopping_rounds = 50,)

        # best_iteration is a 0-based round index, so passing it as
        # ntree_limit dropped the best round, and a value of 0 would have
        # meant "use all trees".  best_ntree_limit is the matching count.
        n_trees = best_model.best_ntree_limit

        valid_proba = best_model.predict_proba(X_valid, ntree_limit = n_trees)[:, 1]
        y_pred = best_model.predict(X_valid, ntree_limit = n_trees)

        acc.append(metrics.accuracy_score(y_valid, y_pred))
        # AUC is a ranking metric: score it on probabilities, not on the
        # thresholded class labels.
        auc.append(metrics.roc_auc_score(y_valid, valid_proba))
        F1.append(metrics.f1_score(y_valid, y_pred))

        print('Best score', best_model.best_score) 
        print('Best iteration', best_model.best_iteration)  

        test_proba += best_model.predict_proba(X_TEST_t, ntree_limit = n_trees)[:, 1]

        oof[valid_index] = valid_proba

    test_proba /= n_fold

    print('='*45)           
    print('CV mean accuracy: {0:.4f}, std: {1:.4f}.'.format(np.mean(acc), np.std(acc)))
    print('CV mean AUC: {0:.4f}, std: {1:.4f}.'.format(np.mean(auc), np.std(auc)))
    print('CV mean F1: {0:.4f}, std: {1:.4f}.'.format(np.mean(F1), np.std(F1)))
    print('OOF AUC: {0:.4f}.'.format(metrics.roc_auc_score(y_t, oof)))

    # Keep the test frame's own index: the previous RangeIndex silently
    # re-mapped predictions by position whenever the test rows had been
    # reordered before the call.
    return pd.Series(test_proba, index=X_TEST_t.index, name='prediction')

In [51]:
# Run tuning + cross-validation, store the averaged test probabilities in
# the submission frame, write it to disk and preview the first rows.
test_predictions = calc_meta_feature_class(X, test, y)
sub['isFraud'] = test_predictions
sub.to_csv('xgb_hyperopt.csv', index=False)

print(sub.head())

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 258.4004
Function value obtained: -0.9518
Current minimum: -0.9518
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 497.3215
Function value obtained: -0.9539
Current minimum: -0.9539
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 455.6908
Function value obtained: -0.9615
Current minimum: -0.9615
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 141.1560
Function value obtained: -0.9041
Current minimum: -0.9615
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 146.8391
Function value obtained: -0.9161
Current minimum: -0.9615
Iteration No: 6