In [None]:
import numpy as np 
import pandas as pd 

import lightgbm as lgb
import catboost

from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sp

import warnings, random, gc, os,datetime, shap
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def set_seed(seed=0):
    """Seed the global RNGs used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` for any child process started afterwards.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # does not change hashing in the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Single seed reused for the global RNGs, for LightGBM ('seed' below) and as
# the default of get_adversarial_train's train/valid split.
SEED = 4242
set_seed(SEED)

# Parameters passed to every lgb.train call in this notebook.
lgb_params = {
    'objective':'cross_entropy',
    'boosting_type':'gbdt',
    'metric':'auc',
    'n_jobs':-1,
    'seed': SEED,
    # NOTE(review): is_unbalance is documented for the binary / multiclassova
    # objectives -- confirm it has any effect together with 'cross_entropy'.
    'is_unbalance': True,
    
    'learning_rate':0.01,

    # Tree shape.
    'num_leaves': 60,
    'max_depth': 7,
    # Row subsampling (re-drawn every iteration) and column subsampling.
    'subsample_freq':1,
    'subsample':0.7,
    'colsample_bytree': 0.7
    } 

# Load data

In [None]:
def get_df(df_path, dtypes_path):
    """Read one competition CSV using the column dtypes stored next to it.

    Both files are looked up under ``../input/ieee-cis-data/``.

    Args:
        df_path: File name of the data CSV.
        dtypes_path: File name of the CSV listing one (column, dtype) pair per
            row. Its header row is expected to read ``TransactionID,int32``
            (presumably the file was saved without a header -- confirm), which
            is why those two labels are renamed and the ``TransactionID``
            dtype is re-added by hand.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    base_dir = '../input/ieee-cis-data/'
    dtype_table = pd.read_csv(base_dir + dtypes_path).rename(
        columns={'TransactionID': 'col', 'int32': 'type'})

    # Column name -> dtype string; the header row itself described TransactionID.
    column_types = dict(zip(dtype_table['col'], dtype_table['type']))
    column_types['TransactionID'] = 'int32'

    frame = pd.read_csv(base_dir + df_path, dtype=column_types)
    del dtype_table, column_types
    gc.collect()
    return frame

def load_data():
    """Load the train frame, the test frame and the sample submission.

    Returns:
        Tuple ``(train, test, sub)`` of DataFrames.
    """
    sources = (('train.csv', 'train_dtypes.csv'), ('test.csv', 'test_dtypes.csv'))
    train, test = (get_df(data_file, dtypes_file) for data_file, dtypes_file in sources)
    sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
    return train, test, sub

def get_x_y(train, test):
    """Split the raw frames into model inputs ordered by ``TransactionDT``.

    Args:
        train: Training frame containing ``isFraud``, ``TransactionID`` and
            ``TransactionDT``.
        test: Test frame containing ``TransactionID`` and ``TransactionDT``.

    Returns:
        Tuple ``(X, y_train, X_test, test_keys)`` where ``X`` / ``y_train`` are
        the time-sorted training features and labels, ``X_test`` the
        time-sorted test features, and ``test_keys`` the ``TransactionDT`` /
        ``TransactionID`` columns of ``test`` in their ORIGINAL (unsorted)
        order -- callers must re-sort or align by index before attaching
        predictions made on ``X_test``.
    """
    # Sort once and derive both X and y from the same frame, so the two are
    # aligned by construction and the large frame is not sorted twice.
    train_sorted = train.sort_values('TransactionDT')
    X = train_sorted.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1)
    y_train = train_sorted['isFraud']
    X_test = test.sort_values('TransactionDT').drop(['TransactionID', 'TransactionDT'], axis=1)

    # Only releases the local references; the caller's objects are untouched.
    del train, train_sorted
    gc.collect()

    test = test[["TransactionDT", 'TransactionID']]
    return X, y_train, X_test, test

def reduce_memory_usage(df, cols=None):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Integer columns are tried against int8/int16/int32/int64 and float columns
    against float16/float32 (falling back to float64). A dtype is accepted
    only when the column's min and max lie strictly inside its range.

    Args:
        df: DataFrame modified in place.
        cols: Optional iterable of column names to process; defaults to every
            column of ``df``.

    Returns:
        None. Prints the final memory footprint and the relative reduction.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    megabyte = 1024 ** 2
    start_mem = df.memory_usage().sum() / megabyte
    targets = df.columns if cols is None else cols

    for col in tqdm(targets):
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min, c_max = df[col].min(), df[col].max()

        if str(col_type)[:3] == 'int':
            # First integer width whose open range contains [c_min, c_max];
            # the column is left untouched when none matches.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: float16 keeps only ~3 significant decimal digits, so this
            # downcast is lossy for most real-valued features.
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / megabyte
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (
            start_mem - end_mem) / start_mem))

# Adversarial validation

In [None]:
def get_adversarial_train(X, X_test, seed=SEED):
    """Train a train-vs-test classifier and plot its most important features.

    Rows of ``X`` are labelled 0 and rows of ``X_test`` 1; a LightGBM model is
    fitted on a random 80/20 split of the stacked data. Features with high
    importance are the ones that best separate train from test.

    Unlike the previous version this does not add an ``is_test`` column to the
    caller's frames, does not read the undefined name ``X_train``, and does
    not touch ``X`` after releasing it.

    Args:
        X: Training features.
        X_test: Test features with the same columns as ``X``.
        seed: Random state of the train/validation split.

    Returns:
        Tuple ``(clf, feature_importance)``: the fitted booster and a
        DataFrame with ``feature`` and ``importance`` columns.

    Raises:
        AssertionError: If the two frames do not have identical columns.
    """
    assert list(X.columns) == list(X_test.columns), 'X and X_test must have identical columns'
    n_train, n_test = len(X), len(X_test)

    print('Concat')
    X_split = pd.concat([X, X_test])
    # A leftover indicator column from an earlier run would leak the label.
    if 'is_test' in X_split.columns:
        X_split = X_split.drop(columns=['is_test'])
    y_split = np.concatenate([np.zeros(n_train, dtype=np.int8),
                              np.ones(n_test, dtype=np.int8)])
    # Capture the names now: the frames are released before they are needed.
    feature_names = list(X_split.columns)
    del X, X_test
    gc.collect()

    print('Split')
    dataX_train, dataX_valid, datay_train, datay_valid = train_test_split(
        X_split, y_split, test_size=0.2, random_state=seed)
    del X_split, y_split
    gc.collect()

    dtrain = lgb.Dataset(data=dataX_train, label=datay_train, free_raw_data=False)
    del dataX_train, datay_train
    gc.collect()

    dval = lgb.Dataset(data=dataX_valid, label=datay_valid, free_raw_data=False)
    del dataX_valid, datay_valid
    gc.collect()

    print('Train')
    clf = lgb.train(lgb_params, dtrain,
                    num_boost_round=1000,
                    verbose_eval=200,
                    early_stopping_rounds=100,
                    valid_sets=[dtrain, dval])

    feature_importance = pd.DataFrame({
        "feature": feature_names,
        "importance": clf.feature_importance(),
    })

    # Keep the 50 most important features for the plot.
    cols = (feature_importance[["feature", "importance"]]
            .groupby("feature").mean()
            .sort_values("importance", ascending=False)[:50].index)

    best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
    plt.figure(figsize=(16, 12))
    sns.barplot(x="importance",
                y="feature",
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LGB Features (avg over folds)')

    return clf, feature_importance
    
def adv(X, X_test):
    """Run adversarial validation, then remove helper columns from the inputs.

    Args:
        X: Training features.
        X_test: Test features.

    Returns:
        None. ``X`` and ``X_test`` are cleaned in place.
    """
    # The return value is deliberately not unpacked, so this works whether
    # get_adversarial_train returns a tuple or nothing.
    get_adversarial_train(X, X_test)
    gc.collect()

    # errors='ignore': 'is_test' / 'target' exist only if something upstream
    # added them; dropping a missing label must not raise. Keyword arguments
    # are used because pandas 2 rejects a positional axis.
    helper_cols = ['is_test', 'target']
    X.drop(columns=helper_cols, errors='ignore', inplace=True)
    X_test.drop(columns=helper_cols, errors='ignore', inplace=True)

# Evaluation (hold-out and time-series CV)

In [None]:
def eval_ho(X, y, X_test=None, n_train=550000, n_valid=40540):
    """Hold-out evaluation: fit on the first rows, validate on the last rows.

    Args:
        X: Training features, assumed to be in chronological row order.
        y: Labels indexable by ``X``'s index labels (e.g. a Series sharing
            ``X``'s index).
        X_test: Optional test features to predict with the fitted model.
        n_train: Number of leading rows used for training.
        n_valid: Number of trailing rows used for validation.

    Returns:
        ``(clf, pred, n_iter, X, y)`` when ``X_test`` is given, otherwise
        ``(clf, n_iter, X, y)``. ``X`` / ``y`` are the train and validation
        parts concatenated again; ``n_iter`` is ``clf.current_iteration()``.
    """
    X_train = X.head(n_train)
    Y_train = y[X_train.index]
    X_valid = X.tail(n_valid)
    Y_valid = y[X_valid.index]
    columns = list(X.columns)

    # Was `del X, y_train`: y_train is not a local of this function, so the
    # statement raised UnboundLocalError on every call.
    del X, y
    gc.collect()
    dtrain = lgb.Dataset(data=X_train, label=Y_train)
    dval = lgb.Dataset(data=X_valid, label=Y_valid)

    clf = lgb.train(lgb_params, dtrain,
                    num_boost_round=1000,
                    verbose_eval=200,
                    early_stopping_rounds=100,
                    valid_sets=[dtrain, dval])
    del dval, dtrain
    gc.collect()

    oof = clf.predict(X_valid)
    train_pred = clf.predict(X_train)

    print(f'TRAIN AUC: {roc_auc_score(Y_train, train_pred)}\t TRAIN LOG: {log_loss(Y_train,  train_pred)}')
    print(f'VALID AUC: {roc_auc_score(Y_valid, oof)}\t VALID LOG: {log_loss(Y_valid,  oof)}')
    del oof, train_pred
    gc.collect()

    feature_importance = pd.DataFrame()
    feature_importance["feature"] = columns
    feature_importance["importance"] = clf.feature_importance()

    # Keep the 50 most important features for the plot.
    cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values("importance", ascending=False)[:50].index

    best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
    plt.figure(figsize=(16, 12))
    sns.barplot(x="importance",
                y="feature",
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LGB Features (avg over folds)')
    del feature_importance, best_features, cols
    gc.collect()

    # Rebuild the (train + valid) data for the caller.
    X = pd.concat([X_train, X_valid])
    del X_train, X_valid
    gc.collect()
    y = np.concatenate([Y_train, Y_valid])
    del Y_train, Y_valid
    gc.collect()

    # NOTE(review): current_iteration() counts the rounds trained after the
    # best one as well; clf.best_iteration may be what a retrain should use.
    if X_test is not None:
        pred = clf.predict(X_test)
        return clf, pred, clf.current_iteration(), X, y
    else:
        return clf, clf.current_iteration(), X, y
    
def eval_ts(X, y):
    """Evaluate with a 5-fold expanding-window ``TimeSeriesSplit``.

    Args:
        X: Training features in chronological row order.
        y: Labels in the same row order as ``X`` (Series or array).

    Returns:
        Tuple ``(clf, n_iter)``: the booster of the LAST fold and its
        ``current_iteration()``. The importance plot also reflects only that
        last fold.
    """
    ts = TimeSeriesSplit(n_splits=5)
    # The split yields positions. X is indexed with .iloc, so y must be
    # indexed positionally too; `y[tr]` on a Series is a label lookup and
    # silently misaligns whenever the index is not 0..n-1.
    y_values = np.asarray(y)
    tr_aucs = []
    tr_logs = []
    val_aucs = []
    val_logs = []
    for i, (tr, val) in enumerate(ts.split(X, y_values)):
        X_tr, X_val = X.iloc[tr], X.iloc[val]
        y_tr, y_val = y_values[tr], y_values[val]

        dtr = lgb.Dataset(data=X_tr, label=y_tr)
        dvl = lgb.Dataset(data=X_val, label=y_val)

        clf = lgb.train(lgb_params, dtr,
                        num_boost_round=3000,
                        verbose_eval=False,
                        early_stopping_rounds=200,
                        valid_sets=[dtr, dvl])

        tr_pred = clf.predict(X_tr)
        vl_pred = clf.predict(X_val)

        score_auc_tr = roc_auc_score(y_tr, tr_pred)
        score_log_tr = log_loss(y_tr, tr_pred)
        score_auc_val = roc_auc_score(y_val, vl_pred)
        score_log_val = log_loss(y_val, vl_pred)

        tr_aucs.append(score_auc_tr)
        tr_logs.append(score_log_tr)
        val_aucs.append(score_auc_val)
        val_logs.append(score_log_val)

        print('FOLD {} TRAIN auc {:.7f} log {:.7f}\t VALID auc {:.7f} log {:.7f}'.
              format(i, score_auc_tr, score_log_tr, score_auc_val, score_log_val))

    feature_importance = pd.DataFrame()
    feature_importance["feature"] = X.columns
    feature_importance["importance"] = clf.feature_importance()

    # Keep the 50 most important features for the plot.
    cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values("importance", ascending=False)[:50].index

    best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
    plt.figure(figsize=(16, 12))
    sns.barplot(x="importance",
                y="feature",
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LGB Features (avg over folds)')
    print('Train AUC: mean', np.mean(tr_aucs), ', std:', np.std(tr_aucs))
    print('Valid AUC: mean', np.mean(val_aucs), ', std:', np.std(val_aucs))
    print('Train LOG: mean', np.mean(tr_logs), ', std:', np.std(tr_logs))
    print('Valid LOG: mean', np.mean(val_logs), ', std:', np.std(val_logs))
    del dtr, dvl
    gc.collect()
    return clf, clf.current_iteration()

# Feature engineering

In [None]:
def fe_cat_other(train, test, obj_cols):
    """Collapse category values not shared by train and test into ``'Other'``.

    For every column in ``obj_cols`` any value seen in only one of the two
    frames is replaced by ``'Other'`` in both. Afterwards ``card6`` and
    ``id_30`` get a few fixed replacements.

    Args:
        train: Training frame, modified in place.
        test: Test frame, modified in place.
        obj_cols: Names of the categorical columns to harmonise; may be empty.

    Returns:
        None.
    """
    print('fe_cat_other')
    for c in obj_cols:
        tr_u = set(train[c].dropna().unique())
        te_u = set(test[c].dropna().unique())
        # Symmetric difference = values present in exactly one of the frames.
        diff = {u: 'Other' for u in tr_u ^ te_u}
        if diff:
            train[c] = train[c].replace(diff)
            test[c] = test[c].replace(diff)
    # One collection after the loop; the previous `del diff` here raised
    # UnboundLocalError whenever obj_cols was empty.
    gc.collect()

    train['card6'] = train['card6'].replace({'charge card': 'Other'})
    test['card6'] = test['card6'].replace({'charge card': 'Other'})

    train['id_30'] = train['id_30'].replace({'func': 'Other', 'other': 'Other'})
    test['id_30'] = test['id_30'].replace({'func': 'Other', 'other': 'Other'})

def fe_mail(train, test):
    """Add e-mail domain features to both frames in place.

    For the purchaser (``P_``) and recipient (``R_``) domain this adds a
    boolean ``*_isproton`` flag and the columns ``*_emaildomain_1..3`` holding
    the dot-separated parts of the domain (missing parts are NaN).

    Args:
        train: Training frame with ``P_emaildomain`` and ``R_emaildomain``.
        test: Test frame with the same two columns.

    Returns:
        None.
    """
    print('fe_mail')
    prefixes = ('P', 'R')

    for df in (train, test):
        for prefix in prefixes:
            df[prefix + '_isproton'] = (df[prefix + '_emaildomain'] == 'protonmail.com')

    for df in (train, test):
        for prefix in prefixes:
            source = prefix + '_emaildomain'
            # n=2 caps the split at three parts and reindex pads up to three,
            # so exactly three columns exist whatever domains are present
            # (the bare 3-column assignment failed on any other part count).
            parts = (df[source].str.split('.', n=2, expand=True)
                     .reindex(columns=range(3)))
            for position in range(3):
                df['{}_{}'.format(source, position + 1)] = parts[position].astype(object)

def fe_id_30(train, test):
    """Reduce ``id_30`` (operating system string) to its first word, in place.

    Values seen in train map to the text before the first space. Values that
    only occur in test, and the placeholders ``'func'`` / ``'other'``, map to
    ``'Other'``. Missing values stay missing.

    Args:
        train: Training frame with an ``id_30`` column.
        test: Test frame with an ``id_30`` column.

    Returns:
        None.
    """
    print('fe_id_30')
    # Computed once; the set gives O(1) membership checks below.
    train_values = set(train['id_30'].dropna().unique())
    os_map = {value: value.split(' ')[0] for value in train_values}

    for value in test['id_30'].dropna().unique():
        if value not in train_values:
            os_map[value] = 'Other'

    # Applied unconditionally: 'func' used to be remapped only when test
    # happened to contain a value unseen in train.
    os_map['func'] = 'Other'
    os_map['other'] = 'Other'

    train['id_30'] = train['id_30'].map(os_map)
    test['id_30'] = test['id_30'].map(os_map)
    del os_map
    gc.collect()

def fe_date(train, test, start_date='2017-12-01'):
    """Derive calendar features from the ``TransactionDT`` second offset.

    Adds ``Date``, ``Month``, ``Weekday`` (Monday = 0), ``Hour``, ``Day`` and
    ``DayOfYear`` to both frames in place.

    Args:
        train: Training frame with a ``TransactionDT`` column (seconds).
        test: Test frame with a ``TransactionDT`` column (seconds).
        start_date: ``YYYY-MM-DD`` date that offset 0 corresponds to.
            NOTE(review): the default is the notebook's original assumption;
            the true origin of the data is not established here.

    Returns:
        None.
    """
    print('fe_date')
    startdate = datetime.datetime.strptime(start_date, "%Y-%m-%d")

    def extract_date(df):
        # Vectorised replacement for a per-row apply(timedelta(...)).
        df["Date"] = pd.to_timedelta(df['TransactionDT'], unit='s') + pd.Timestamp(startdate)
        df['Month'] = df['Date'].dt.month
        df['Weekday'] = df['Date'].dt.dayofweek
        df['Hour'] = df['Date'].dt.hour
        df['Day'] = df['Date'].dt.day
        df['DayOfYear'] = df['Date'].dt.dayofyear

    extract_date(train)
    extract_date(test)

def fe_le(train, test, obj_cols):
    """Label-encode categorical columns in place while keeping NaNs as NaN.

    Each column is stringified, encoded with one ``LabelEncoder`` fitted on
    the train and test values together, and the originally missing cells are
    then blanked out again.

    Args:
        train: Training frame, modified in place.
        test: Test frame, modified in place.
        obj_cols: Names of the columns to encode.

    Returns:
        None.
    """
    print('fe_le')
    for col in obj_cols:
        # Remember the missing cells: astype(str) turns them into 'nan'.
        train_missing = train[col].isnull()
        test_missing = test[col].isnull()

        train_text = list(train[col].astype(str).values)
        test_text = list(test[col].astype(str).values)

        encoder = LabelEncoder()
        encoder.fit(train_text + test_text)
        train[col] = encoder.transform(train_text)
        test[col] = encoder.transform(test_text)

        # Restore NaN where the raw value was missing.
        train[col] = train[[col]].where(~train_missing)
        test[col] = test[[col]].where(~test_missing)

        del train_missing, test_missing, encoder, train_text, test_text
        gc.collect()
    
        
def fe_smooth(train, test, smooth_cols=None):
    """Blank out values whose train/test frequency balance is skewed.

    For every column in ``smooth_cols`` a value is kept only if train supplies
    strictly between 20% and 80% of its combined (train + test) occurrences.
    All other values -- including those present in only one frame -- become
    NaN in both frames. Both frames are then downcast with
    ``reduce_memory_usage``.

    Args:
        train: Training frame, modified in place.
        test: Test frame, modified in place.
        smooth_cols: Columns to process. Defaults to every train column
            (except ``isFraud``) whose number of distinct values differs
            between train and test.

    Returns:
        None.
    """
    print('fe_smooth')
    if smooth_cols is None:
        smooth_cols = []
        cols = list(train.columns)
        cols.remove('isFraud')
        for c in cols:
            if train[c].nunique() != test[c].nunique():
                smooth_cols.append(c)

    def distr_smoothing(X, X_test, col):
        # Occurrences per value (NaN excluded). The division aligns on the
        # value itself and yields NaN for values missing from either frame.
        train_counts = X[col].value_counts()
        test_counts = X_test[col].value_counts()
        train_share = train_counts / (train_counts + test_counts)
        balanced = train_share[(train_share > 0.2) & (train_share < 0.8)].index

        # Row-wise masking keeps each frame's own index. The earlier merge
        # produced a fresh 0..n-1 index and was assigned back by label, which
        # misaligns on any frame without a default RangeIndex.
        X[col] = X[col].where(X[col].isin(balanced))
        X_test[col] = X_test[col].where(X_test[col].isin(balanced))

        del train_counts, test_counts, train_share
        gc.collect()

    for c in tqdm(smooth_cols):
        distr_smoothing(train, test, c)

    reduce_memory_usage(train)
    reduce_memory_usage(test)
        
def do_fe_enc(train, test, cols):
    """Add frequency-encoded copies of the given columns, in place.

    For every column ``c`` a new column ``c + '_fq_enc'`` holds how often the
    row's value occurs in train and test combined. Missing values map to NaN.

    Args:
        train: Training frame, modified in place.
        test: Test frame, modified in place.
        cols: Names of the columns to encode; may be empty.

    Returns:
        None.
    """
    print('do_fe_enc')
    for c in cols:
        enc = pd.concat([train[c], test[c]]).value_counts().to_dict()
        train[c + '_fq_enc'] = train[c].map(enc)
        test[c + '_fq_enc'] = test[c].map(enc)
    # The loop temporaries are no longer `del`-ed by name: that raised
    # UnboundLocalError when `cols` was empty.
    gc.collect()
        
def fe_id_33(train, test):
    """Split the ``id_33`` "WIDTHxHEIGHT" string into two numeric columns.

    Missing ``id_33`` values are first filled with ``'0x0'`` (this fill is
    kept in the ``id_33`` column itself), then ``id_33_0`` and ``id_33_1``
    receive the integer parts before and after the ``'x'``, with 0 turned
    back into NaN.

    Args:
        train: Training frame with an ``id_33`` column, modified in place.
        test: Test frame with an ``id_33`` column, modified in place.

    Returns:
        None.
    """
    print('fe_id_33')
    for df in (train, test):
        df['id_33'] = df['id_33'].fillna('0x0')
        parts = df['id_33'].str.split('x')
        for position in (0, 1):
            # np.nan rather than np.NaN: the alias was removed in NumPy 2.0.
            df['id_33_{}'.format(position)] = (
                parts.str[position].astype(int).replace({0: np.nan}))
    
def fe_date_agg(train, test):
    """Add amount-by-calendar ratios and activity rates, then drop date parts.

    Requires the columns created by ``fe_date``. Adds, in place:

    * ``TransactionAmt_by_{Day,Weekday,Hour}``: the amount divided by the
      group's mean amount, averaged over the train mean and the test mean;
    * ``is_holiday``: 1 when ``Weekday`` is 5 or 6, else 0;
    * ``{Month,Day,Weekday}_rate``: number of rows in the row's group divided
      by the average of the train and test mean group sizes.

    Finally removes ``Date``, ``Month``, ``Day``, ``Hour``, ``Weekday`` and
    ``DayOfYear`` from both frames.

    Args:
        train: Training frame, modified in place.
        test: Test frame, modified in place.

    Returns:
        None.
    """
    print('fe_date_agg')
    for date in ['Day', 'Weekday', 'Hour']:
        # Combine the per-group means by group key. The old code added the two
        # row-wise transforms, which pairs train row i with test row i and
        # leaves NaN for rows beyond the shorter frame.
        group_mean = pd.concat(
            [train.groupby(date)['TransactionAmt'].mean(),
             test.groupby(date)['TransactionAmt'].mean()],
            axis=1, keys=['train', 'test']).mean(axis=1)
        train['TransactionAmt_by_' + date] = train['TransactionAmt'] / train[date].map(group_mean)
        test['TransactionAmt_by_' + date] = test['TransactionAmt'] / test[date].map(group_mean)

    train['is_holiday'] = train['Weekday'].isin([5, 6]).astype(int)
    test['is_holiday'] = test['Weekday'].isin([5, 6]).astype(int)

    for date in ['Month', 'Day', 'Weekday']:
        train_count = train.groupby(date)['TransactionAmt'].transform('count')
        test_count = test.groupby(date)['TransactionAmt'].transform('count')
        mean_ = (train_count.mean() + test_count.mean()) / 2
        train[date + '_rate'] = train_count / mean_
        test[date + '_rate'] = test_count / mean_

    # Keyword form: pandas 2 no longer accepts the axis positionally.
    date_parts = ['Date', 'Month', 'Day', 'Hour', 'Weekday', 'DayOfYear']
    train.drop(columns=date_parts, inplace=True)
    test.drop(columns=date_parts, inplace=True)
    

def fe_dec(train, test):
    """Add ``TransactionAmt_decimal``: significant decimal places (0-3).

    The amount is examined to three decimal places, so the result is the
    count of decimals left after stripping trailing zeros (1.5 -> 1,
    1.125 -> 3, 100.0 -> 0). Missing amounts give NaN.

    Args:
        train: Training frame with ``TransactionAmt``, modified in place.
        test: Test frame with ``TransactionAmt``, modified in place.

    Returns:
        None.
    """
    print('fe_dec')

    def change(hoge):
        if pd.isna(hoge):
            return np.nan
        # round(), not int(): 0.29 * 1000 is 289.99999999999994 in binary
        # floating point and would truncate to 289.
        thousandths = int(round(hoge * 1000))
        num = 3
        # Bounded by num, so an amount of 0 ends the loop (it never did before).
        while num > 0 and thousandths % 10 == 0:
            num -= 1
            thousandths //= 10
        return num

    train["TransactionAmt_decimal"] = train["TransactionAmt"].map(change)
    test["TransactionAmt_decimal"] = test["TransactionAmt"].map(change)
    
def tr_bin(train, test):
    """Replace ``TransactionAmt`` by its log-amount quintile bin, in place.

    The five bins are the quintiles of ``log(TransactionAmt)`` on train. Test
    rows are assigned with the same edges and the same interval labels, so a
    bin is encoded identically in both frames. Test amounts outside the train
    range are put into the first / last bin instead of becoming NaN.

    Args:
        train: Training frame with ``TransactionAmt``, modified in place.
        test: Test frame with ``TransactionAmt``, modified in place.

    Returns:
        None.
    """
    train_log = np.log(train['TransactionAmt'])
    test_log = np.log(test['TransactionAmt'])

    q, bins = pd.qcut(train_log, 5, retbins=True)
    train['TransactionAmt'] = q

    # pd.cut(test_log, bins) on its own excludes the lowest edge, drops
    # out-of-range values and labels its first interval differently from
    # qcut. Computing integer codes with include_lowest and re-using the
    # train categories avoids all three.
    clipped = test_log.clip(lower=bins[0], upper=bins[-1])
    codes = pd.cut(clipped, bins, labels=False, include_lowest=True)
    codes = codes.fillna(-1).astype(int).to_numpy()  # -1 marks a missing amount
    test['TransactionAmt'] = pd.Series(
        pd.Categorical.from_codes(codes, categories=q.cat.categories, ordered=True),
        index=test.index)
    
def fe_nan(train, test):
    """Add a ``<col>_nan`` presence flag for every feature column, in place.

    The flag is 1 where the value is present and 0 where it is missing.
    ``isFraud``, ``TransactionDT`` and ``TransactionID`` are skipped; all
    three must exist in ``train``.

    Args:
        train: Training frame, modified in place.
        test: Test frame, modified in place; must hold every flagged column.

    Returns:
        None.
    """
    print('fe_nan')
    feature_cols = list(train.columns)
    for excluded in ('isFraud', 'TransactionDT', 'TransactionID'):
        feature_cols.remove(excluded)

    for frame in (train, test):
        for name in feature_cols:
            frame[name + '_nan'] = np.where(frame[name].notna(), 1, 0)
        

# Pseudo-labelling and object influence

In [None]:
def get_pseudo(train, test, preds):
    """Append confidently predicted test rows to ``train`` as pseudo-labels.

    Test rows whose prediction is <= 0.01 or >= 0.99 are copied, their
    prediction is hardened to 0 / 1 in a ``target`` column, and the copies
    are appended to ``train``.

    Args:
        train: Training frame. NOTE(review): the pseudo-label is stored in a
            column named ``target``; confirm that this matches the label
            column of the frame passed in.
        test: Test frame. A temporary ``target`` column is added and removed.
        preds: Predicted probabilities, one per row of ``test``.

    Returns:
        Tuple ``(train, test)``: the extended training frame (index reset)
        and the test frame without the temporary column.
    """
    test['target'] = preds
    # Both sides of the mask now use `test`; the right-hand side used to read
    # an undefined name `test2`.
    confident = (test['target'] <= 0.01) | (test['target'] >= 0.99)
    data = test[confident].copy()
    print('Added ',data.shape[0], 'new instances.')

    data['target'] = (data['target'] >= 0.5).astype(float)

    train = pd.concat([train, data], axis=0)
    train.reset_index(drop=True, inplace=True)
    # Keyword form: pandas 2 no longer accepts the axis positionally.
    test.drop(columns='target', inplace=True)
    return train, test

def get_weights(X, y, X_pseudo):
    """Score how much each training object influences the pseudo-labelled rows.

    Args:
        X: Training features.
        y: Training labels.
        X_pseudo: Pseudo-labelled rows including their ``target`` column.

    Returns:
        Whatever ``CatBoostClassifier.get_object_importance`` returns for the
        pseudo-labelled pool evaluated against the training pool.
    """
    train_pool = catboost.Pool(data=X, label=y)
    # columns=...: a bare drop('target') looks for a ROW labelled 'target'.
    pseudo_pool = catboost.Pool(data=X_pseudo.drop(columns='target'),
                                label=X_pseudo['target'])

    # Object importance needs a trained model; the previous code queried a
    # freshly constructed, unfitted classifier.
    # NOTE(review): default hyper-parameters are used -- tune as needed.
    model = catboost.CatBoostClassifier()
    model.fit(train_pool, verbose=False)

    imps = model.get_object_importance(pseudo_pool, train_pool)
    return imps

def do_augment(x, y, t=2):
    """Augment a 2-D array by shuffling each column independently per class.

    Appends ``t`` shuffled copies of the positive rows (``y > 0``) and
    ``t // 2`` shuffled copies of the negative rows (``y == 0``). Within a
    copy every column is permuted on its own, so marginal distributions per
    class are kept while cross-feature structure is broken.

    Uses NumPy's global RNG (``np.random.shuffle``).

    Args:
        x: 2-D NumPy array of features, one row per sample.
        y: 1-D NumPy array of labels aligned with ``x``.
        t: Number of positive copies; ``t // 2`` negative copies are added.
            Values below 2 now work too (they raised in ``np.vstack`` before).

    Returns:
        Tuple ``(x_aug, y_aug)``: the original rows followed by the positive
        and then the negative copies, with labels 1 / 0 for the copies.
    """
    def _shuffle_columns(rows):
        # Return a copy of `rows` with every column permuted independently.
        shuffled = rows.copy()
        ids = np.arange(shuffled.shape[0])
        for c in range(shuffled.shape[1]):
            np.random.shuffle(ids)
            # Index the single column rather than materialising the whole
            # permuted array for every column.
            shuffled[:, c] = shuffled[ids, c]
        return shuffled

    xp = [_shuffle_columns(x[y > 0]) for _ in range(t)]
    xn = [_shuffle_columns(x[y == 0]) for _ in range(t // 2)]

    n_pos = sum(part.shape[0] for part in xp)
    n_neg = sum(part.shape[0] for part in xn)

    # Stacking one flat list never hits np.vstack([]) when a class gets no copies.
    x = np.vstack([x] + xp + xn)
    y = np.concatenate([y, np.ones(n_pos), np.zeros(n_neg)])
    return x, y

# interactions

In [None]:
# Column pairs whose element-wise product fe_pairs adds as an interaction
# feature named '<first>_<second>'.
# NOTE(review): ['C13', 'C13'] and ['V258', 'V258'] pair a column with itself
# (i.e. a squared feature) -- confirm that this is intended.
pairs_cols=[['C1', 'V258'],
 ['C1', 'C14'],
 ['C1', 'C13'],
 ['C4', 'V258'],
 ['C14', 'V294'],
 ['C7', 'V258'],
 ['V258', 'V294'],
 ['V294', 'card6'],
 ['C11', 'C13'],
 ['C14', 'C8'],
 ['C11', 'C14'],
 ['C11', 'V70'],
 ['C8', 'V258'],
 ['C11', 'D2'],
 ['C14', 'C4'],
 ['C14', 'D2'],
 ['C1', 'V189'],
 ['V312', 'V70'],
 ['V189', 'V258'],
 ['C14', 'V317'],
 ['C14', 'V308'],
 ['C13', 'D2'],
 ['C4', 'D2'],
 ['V70', 'card6'],
 ['C1', 'C4'],
 ['V189', 'V45'],
 ['C12', 'C13'],
 ['V91', 'card6'],
 ['C8', 'D2'],
 ['C1', 'V201'],
 ['C12', 'V258'],
 ['C1', 'C5'],
 ['C13', 'C7'],
 ['C11', 'V258'],
 ['V201', 'V294'],
 ['C14', 'V283'],
 ['C13', 'V45'],
 ['C5', 'V312'],
 ['C14', 'id_17'],
 ['C13', 'C8'],
 ['C11', 'V91'],
 ['C13', 'C13'],
 ['D2', 'V308'],
 ['V149', 'card6'],
 ['card2', 'id_01'],
 ['V283', 'card6'],
 ['C7', 'V156'],
 ['C1', 'R_emaildomain'],
 ['V294', 'V62'],
 ['V317', 'card6'],
 ['V258', 'V45'],
 ['C14', 'V189'],
 ['V294', 'V45'],
 ['C13', 'C4'],
 ['D2', 'TransactionAmt'],
 ['C14', 'V156'],
 ['V62', 'card6'],
 ['ProductCD', 'TransactionAmt'],
 ['C11', 'C5'],
 ['V187', 'V258'],
 ['C14', 'addr2'],
 ['C14', 'V312'],
 ['TransactionAmt', 'card2'],
 ['V129', 'V70'],
 ['V283', 'V308'],
 ['TransactionAmt', 'card3'],
 ['C1', 'id_17'],
 ['C14', 'R_emaildomain'],
 ['C1', 'D2'],
 ['C4', 'V294'],
 ['V244', 'V45'],
 ['V258', 'V258'],
 ['C7', 'V149'],
 ['D3', 'V294'],
 ['D10', 'card6'],
 ['V149', 'id_30'],
 ['M5', 'V45'],
 ['D2', 'V62'],
 ['C1', 'V169'],
 ['V189', 'V294'],
 ['C12', 'V294'],
 ['C1', 'C11'],
 ['C4', 'V317'],
 ['D2', 'V83'],
 ['C1', 'V223'],
 ['D3', 'card6'],
 ['C14', 'id_01'],
 ['C8', 'V317'],
 ['C14', 'V333'],
 ['C14', 'card3'],
 ['V129', 'V91'],
 ['C1', 'V244'],
 ['V244', 'V294'],
 ['C13', 'V317'],
 ['V128', 'V91'],
 ['V12', 'V317'],
 ['V149', 'V156'],
 ['V243', 'V317'],
 ['V223', 'V244'],
 ['C1', 'id_13']]

def fe_pairs(train, test, pairs=None):
    """Add pairwise product features to both frames in place.

    For every pair ``[c1, c2]`` a column ``c1 + '_' + c2`` holding
    ``c1 * c2`` is added, so both columns must be numeric by the time this
    runs.

    Args:
        train: Training frame, modified in place.
        test: Test frame, modified in place.
        pairs: Optional list of two-element column-name sequences. Defaults
            to the module-level ``pairs_cols``.

    Returns:
        List of the names of the columns that were added, in pair order.
    """
    print('fe_pairs')
    if pairs is None:
        pairs = pairs_cols

    inter_cols = []
    for pair in pairs:
        c1, c2 = pair[0], pair[1]
        print(c1,c2)
        name = c1 + '_' + c2
        train[name] = train[c1] * train[c2]
        test[name] = test[c1] * test[c2]
        inter_cols.append(name)
    return inter_cols

# under, over sampling

In [None]:
# TransactionAmt, dist - RobustScaler, np.log
# card, addr - categories

In [None]:
# Free any garbage left over from earlier cells before loading the data.
gc.collect()

In [None]:
# Load the raw frames, then remove id_31 from both.
# NOTE(review): the reason for dropping id_31 is not recorded in the notebook.
train, test, sub = load_data()
# Keyword form: pandas 2 no longer accepts the axis as a positional argument.
train.drop(columns='id_31', inplace=True)
test.drop(columns='id_31', inplace=True)

In [None]:
# Feature-engineering pipeline; every step mutates train/test in place, so
# this cell is order-dependent and must run once on freshly loaded frames.
# fe_nan(train, test)
fe_id_30(train,test)
fe_id_33(train,test)
fe_mail(train,test)
fe_dec(train, test)

# fe_date creates the calendar columns that fe_date_agg consumes and drops.
# tr_bin replaces TransactionAmt by a bin, so it runs after every step that
# needs the raw amount.
fe_date(train,test)
fe_date_agg(train,test)
tr_bin(train, test)

obj_cols = list(train.select_dtypes('object').columns)
num_cols = [c for c in train.columns if c not in obj_cols]

fe_cat_other(train,test,obj_cols)
# The binned amount is categorical too, so it is label-encoded with the rest.
obj_cols.append('TransactionAmt')
fe_le(train,test, obj_cols)
# Bound to its own name: assigning the result to `pairs_cols` replaced the
# pair list with column names, so fe_pairs could not be run a second time.
inter_cols = fe_pairs(train, test)
# fe_smooth(train,test)
do_fe_enc(train, test, obj_cols)

In [None]:
# Separate features from the label and from the id / time columns.
# Rows are NOT re-sorted here: X, y and X_test keep the row order of the
# loaded frames.
X = train.drop(['isFraud',  'TransactionID','TransactionDT'], axis=1)
y = train['isFraud']
del train
gc.collect()

X_test = test.drop(['TransactionID','TransactionDT'], axis=1)
# Only the keys needed to build the submission are kept.
test = test[["TransactionDT", 'TransactionID']]

In [None]:
# adv(X,X_test)

In [None]:
# TRAIN AUC: 0.9497488486026882	 TRAIN LOG: 0.06640471851123202
# VALID AUC: 0.9269320753003503	 VALID LOG: 0.08852609545761803 base

# TRAIN AUC: 0.9509330521028726	 TRAIN LOG: 0.06577636329065795
# VALID AUC: 0.9241533723666887	 VALID LOG: 0.0894685065111738

In [None]:
# Hold-out split by row position: the first 550,000 rows train, the last
# 40,540 rows validate (presumably together the full training set, in
# chronological order -- TODO confirm).
X_train = X.head(550000)
Y_train = y[X_train.index]
X_valid = X.tail(40540)
Y_valid = y[X_valid.index]
# X is rebuilt later by concatenating X_train and X_valid again.
del X
gc.collect()

In [None]:
# free_raw_data=False keeps the raw frames inside the Datasets, so they can be
# fetched back with get_data() after the local names are deleted.
dtrain = lgb.Dataset(data=X_train, label=Y_train, free_raw_data=False)
del X_train, Y_train
gc.collect()
dval = lgb.Dataset(data=X_valid, label=Y_valid, free_raw_data=False)
del X_valid, Y_valid
gc.collect()

In [None]:
# Fit with early stopping on the hold-out set (dval, listed last).
# NOTE(review): the verbose_eval / early_stopping_rounds keywords were removed
# from lgb.train in LightGBM 4.0; newer versions need the
# lgb.log_evaluation / lgb.early_stopping callbacks instead.
clf = lgb.train(lgb_params, dtrain, 
                num_boost_round=1000,
                verbose_eval=200,
                early_stopping_rounds=100, 
                valid_sets = [dtrain, dval])

In [None]:
# Importance of every feature in the hold-out model; X_test has the same
# feature columns as the training data.
feature_importance = pd.DataFrame({
    "feature": X_test.columns,
    "importance": clf.feature_importance(),
})
# Names of the 50 most important features.
cols = (
    feature_importance[["feature", "importance"]]
    .groupby("feature")
    .mean()
    .sort_values("importance", ascending=False)[:50]
    .index
)
best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
plt.figure(figsize=(16, 12))
sns.barplot(data=best_features.sort_values(by="importance", ascending=False),
            x="importance",
            y="feature")
plt.title('LGB Features (avg over folds)')

In [None]:
# Pull the raw frames and labels back out of the Datasets (possible because
# they were built with free_raw_data=False), then release the Datasets.
X_train = dtrain.get_data()
Y_train = dtrain.get_label()
del dtrain
gc.collect()

X_valid = dval.get_data()
Y_valid = dval.get_label()
del dval
gc.collect()

In [None]:
# Score the hold-out model on both parts; `oof` holds the predictions for the
# validation rows, which the model was not fitted on.
oof = clf.predict(X_valid)
train_pred = clf.predict(X_train)
print(f'TRAIN AUC: {roc_auc_score(Y_train, train_pred)}\t TRAIN LOG: {log_loss(Y_train,  train_pred)}')
print(f'VALID AUC: {roc_auc_score(Y_valid, oof)}\t VALID LOG: {log_loss(Y_valid,  oof)}')

In [None]:
# Re-assemble the full training features (train part first, then validation)
# for SHAP and for the final refit.
X = pd.concat([X_train,X_valid])
del X_train, X_valid
gc.collect()

In [None]:
# SHAP summary (bar plot) of the hold-out model on the last 40,000 rows of X.
shap.initjs()
explainer = shap.TreeExplainer(clf)
values = explainer.shap_values(X.tail(40000))
shap.summary_plot(values, X.tail(40000), plot_type="bar")

In [None]:
# clf,num_iter =eval_ts(X,y)

In [None]:
# Refit on ALL training rows (no validation set), using the number of rounds
# that early stopping selected for the hold-out model.
dtrain=lgb.Dataset(data=X, label=y)
del X
gc.collect()
# Must be read before `clf` is overwritten by the refit below.
num_iter = clf.best_iteration
clf = lgb.train(lgb_params, dtrain, 
            num_boost_round=num_iter,
            verbose_eval=200)

In [None]:
# Build the submission file.
# X_test was derived from `test` without re-ordering, so the predictions are
# attached by X_test's index. Sorting `test` first and then assigning the raw
# prediction array by position could pair predictions with the wrong rows
# (the default sort is not stable when TransactionDT has ties).
test = test.copy()  # `test` is a column slice; copy before adding a column
test['prediction'] = pd.Series(clf.predict(X_test), index=X_test.index)
test = test.sort_values('TransactionDT')
# Look each submission row up by id instead of relying on merge row order.
# Assumes TransactionID is unique in `test` -- TODO confirm.
sub['isFraud'] = sub['TransactionID'].map(test.set_index('TransactionID')['prediction'])
sub.to_csv('rus.csv', index=False)
sub.tail()