In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm_notebook
np.set_printoptions(suppress=True)
from sklearn import preprocessing
from tqdm import tqdm
import decimal
from random import shuffle
from time import sleep
tqdm.pandas()
from sklearn.preprocessing import LabelEncoder
import gc
import datetime
from sklearn.model_selection import train_test_split, KFold, GroupKFold
import os
from sklearn.metrics import roc_auc_score
plt.style.use('ggplot')
np.set_printoptions(suppress=True)
import random
import lightgbm as lgb

In [2]:
#Always seed the randomness of this universe
def seed_everything(seed=51):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

In [3]:
seed_everything(seed=51)

In [4]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [5]:
%%time
train = pd.read_csv('../input/train4.csv')
print("train finished")

train finished
CPU times: user 38.4 s, sys: 3.35 s, total: 41.7 s
Wall time: 42.4 s


In [6]:
%%time
test = pd.read_csv('../input/test4.csv')
print("test finished")

test finished
CPU times: user 32.4 s, sys: 2.52 s, total: 35 s
Wall time: 35.5 s


In [7]:
#sample_submission
sample_submission = pd.read_csv('../input/sample_submission.csv')
print(train.shape,test.shape,sample_submission.shape)

(590540, 805) (506691, 804) (506691, 2)


In [8]:
y = train['isFraud']
del train['isFraud']
gc.collect()
print(train.shape,test.shape)

(590540, 804) (506691, 804)


# ENCODE_BITS

In [9]:
# utility: encode binary 0/1 columns as bits in a single integer
def encode_bits(binary_df):
    ncols = binary_df.shape[1]
    assert ncols < 64
    return binary_df @ (1 << np.arange(ncols))

In [15]:
(1 << np.arange(14))

array([   1,    2,    4,    8,   16,   32,   64,  128,  256,  512, 1024,
       2048, 4096, 8192])

In [12]:
[x for x in train.columns if x[0] =='C']

['C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14']

In [14]:
train[[x for x in train.columns if x[0] =='C']].head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14
0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0
1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
3,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0
4,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0


In [10]:
temp = encode_bits(train[[x for x in train.columns if x[0] =='C']])

In [13]:
temp.head()

0     14627.0
1     13347.0
2     13603.0
3    112012.0
4     13987.0
dtype: float64

# COUNTS

In [None]:
to_count = train.columns[2:].tolist()

for c in to_count:
    s = train[c]
    if hasattr(s, 'cat'):
        s = s.cat.codes
    vc = s.value_counts(dropna=False)
    train[f'{c}_count'] = s.map(vc).astype(np.int32)

In [None]:
tran['TimeInDay'] = tran.TransactionDT % 86400
tran['Cents'] = tran.TransactionAmt % 1
tran['C_bin'] = encode_bits(tran[CCOLS]>0)
tran['D_bin'] = encode_bits(tran[DCOLS].isnull())
tran['M_bin'] = encode_bits(tran[MCOLS].isnull())
tran['addr_bin'] = encode_bits(tran[['addr1','addr2','dist1','dist2']].isnull())
tran['email_bin'] = encode_bits(tran[['R_emaildomain','P_emaildomain']].isnull())

# MDLP

In [None]:
#https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/43886

In [None]:
#https://github.com/hlin117/mdlp-discretization

In [20]:
feature_to_use = ['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'C1', 'C3', 'C5', 'C13', 'D1', 'D3', 'D4', 'D10',
                  'D15', 'V12', 'V14', 'V15', 'V19', 'V29', 'V35', 'V37', 'V39', 'V41', 'V48', 'V53', 'V55', 'V56', 'V61', 'V75', 'V77', 
                  'V78', 'V79', 'V80', 'V82', 'V86', 'V88', 'V95', 'V98', 'V99', 'V100', 'V104', 'V107', 'V108', 'V109', 'V110', 'V111',
                  'V112', 'V114', 'V115', 'V116', 'V117', 'V118', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V129', 'V130', 'V131', 
                  'V135', 'V136', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V291', 'V300', 'V303', 
                  'V305', 'V310', 'V311', 'V312', 'V313', 'V314', 'V319', 'V320', 'Transaction_dow', 'Transaction_hour', 'M6', 'card4', 
                  'card6', 'P_emaildomain', 'ProductCD']

# Null number Equals

In [None]:
null_equal = {}

In [None]:
null_equal['group1'] = ['D1', 'V281', 'V282', 'V283', 'V288', 'V289', 'V296', 'V300', 'V301', 'V313', 'V314', 'V315']

In [None]:
null_equal['group2'] = ['D11', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11']

In [None]:
null_equal['group3'] = ['M1', 'M2', 'M3']

In [None]:
null_equal['group4'] = ['M8', 'M9']

In [None]:
null_equal['group5'] = ['id_01', 'id_12']

In [None]:
null_equal['group6'] = ['id_15', 'id_35', 'id_36', 'id_37', 'id_38','id_11', 'id_28', 'id_29']

In [None]:
null_equal['group7'] = ['id_05', 'id_06']

In [None]:
null_equal['group8'] = ['D8', 'D9', 'id_09', 'id_10']

In [None]:
null_equal['group9'] = ['id_03', 'id_04']

In [None]:
"""
My guess is that D1/D2 are about "how many days have passed from the first transaction", 
D3 is responsible for "how many days have passed from the previous transaction"
"""

In [None]:
"""
This is not correct. D9 is hour/24. You can also say that it is hours passed
since transaction. D8 is days passed since last transaction and D9 is its decimal part.
"""

In [None]:
"""
Any ordered categorical variables we've already described as numerical 
(e.g. Cx, Dx, id-1 to id-11, etc), anything else are non-ordering categorical
"""

In [None]:
"""
id_14 Correct, it's timezone,
"""

In [None]:
"""
 Just adding a feature which is the merger of card1,2,3 and 4
 and then encoded improves model accuracy
"""

In [None]:
"""
Thanks Chris! I actually did some extra work, thanks to the great topic from @snovik1975
https://www.kaggle.com/c/ieee-fraud-detection/discussion/107791#latest-622119

The missing values actually are highly related to different ProductCD.
For example, if we look at V1 - V11, the missing values from W is 29.21%
However, for H/C/S/R, 100% are missing!

As you can check for other blocks, similar story will happen. Looks like the ProductCD is a key.
"""

In [None]:
"""
V-columns have lots of time-based structure in them and there are also 
non-linear interrelations among them. Any PCA-style dimensionality reduction
is at least pointless and quite probably harmful. And even worse - those features 
start their life before the beginning of the train set therefore they might 
have values which we can figure out from the train but also those which we cannot.
And this is not on the feature level but within the feature.
So go figure out why organizers made it so complex with these pre-calculated features
Puzzle makes me love and hate it, LOL. I'm now trying to split by ProductCD, 
and filter some columns first.
"""

In [None]:
"""
So if categorical feature is given as numerical, LigthGBM could miss to extract full potential 
information from it. On the other hand, if given as categorical, 
there is bigger risk of overfitting.
From what I've observed, it could be a good idea to try specifying high-cardinality
categorical features as numeric (as they typically have high overfitting risk).
"""

In [None]:
"""
Permutation importance for solo features
Recursive feature elimination for block of features
PCA for groups of identical features (V columns)
-- Didn't check adversarial validation (next thing to do)

As I see all approaches work. And features selection is one of the key points to boost score.
"""

In [None]:
"""
['V1', 'V10', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V12', 'V120',
'V121', 'V122', 'V123', 'V124', 'V125', 'V13', 'V130', 'V131', 'V135', 'V136', 'V137', 'V138', 'V139', 'V14', 'V140', 'V141', 'V142',
'V146', 'V147', 'V149', 'V152', 'V154', 'V158', 'V159', 'V161', 'V162', 'V165', 'V166', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174',
'V175', 'V176', 'V18', 'V180', 'V181', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V19', 'V190', 'V194', 'V195', 'V197',
'V198', 'V199', 'V2', 'V20', 'V200', 'V201', 'V205', 'V207', 'V208', 'V209', 'V210', 'V216', 'V22', 'V220', 'V221', 'V223', 'V224',
'V226', 'V227', 'V228', 'V229', 'V23', 'V230', 'V234', 'V235', 'V238', 'V239', 'V24', 'V240', 'V241', 'V242', 'V243', 'V245', 'V246',
'V247', 'V25', 'V250', 'V252', 'V253', 'V255', 'V257', 'V258', 'V259', 'V26', 'V260', 'V261', 'V262', 'V263', 'V264', 'V267', 'V268',
'V27', 'V271', 'V274', 'V277', 'V281', 'V282', 'V283', 'V284', 'V285', 'V286', 'V287', 'V288', 'V289', 'V290', 'V292', 'V297', 'V3', 
'V30', 'V300', 'V301', 'V302', 'V303', 'V305', 'V309', 'V310', 'V312', 'V313', 'V314', 'V315', 'V319', 'V320', 'V321', 'V325', 'V334',
'V335', 'V336', 'V337', 'V338', 'V339', 'V35', 'V36', 'V37', 'V38', 'V39', 'V4', 'V40', 'V41', 'V43', 'V44', 'V45', 'V46', 'V47', 'V49',
'V5', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V58', 'V6', 'V60', 'V61', 'V62', 'V64', 'V65', 'V66', 'V67', 'V68', 'V7', 'V70', 'V72', 
'V73', 'V75', 'V76', 'V77', 'V78', 'V79', 'V8', 'V80', 'V82', 'V83', 'V85', 'V86', 'V87', 'V88', 'V9', 'V90', 'V93', 'V98']
"""

In [None]:
"""
Negative downsampling
This approach is also good for current competition.
The idea behind it - our model should find anomalies among normal transactions but reduced amount of normal transactions don't affect anomalies.

It could speed up training process by 4-5 times without losing the quality. There is very low difference between sample (0.9489), full data (0.9496) and LB (0.9496) in my CV.

It gives us ability to check many different hypotheses fast and to find best features within short time.
When we find best features/parameters we could use full data to get maximum score.
"""

In [None]:
"""
Approx. 99.8% of the data obey the following formula in train and test for variables V126 to V137
V126 = V129 + V132 + V135
V127 = V130 + V133 + V136
V128 = V131 + V134 + V137
"""