In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgbm

import seaborn as sns
import matplotlib.pyplot as plt
import gc

In [2]:
%%time
train_transaction = pd.read_csv('input/train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv('input/test_transaction.csv', index_col='TransactionID')


train_identity = pd.read_csv('input/train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('input/test_identity.csv', index_col='TransactionID')

sample_submission = pd.read_csv('input/sample_submission.csv', index_col='TransactionID')

train_df = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test_df = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)
    
print(train_df.shape)
print(test_df.shape)

train_label = train_df['isFraud'].copy()
train_df = train_df.drop('isFraud', axis=1)


(590540, 433)
(506691, 432)
CPU times: user 41.8 s, sys: 4.33 s, total: 46.1 s
Wall time: 60 s


In [3]:
train_df['online']=0
test_df['online']=0
train_df.loc[train_identity.index.values,'online']=1
test_df.loc[test_identity.index.values,'online']=1

train_df['missing']=train_df.isnull().sum(axis=1)
test_df['missing']=test_df.isnull().sum(axis=1)

train_df=train_df.fillna(-999)
test_df=test_df.fillna(-999)

train_df=train_df.reset_index(drop=True)
train_label=train_label.reset_index(drop=True)
test_df=test_df.reset_index(drop=True)

del train_transaction, train_identity, test_transaction, test_identity
gc.collect()

49

In [4]:
cards=[i for i in train_df.columns if i.startswith('card')]
addr=[i for i in train_df.columns if i.startswith('addr')]
dist=[i for i in train_df.columns if i.startswith('dist')]
emaildomain=[i for i in train_df.columns if i.endswith('emaildomain')]
_bin=[i+'_bin' for i in emaildomain]
_suffix=[i+'_suffix' for i in emaildomain]
C=[i for i in train_df.columns if i.startswith('C')]
D=[i for i in train_df.columns if i.startswith('D') and i[1]!='e']
M=[i for i in train_df.columns if i.startswith('M')]
V=[i for i in train_df.columns if i.startswith('V')]

id_=[i for i in train_df.columns if i.startswith('id')]


In [5]:
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'protonmail', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
us_emails = ['gmail', 'net', 'edu']


In [6]:
from tqdm import tqdm_notebook as tqdm
from itertools import combinations
def agg_feat(train_df,test_df,grp_cols):
    count=0
    new_name=""
    for c in grp_cols:
        new_name += c+'_'
    for c in grp_cols:
        if count==0:
            train_df[new_name] = train_df[c].astype(str)+'_'
            test_df[new_name] = test_df[c].astype(str)+'_'
            count+=1
        else:
            train_df[new_name] += train_df[c].astype(str)+'_'
            test_df[new_name] += test_df[c].astype(str)+'_'
            
    return train_df,test_df

#for f in tqdm(output):
#    if len(f)>1:
#        in_=[item for sublist in f for item in sublist]
#        #print(in_)
#        train_df,test_df=agg_feat(train_df,test_df,in_)
        


train_df,test_df=agg_feat(train_df,test_df,['C1','C14'])
train_df,test_df=agg_feat(train_df,test_df,['C13','C14'])
train_df,test_df=agg_feat(train_df,test_df,['C8','C14'])
train_df,test_df=agg_feat(train_df,test_df,['C1','C13'])
train_df,test_df=agg_feat(train_df,test_df,['V258','V201'])
train_df,test_df=agg_feat(train_df,test_df,['V201','V244'])
train_df,test_df=agg_feat(train_df,test_df,['V258','V244'])
train_df,test_df=agg_feat(train_df,test_df,['V258','V262'])
train_df,test_df=agg_feat(train_df,test_df,['V201','V244'])
train_df,test_df=agg_feat(train_df,test_df,['addr1','card1'])
train_df,test_df=agg_feat(train_df,test_df,['addr1','card2'])
train_df,test_df=agg_feat(train_df,test_df,['addr2','card1'])
train_df,test_df=agg_feat(train_df,test_df,['addr2','card1'])
train_df,test_df=agg_feat(train_df,test_df,['C13','V258'])
train_df,test_df=agg_feat(train_df,test_df,['C1','V258'])
train_df,test_df=agg_feat(train_df,test_df,['C1','V201'])
train_df,test_df=agg_feat(train_df,test_df,['C14','V294'])

input_ = [[i] for i in train_df.columns if i.startswith('card')]
output = sum([list(map(list, combinations(input_, i))) for i in range(len(input_) + 1)], [])
for f in tqdm(output):
    if len(f)>1:
        in_=[item for sublist in f for item in sublist]
        #print(in_)
        train_df,test_df=agg_feat(train_df,test_df,in_)

train_df,test_df=agg_feat(train_df,test_df,['TransactionAmt','ProductCD']+cards)
train_df,test_df=agg_feat(train_df,test_df,['ProductCD']+cards)        
train_df,test_df=agg_feat(train_df,test_df,['TransactionAmt','ProductCD'])
#train_df,test_df=agg_feat(train_df,test_df,cards)
train_df,test_df=agg_feat(train_df,test_df,addr)
train_df,test_df=agg_feat(train_df,test_df,cards+addr)
train_df,test_df=agg_feat(train_df,test_df,cards+addr+emaildomain)
train_df,test_df=agg_feat(train_df,test_df,cards+addr+emaildomain+['ProductCD']+cards)
train_df,test_df=agg_feat(train_df,test_df,emaildomain)

train_df,test_df=agg_feat(train_df,test_df,C)
train_df,test_df=agg_feat(train_df,test_df,M)
train_df,test_df=agg_feat(train_df,test_df,id_)

#https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
for c in ['P_emaildomain', 'R_emaildomain']:
    train_df[c + '_bin'] = train_df[c].map(emails)
    test_df[c + '_bin'] = test_df[c].map(emails)
    
    train_df[c + '_suffix'] = train_df[c].map(lambda x: str(x).split('.')[-1])
    test_df[c + '_suffix'] = test_df[c].map(lambda x: str(x).split('.')[-1])
    
    train_df[c + '_suffix'] = train_df[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
    test_df[c + '_suffix'] = test_df[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')

train_df['email_domain_comp'] = (train_df['P_emaildomain'].values == train_df['R_emaildomain'].values).astype(int)
test_df['email_domain_comp'] = (test_df['P_emaildomain'].values == test_df['R_emaildomain'].values).astype(int)

train_df['email_domain_suffix_bin'] = (train_df['P_emaildomain_bin'].values == train_df['R_emaildomain_bin'].values).astype(int)
test_df['email_domain_suffix_bin'] = (test_df['P_emaildomain_bin'].values == test_df['R_emaildomain_bin'].values).astype(int)

train_df['email_domain_suffix_comp'] = (train_df['P_emaildomain_suffix'].values == train_df['R_emaildomain_suffix'].values).astype(int)
test_df['email_domain_suffix_comp'] = (test_df['P_emaildomain_suffix'].values == test_df['R_emaildomain_suffix'].values).astype(int)


HBox(children=(IntProgress(value=0, max=64), HTML(value='')))




In [7]:
# Label Encoding
from tqdm import tqdm_notebook as tqdm

for f in tqdm(train_df.columns):
    if train_df[f].dtype=='object' or test_df[f].dtype=='object': 
        tmp = pd.factorize(pd.concat([train_df[f],test_df[f]],axis=0), sort=False)[0].astype('int32')
        train_df[f]=tmp[:train_df.shape[0],]
        test_df[f]=tmp[train_df.shape[0]:,]


HBox(children=(IntProgress(value=0, max=524), HTML(value='')))




In [8]:
# add 1-way grouping count-based features
from tqdm import tqdm_notebook as tqdm
new=[i for i in train_df.columns if i.endswith('_')]
cols=list(cards+addr+emaildomain+dist+C+D+M+id_+_bin+['DeviceType','DeviceInfo','ProductCD']+new)
for col in tqdm(cols):
    d = pd.concat([train_df[col],test_df[col]],axis=0).value_counts(dropna=False).to_dict()
    train_df[col+"_c"] = train_df[col].apply(lambda x:d.get(x,-999))
    test_df[col+"_c"] = test_df[col].apply(lambda x:d.get(x,-999))
    del d
    gc.collect()

HBox(children=(IntProgress(value=0, max=176), HTML(value='')))




In [9]:
#add 2-way grouping count-based features
import itertools
from tqdm import tqdm_notebook as tqdm

#['new_TransactionAmt','C13'],['new_TransactionAmt','V258']
cols=[['C1','C14'],['C13','C14'],['addr1','card1'],['card1','card2'],['C14','C8'],['C13','V258'],['C1','V258'],['V189','V258'],['C1','V201'],["C1","C13"],['V258','V262'],['C14','V294'],['V258','V244'],['V258','V201'],['V201','V244'],['V225','V258'],['V258','V294']]
#cols=C+addr+cards+['V258','V189','V244','V201','V294']
#cols=list(itertools.combinations(cols, 2))
for col in tqdm(cols):
    #col=list(col)
    tmp=pd.concat([train_df[col],test_df[col]],axis=0).groupby(by=col)[col[0]].transform('count')
    train_df[col[0]+col[1]+'_c']=tmp.iloc[0:train_df.shape[0],]
    test_df[col[0]+col[1]+'_c']=tmp.iloc[train_df.shape[0]:,]
    del tmp
    gc.collect()

HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




In [10]:
#add 3-way grouping count-based features
from tqdm import tqdm_notebook as tqdm

cols=[['C1','C14','V201'],['C1','V189','V258'],['C14','C8','V294'],['C1','V201','V244'],['C1','V130','V258'],['C14','C8','D2'],['V189','V258','V294'],['C1','V201','V209'],['C14','V201','V294']]
for col in tqdm(cols):
    tmp=pd.concat([train_df[col],test_df[col]],axis=0).groupby(by=col)[col[0]].transform('count')
    train_df[col[0]+col[1]+col[2]+'_c']=tmp.iloc[0:train_df.shape[0],]
    test_df[col[0]+col[1]+col[2]+'_c']=tmp.iloc[train_df.shape[0]:,]
    del tmp
    gc.collect()

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [11]:
col=['TransactionAmt','card1','card4']
cnt=pd.concat([train_df[col],test_df[col]],axis=0)
cnt['TransactionAmt_to_mean_card1'] = cnt['TransactionAmt'] / cnt.groupby(['card1'])['TransactionAmt'].transform('mean')
cnt['TransactionAmt_to_mean_card4'] = cnt['TransactionAmt'] / cnt.groupby(['card4'])['TransactionAmt'].transform('mean')
cnt['TransactionAmt_to_std_card1'] = cnt['TransactionAmt'] / cnt.groupby(['card1'])['TransactionAmt'].transform('std')
cnt['TransactionAmt_to_std_card4'] = cnt['TransactionAmt'] / cnt.groupby(['card4'])['TransactionAmt'].transform('std')

train_df['TransactionAmt_to_mean_card1']=cnt['TransactionAmt_to_mean_card1'].iloc[0:train_df.shape[0],]
test_df['TransactionAmt_to_mean_card1']=cnt['TransactionAmt_to_mean_card1'].iloc[train_df.shape[0]:,]

train_df['TransactionAmt_to_mean_card4']=cnt['TransactionAmt_to_mean_card4'].iloc[0:train_df.shape[0],]
test_df['TransactionAmt_to_mean_card4']=cnt['TransactionAmt_to_mean_card4'].iloc[train_df.shape[0]:,]

train_df['TransactionAmt_to_std_card1']=cnt['TransactionAmt_to_std_card1'].iloc[0:train_df.shape[0],]
test_df['TransactionAmt_to_std_card1']=cnt['TransactionAmt_to_std_card1'].iloc[train_df.shape[0]:,]

train_df['TransactionAmt_to_std_card4']=cnt['TransactionAmt_to_std_card4'].iloc[0:train_df.shape[0],]
test_df['TransactionAmt_to_std_card4']=cnt['TransactionAmt_to_std_card4'].iloc[train_df.shape[0]:,]

del cnt
gc.collect()

col=['id_02','card1','card4']
cnt=pd.concat([train_df[col],test_df[col]],axis=0)
cnt['id_02_to_mean_card1'] = cnt['id_02'] / cnt.groupby(['card1'])['id_02'].transform('mean')
cnt['id_02_to_mean_card4'] = cnt['id_02'] / cnt.groupby(['card4'])['id_02'].transform('mean')
cnt['id_02_to_std_card1'] = cnt['id_02'] / cnt.groupby(['card1'])['id_02'].transform('std')
cnt['id_02_to_std_card4'] = cnt['id_02'] / cnt.groupby(['card4'])['id_02'].transform('std')

train_df['id_02_to_mean_card1']=cnt['id_02_to_mean_card1'].iloc[0:train_df.shape[0],]
test_df['id_02_to_mean_card1']=cnt['id_02_to_mean_card1'].iloc[train_df.shape[0]:,]

train_df['id_02_to_mean_card4']=cnt['id_02_to_mean_card4'].iloc[0:train_df.shape[0],]
test_df['id_02_to_mean_card4']=cnt['id_02_to_mean_card4'].iloc[train_df.shape[0]:,]

train_df['id_02_to_std_card1']=cnt['id_02_to_std_card1'].iloc[0:train_df.shape[0],]
test_df['id_02_to_std_card1']=cnt['id_02_to_std_card1'].iloc[train_df.shape[0]:,]

train_df['id_02_to_std_card4']=cnt['id_02_to_std_card4'].iloc[0:train_df.shape[0],]
test_df['id_02_to_std_card4']=cnt['id_02_to_std_card4'].iloc[train_df.shape[0]:,]

del cnt
gc.collect()

col=['D15','card1','card4']
cnt=pd.concat([train_df[col],test_df[col]],axis=0)
cnt['D15_to_mean_card1'] = cnt['D15'] / cnt.groupby(['card1'])['D15'].transform('mean')
cnt['D15_to_mean_card4'] = cnt['D15'] / cnt.groupby(['card4'])['D15'].transform('mean')
cnt['D15_to_std_card1'] = cnt['D15'] / cnt.groupby(['card1'])['D15'].transform('std')
cnt['D15_to_std_card4'] = cnt['D15'] / cnt.groupby(['card4'])['D15'].transform('std')

train_df['D15_to_mean_card1']=cnt['D15_to_mean_card1'].iloc[0:train_df.shape[0],]
test_df['D15_to_mean_card1']=cnt['D15_to_mean_card1'].iloc[train_df.shape[0]:,]

train_df['D15_to_mean_card4']=cnt['D15_to_mean_card4'].iloc[0:train_df.shape[0],]
test_df['D15_to_mean_card4']=cnt['D15_to_mean_card4'].iloc[train_df.shape[0]:,]

train_df['D15_to_std_card1']=cnt['D15_to_std_card1'].iloc[0:train_df.shape[0],]
test_df['D15_to_std_card1']=cnt['D15_to_std_card1'].iloc[train_df.shape[0]:,]

train_df['D15_to_std_card4']=cnt['D15_to_std_card4'].iloc[0:train_df.shape[0],]
test_df['D15_to_std_card4']=cnt['D15_to_std_card4'].iloc[train_df.shape[0]:,]

del cnt
gc.collect()

col=['D15','addr1','card4']
cnt=pd.concat([train_df[col],test_df[col]],axis=0)
cnt['D15_to_mean_addr1'] = cnt['D15'] / cnt.groupby(['addr1'])['D15'].transform('mean')
cnt['D15_to_mean_card4'] = cnt['D15'] / cnt.groupby(['card4'])['D15'].transform('mean')
cnt['D15_to_std_addr1'] = cnt['D15'] / cnt.groupby(['addr1'])['D15'].transform('std')
cnt['D15_to_std_card4'] = cnt['D15'] / cnt.groupby(['card4'])['D15'].transform('std')

train_df['D15_to_mean_addr1']=cnt['D15_to_mean_addr1'].iloc[0:train_df.shape[0],]
test_df['D15_to_mean_addr1']=cnt['D15_to_mean_addr1'].iloc[train_df.shape[0]:,]

train_df['D15_to_mean_card4']=cnt['D15_to_mean_card4'].iloc[0:train_df.shape[0],]
test_df['D15_to_mean_card4']=cnt['D15_to_mean_card4'].iloc[train_df.shape[0]:,]

train_df['D15_to_std_addr1']=cnt['D15_to_std_addr1'].iloc[0:train_df.shape[0],]
test_df['D15_to_std_addr1']=cnt['D15_to_std_addr1'].iloc[train_df.shape[0]:,]

train_df['D15_to_std_card4']=cnt['D15_to_std_card4'].iloc[0:train_df.shape[0],]
test_df['D15_to_std_card4']=cnt['D15_to_std_card4'].iloc[train_df.shape[0]:,]

del cnt
gc.collect()


63

In [12]:
# Log1p transformation for TransactionAmt
log_cols=['TransactionAmt']+dist
train_df[log_cols]=np.log1p(train_df[log_cols])
test_df[log_cols]=np.log1p(test_df[log_cols])

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [13]:
def grp_agg(train_df,test_df,agg,grpby):
    d = pd.concat([train_df[agg+grpby],test_df[agg+grpby]],axis=0).groupby(grpby).mean().to_dict()
    train_df['mean'+str(grpby[0])] = train_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))
    test_df["mean"+str(grpby[0])] = test_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))
    
    d = pd.concat([train_df[agg+grpby],test_df[agg+grpby]],axis=0).groupby(grpby).sum().to_dict()
    train_df['sum'+str(grpby[0])] = train_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))
    test_df["sum"+str(grpby[0])] = test_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))

    d = pd.concat([train_df[agg+grpby],test_df[agg+grpby]],axis=0).groupby(grpby).min().to_dict()
    train_df['min'+str(grpby[0])] = train_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))
    test_df["min"+str(grpby[0])] = test_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))

    d = pd.concat([train_df[agg+grpby],test_df[agg+grpby]],axis=0).groupby(grpby).max().to_dict()
    train_df['max'+str(grpby[0])] = train_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))
    test_df["max"+str(grpby[0])] = test_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))

    d = pd.concat([train_df[agg+grpby],test_df[agg+grpby]],axis=0).groupby(grpby).std().to_dict()
    train_df['std'+str(grpby[0])] = train_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))
    test_df["std"+str(grpby[0])] = test_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))

    d = pd.concat([train_df[agg+grpby],test_df[agg+grpby]],axis=0).groupby(grpby).median().to_dict()
    train_df['median'+str(grpby[0])] = train_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))
    test_df["median"+str(grpby[0])] = test_df[grpby[0]].apply(lambda x:d[agg[0]].get(x,-999))
    
    return train_df,test_df



train_df,test_df=grp_agg(train_df,test_df,['TransactionAmt'],['card1'])
train_df,test_df=grp_agg(train_df,test_df,['TransactionAmt'],['card2'])
#train_df,test_df=grp_agg(train_df,test_df,['TransactionAmt'],['card3'])
train_df,test_df=grp_agg(train_df,test_df,['TransactionAmt'],['card1_card2_card3_card4_card5_card6_'])

train_df,test_df=grp_agg(train_df,test_df,['TransactionAmt'],['addr1'])
train_df,test_df=grp_agg(train_df,test_df,['TransactionAmt'],['addr2'])
train_df,test_df=grp_agg(train_df,test_df,['TransactionAmt'],['addr1_addr2_'])

train_df,test_df=grp_agg(train_df,test_df,['TransactionAmt'],['P_emaildomain'])
train_df,test_df=grp_agg(train_df,test_df,['TransactionAmt'],['R_emaildomain'])

train_df,test_df=grp_agg(train_df,test_df,['C14'],['C1'])
train_df,test_df=grp_agg(train_df,test_df,['C14'],['C13'])

In [14]:
import datetime
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
train_df['TransactionDT'] = train_df['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))

#train_df['year'] = train_df['TransactionDT'].dt.year
#train_df['month'] = train_df['TransactionDT_new'].dt.month
#train_df['week'] = train_df['TransactionDT'].dt.week
train_df['dow'] = train_df['TransactionDT'].dt.dayofweek
train_df['hour'] = train_df['TransactionDT'].dt.hour
train_df['day'] = train_df['TransactionDT'].dt.day
train_df['wom'] = train_df['TransactionDT'].apply(lambda d: (d.day-1) // 7 + 1)

test_df['TransactionDT'] = test_df['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))

#test_df['year'] = test_df['TransactionDT'].dt.year
#test_df['month'] = test_df['TransactionDT_new'].dt.month
#test_df['week'] = test_df['TransactionDT'].dt.week
test_df['dow'] = test_df['TransactionDT'].dt.dayofweek
test_df['hour'] = test_df['TransactionDT'].dt.hour
test_df['day'] = test_df['TransactionDT'].dt.day
test_df['wom'] = test_df['TransactionDT'].apply(lambda d: (d.day-1) // 7 + 1)

In [15]:
train_df.drop(['TransactionDT'],axis=1,inplace=True)
test_df.drop(['TransactionDT'],axis=1,inplace=True)

In [16]:
#Delete some columns
cols = ['V300', 'V309', 'V111', 'V124', 'V106', 'V125', 'V315', 'V134', 'V102', 'V123', 'V316', 'V113', 'V136', 'V305',
        'V110', 'V299', 'V289', 'V286', 'V318', 'V103', 'V304', 'V116', 'V298', 'V284', 'V293', 'V137', 'V295', 'V301',
        'V104', 'V311', 'V115', 'V109', 'V119', 'V321', 'V114', 'V133', 'V122', 'V319', 'V105', 'V112', 'V118', 'V117',
        'V121', 'V108', 'V135', 'V320', 'V303', 'V297', 'V120']

one_value_cols = [col for col in train_df.columns if train_df[col].nunique() <= 1]
one_value_cols_test_df = [col for col in test_df.columns if test_df[col].nunique() <= 1]
print(one_value_cols == one_value_cols_test_df)

many_null_cols = [col for col in train_df.columns if train_df[col].isnull().sum() / train_df.shape[0] > 0.9]
many_null_cols_test_df = [col for col in test_df.columns if test_df[col].isnull().sum() / test_df.shape[0] > 0.9]
big_top_value_cols = [col for col in train_df.columns if train_df[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]
big_top_value_cols_test_df = [col for col in test_df.columns if test_df[col].value_counts(dropna=False, normalize=True).values[0] > 0.9]

cols_to_drop = list(set(cols+many_null_cols + many_null_cols_test_df +
                        big_top_value_cols +
                        big_top_value_cols_test_df +
                        one_value_cols+ one_value_cols_test_df))
print(len(cols_to_drop))

train_df = train_df.drop(cols_to_drop, axis=1)
test_df = test_df.drop(cols_to_drop, axis=1)

False
100


In [17]:
train_df.shape,test_df.shape

((590540, 703), (506691, 703))

In [18]:
# One hot encode categorical features
from scipy import sparse as ssp
from sklearn.preprocessing import OneHotEncoder

cat_features=addr+emaildomain+_bin+_suffix+cards+M+['ProductCD','DeviceType','DeviceInfo']#+id_
print(len(cat_features))
rest_feats=[i for i in train_df.columns if i not in cat_features]

enc = OneHotEncoder(categories='auto')
enc.fit(list(train_df[cat_features].values)+list(test_df[cat_features].values))
X_cat = enc.transform(train_df[cat_features])
X_t_cat = enc.transform(test_df[cat_features])

train_list = [train_df[rest_feats].values,X_cat,]
test_list = [test_df[rest_feats].values,X_t_cat,]

train_df = ssp.hstack(train_list).tocsr()
test_df = ssp.hstack(test_list).tocsr()

del X_cat,X_t_cat,train_list,test_list,enc
gc.collect()

26


55

In [19]:
"""
LightGBM classifier with bagging
"""

import lightgbm as lgbm
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import KFold,StratifiedKFold

bootstrap=True
subsample=1
shuffle=True
test_id=test_df.shape[0]
num_boost_round=20000

final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(test_id)

NFOLDS = 5

random_seeds=[5736]#,45,123,9632]
num_bagg=len(random_seeds)
for s in range(0,num_bagg):
    kfold = KFold(NFOLDS, shuffle=True, random_state=random_seeds[s])
    #ix = np.random.choice(len(train_label), int(subsample * len(train_label)), bootstrap)
    #if not shuffle:
    #    ix=np.sort(ix)
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(test_id)
    
    params = {
    'n_jobs' : -1 ,
    'device': 'cpu', 
    "objective": "binary",
    'metric': "auc",#'binary_logloss',
    "boosting_type": "gbdt", 
    'learning_rate': 0.01,
    #'max_depth': 10,
    'num_leaves': 2**8-1,
    'bagging_fraction': 0.9,
    'feature_fraction': 0.2,
    'bagging_freq': 1,
    #'boost_from_average':'false',
    #'min_child_samples': 50,
    #'min_child_weight': 100.0,
    #'min_split_gain': 0.1, 
    #'lambda_l1':0.05,
    #'lambda_l2':0.05,
    #"max_bin": 64,
    "seed":random_seeds[s]+1,
    'verbosity': 1
    }
    

    kf = kfold.split(train_df, train_label)
    #train_df=pd.concant([train_df,scaleddf_train],axis=1)
    best_trees = []
    fold_scores = []
    split=int(0.8*len(train_label))
    print(split)
    for i, (train_fold, validate) in enumerate(kf):
        if i!=4:
            continue
        print("\n\nFold #:",i+1)
        #train_fold=np.random.permutation(np.concatenate((train_fold,index_zero)))
        
        X_train, X_validate, label_train, label_validate = train_df[:split, :], train_df[split:, :], train_label[:split], train_label[split:]
        dtrain = lgbm.Dataset(X_train, label_train)
        
        del X_train,label_train
        gc.collect()
        
        #Train the model
        dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
        bst = lgbm.train(params, dtrain,valid_sets=[dtrain,dvalid],valid_names=['train','valid'],num_boost_round=num_boost_round, verbose_eval=100, early_stopping_rounds=100)
        best_trees.append(bst.best_iteration)
        
        #Predict on validation set
        cv_train[split:] += bst.predict(X_validate,num_iteration=bst.best_iteration)
        score = roc_auc_score(label_validate, cv_train[split:])
        print ("Score for each fold",score)
        fold_scores.append(score)
    
        # Free some memory
        del dtrain,dvalid, X_validate, label_validate
        gc.collect()
        
        # Predict on Test set
        #cv_pred += bst.predict(test_df, num_iteration=bst.best_iteration)

    cv_pred /= NFOLDS
    final_cv_train += cv_train
    final_cv_pred += cv_pred

    print("\n\nstr: ",np.array(fold_scores).std(), "fold scores: ",fold_scores)
    print("cv score: ", roc_auc_score(train_label, cv_train))
    print("current score: ", roc_auc_score(train_label, final_cv_train / (s+1)), s+1)

472432


Fold #: 5
Training until validation scores don't improve for 100 rounds.
[100]	train's auc: 0.942502	valid's auc: 0.897483
[200]	train's auc: 0.967373	valid's auc: 0.91138
[300]	train's auc: 0.980742	valid's auc: 0.920846
[400]	train's auc: 0.98854	valid's auc: 0.927133
[500]	train's auc: 0.993174	valid's auc: 0.930932
[600]	train's auc: 0.996003	valid's auc: 0.933215
[700]	train's auc: 0.997692	valid's auc: 0.934729
[800]	train's auc: 0.998625	valid's auc: 0.935738
[900]	train's auc: 0.999162	valid's auc: 0.936576
[1000]	train's auc: 0.999496	valid's auc: 0.937115
[1100]	train's auc: 0.999692	valid's auc: 0.937489
[1200]	train's auc: 0.999804	valid's auc: 0.93769
[1300]	train's auc: 0.999879	valid's auc: 0.937608
Early stopping, best iteration is:
[1223]	train's auc: 0.999825	valid's auc: 0.937765
Score for each fold 0.9377646810479559


str:  0.0 fold scores:  [0.9377646810479559]
cv score:  0.5155101204484595
current score:  0.5155101204484595 1
