In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc,os,sys
import re
import random

from sklearn.model_selection import KFold, cross_val_score, train_test_split
# from sklearn import metrics, preprocessing
from sklearn.model_selection import StratifiedKFold
# from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import KMeans
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)

In [71]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > df[col].astype(np.int64) and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
        else:
                        df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df 

In [72]:
train_trn = pd.read_csv('train_transaction.csv', index_col='TransactionID')
test_trn = pd.read_csv('test_transaction.csv', index_col='TransactionID')

train_id = pd.read_csv('train_identity.csv', index_col='TransactionID')
test_id = pd.read_csv('test_identity.csv', index_col='TransactionID')

sub = pd.read_csv('sample_submission.csv')

In [151]:
train_trn.shape

(590540, 393)

In [73]:
train=pd.merge(train_trn,train_id, on ='TransactionID', how='left')
test = pd.merge(test_trn, test_id, on='TransactionID', how='left')

In [74]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 1955.37 MB
Memory usage after optimization is: 527.82 MB
Decreased by 73.0%
Memory usage of dataframe is 1673.87 MB
Memory usage after optimization is: 460.15 MB
Decreased by 72.5%


In [121]:
df_train = train.copy()
df_test = test.copy()

####  drop 80na

In [101]:
def drop_80na(df):
    null_percent = df.isna().sum()/df.shape[0]*100
    null_percent = pd.DataFrame({'Percent Missing':null_percent})
    cols_drop = null_percent.index[null_percent['Percent Missing'] > 80]
    df = df.drop(cols_drop, axis = 1)
    
    return df


In [122]:
df_train = drop_80na(df_train)
df_test = drop_80na(df_test)

#### null features

In [104]:
def null_col(df):
    df['nulls'] = train.isnull().sum(axis=1)
    
    return df

In [123]:
df_train = null_col(df_train)
df_test = null_col(df_test)

#### fill cat_cols with 'missing', num_cols with -1

In [107]:
def others_fill(df):
    # fill all cat with missing
    cat_cols = df.select_dtypes(include='category').columns
    df[cat_cols] = df[cat_cols].replace({ np.nan:'missing'})
    
    # fill all num with -1
    num_cols = df.select_dtypes(exclude = 'category').columns
    df[num_cols] = df[num_cols].replace({ np.nan:-1})
    
    return df
    

In [124]:
df_train = others_fill(df_train)
df_test = others_fill(df_test)

#### fill card 4&6

In [109]:
import random

In [110]:
def card46_fill(df):
    card4_filling = pd.DataFrame(df.card4.value_counts()/df.shape[0])
    card6_filling = pd.DataFrame(df.card6.value_counts()/df.shape[0])
    df.card4.fillna(random.choices(card4_filling.index,
                               k=1,
                               weights=card4_filling.card4)[0],
                               inplace = True)
    df.card6.fillna(random.choices(card6_filling.index,
                               k=1,
                               weights=card6_filling.card6)[0],
                               inplace = True)
    
    return df

In [125]:
df_train = card46_fill(df_train)
df_test = card46_fill(df_test)

#### deviceInfo

In [112]:
def device(df):
    # https://www.kaggle.com/viswajithkn/fraud-detection
    df['device_name'] = df['DeviceInfo'].str.split('/', expand=True)[0]

    df.loc[df['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    df.loc[df['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    df.loc[df['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    df.loc[df['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    df.loc[df['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    df.loc[df['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    df.loc[df['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    df.loc[df['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    df.loc[df['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    df.loc[df['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    df.loc[df['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    df.loc[df.device_name.isin(df.device_name.value_counts()[df.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    
    return df


In [126]:
df_train = device(df_train)
df_test = device(df_test)

#### id_31

In [115]:
def id_31_browser(df):
    df.id_31 = df.id_31.str.replace(' ','')
    df.loc[df.id_31.str.contains('chrome'),'id_31'] = 'chrome'
    df.loc[df.id_31.str.contains('safari'),'id_31'] = 'safari'
    df.loc[df.id_31.str.contains('firefox'),'id_31'] = 'firefox'
    df.loc[df.id_31.str.contains('Firefox'),'id_31'] = 'firefox'
    df.loc[df.id_31.str.contains('samsung'),'id_31'] = 'samsungbrowser'
    df.loc[df.id_31.str.contains('Samsung'),'id_31'] = 'samsungbrowser'
    df.loc[df.id_31.str.contains('google'),'id_31'] = 'google'
    df.loc[df.id_31.str.contains('edge'),'id_31'] = 'edge'
    df.loc[df.id_31.str.contains('opera'),'id_31'] = 'opera'
    df.loc[df.id_31.str.contains('ie'),'id_31'] = 'ie'
    
    return df

In [127]:
df_train = id_31_browser(df_train)
df_test = id_31_browser(df_test)

In [132]:
df_train.columns

Index(['isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1',
       'card2', 'card3', 'card4', 'card5', 'card6',
       ...
       'D15_to_mean_addr1', 'D15_to_std_addr1', 'nulls', 'device_name',
       'TransactionAmt_Log', 'Transaction_h', 'id_02_to_mean_addr1',
       'id_02_to_std_addr1', 'addr1_card1', 'TransactionAmt_decimal'],
      dtype='object', length=383)

#### new features

In [129]:
def new_features(df):
    # log transform of TransactionAmt
    df['TransactionAmt_Log'] = np.log(df['TransactionAmt'])
    
    # hour of the day in which a transaction happened.
    df['Transaction_h']=np.floor(df['TransactionDT']/3600)%24
    
    columns_a = ['TransactionAmt', 'id_02', 'D15']
    columns_b = ['card1', 'card4', 'addr1']

    for col_a in columns_a:
        for col_b in columns_b:
                df[f'{col_a}_to_mean_{col_b}'] = df[col_a] / df.groupby([col_b])[col_a].transform('mean')
                df[f'{col_a}_to_std_{col_b}'] = df[col_a] / df.groupby([col_b])[col_a].transform('std')
    
    #addr1_card1
    df['addr1_card1'] = df['addr1'].astype(str) + '_' + df['card1'].astype(str)
    
    #TransactionAmt_decimal
    df['TransactionAmt_decimal'] = ((df['TransactionAmt'] - df['TransactionAmt'].astype(int)) * 1000).astype(int)
    
    return df

In [131]:
df_train = new_features(df_train)
df_test = new_features(df_test)

#### upsampling df_train

In [134]:
from sklearn.utils import resample

not_fraud=df_train[df_train.isFraud==0]
fraud=df_train[df_train.isFraud==1]

# upsample minority
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
train_upsampled = pd.concat([not_fraud, fraud_upsampled])

# check new class counts
train_upsampled.isFraud.value_counts()

1    569877
0    569877
Name: isFraud, dtype: int64

In [135]:
train_upsampled.dtypes

isFraud                             int8
TransactionDT                      int32
TransactionAmt                   float16
ProductCD                       category
card1                              int16
card2                            float16
card3                            float16
card4                             object
card5                            float16
card6                             object
addr1                            float16
addr2                            float16
dist1                            float16
P_emaildomain                     object
R_emaildomain                     object
C1                               float16
C2                               float16
C3                               float16
C4                               float16
C5                               float16
C6                               float16
C7                               float16
C8                               float16
C9                               float16
C10             

In [62]:
train_upsampled.select_dtypes(include='object').columns

Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15',
       'id_16', 'id_28', 'id_29', 'id_31', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo', 'device_name'],
      dtype='object')

In [136]:
temp_cat = train_upsampled.select_dtypes(include='category').columns
temp_cat

Index(['ProductCD'], dtype='object')

In [137]:
# change the type of ['ProductCD', 'card4', 'card6']
train_upsampled.ProductCD = train_upsampled.ProductCD.astype('object')
# train_upsampled.card4 = train_upsampled.card4.astype('object')
# train_upsampled.card6 = train_upsampled.card6.astype('object')


In [138]:
%%time
from sklearn.preprocessing import LabelEncoder

for col in train_upsampled.columns:
    if train_upsampled[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(list(train_upsampled[col].astype(str).values) + list(df_test[col].astype(str).values))
        train_upsampled[col] = le.transform(list(train_upsampled[col].astype(str).values))
        df_test[col] = le.transform(list(df_test[col].astype(str).values))

CPU times: user 31.5 s, sys: 3.29 s, total: 34.8 s
Wall time: 36.3 s


In [145]:
#https://www.kaggle.com/yw6916/lgb-xgb-ensemble-stacking-based-on-fea-eng
y_train = train_upsampled['isFraud'].copy()
X_train = train_upsampled.drop('isFraud', axis=1)
X_test = df_test.copy()

In [154]:
X_test = X_test.drop(['D6','D13','D14'],axis = 1)

In [148]:
print(y_train.shape)
print(X_train.shape)
print(X_test.shape)

(1139754,)
(1139754, 382)
(506691, 385)


In [149]:
xtrains = list(X_train.columns)
xtest = list(X_test.columns)

In [150]:
for i in xtest:
    if i not in xtrains:
        print(i)

D6
D13
D14


In [146]:
from sklearn.model_selection import TimeSeriesSplit,KFold
n_fold = 4
folds = KFold(n_splits=n_fold,shuffle=True)

print(folds)

KFold(n_splits=4, random_state=None, shuffle=True)


In [142]:
from sklearn.model_selection import KFold
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [160]:
lgb_submission=sub.copy()
lgb_submission['isFraud'] = 0
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    print(fold_n)
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)
    
    lgbclf = lgb.LGBMClassifier(
        num_leaves= 500,
        n_estimators=500,
        max_depth=-1,
        learning_rate=0.064,
        subsample=0.85,
        colsample_bytree=0.85,
        boosting_type= "gbdt",
        reg_alpha=0.38,
        reg_lamdba=0.65
    )
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    lgbclf.fit(X_train_,y_train_)
    
    del X_train_,y_train_
    print('finish train')
    pred=lgbclf.predict_proba(X_test)[:,1]
    val=lgbclf.predict_proba(X_valid)[:,1]
    print('finish pred')
    del lgbclf, X_valid
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    del val,y_valid
    lgb_submission['isFraud'] = lgb_submission['isFraud']+pred/n_fold
    del pred
    gc.collect()

0
finish train
finish pred
ROC accuracy: 0.9999280472147053
1
finish train
finish pred
ROC accuracy: 0.999957849368564
2
finish train
finish pred
ROC accuracy: 0.9999472624814836
3
finish train
finish pred
ROC accuracy: 0.9999437683530386


In [158]:
lgb_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000728
1,3663550,0.000291
2,3663551,0.000635
3,3663552,0.000735
4,3663553,0.000919


In [161]:
lgb_submission.to_csv('submission.csv', index=False)