# evaluate performance of LightGBM using selected features

## import libraries

In [5]:
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## define utility function to reduce memory usage

In [6]:
def reduce_mem_usage(df, verbose=True):
    """
    Reduce dataframe size

    params:
    - df: dataframe to reduce the size of

    return:
    - dataframe of reduced size
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'float128']
    start_mem = df.memory_usage().sum() / 1024**2    

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float64).min and c_max < np.finfo(np.float64).max:
                    df[col] = df[col].astype(np.float64)
                elif c_min > np.finfo(np.float128).min and c_max < np.finfo(np.float128).max:
                    df[col] = df[col].astype(np.float128)
                    
    end_mem = df.memory_usage().sum() / 1024**2

    if verbose: 
        print(
            'Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem
        ))

    return df

## list down useless features (known from feature selection)

In [7]:
useless_features = [
    'TransactionID',  # not really a feature
    'dist2',  # transaction features
    'C3',  # C features
    'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14',  # D features
    'M1',  # M features
    'id_07', 'id_08', 'id_18', 'id_21', 'id_22', 'id_23',  # id features
    'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_35',  # id features
    'V6', 'V8', 'V9', 'V10', 'V11', 'V14', 'V15', 'V16',  # V features
    'V18', 'V21', 'V22', 'V27', 'V28', 'V31', 'V32',  # V features
    'V41', 'V42', 'V46', 'V50', 'V51', 'V59', 'V65',  # V features
    'V68', 'V71', 'V72', 'V79', 'V80', 'V84', 'V85',  # V features
    'V88', 'V89', 'V92', 'V93', 'V95', 'V98', 'V101',  # V features
    'V104', 'V106', 'V107', 'V108', 'V109', 'V110',  # V features
    'V111', 'V112', 'V113', 'V114', 'V116', 'V117',  # V features
    'V118', 'V119', 'V120', 'V121', 'V122', 'V123',  # V features 
    'V125', 'V138', 'V141', 'V142', 'V144', 'V146',  # V features 
    'V147', 'V148', 'V151', 'V153', 'V154', 'V155',  # V features 
    'V157', 'V158', 'V159', 'V161', 'V163', 'V164',  # V features 
    'V166', 'V172', 'V173', 'V174', 'V175', 'V176',  # V features 
    'V177', 'V178', 'V179', 'V180', 'V181', 'V182',  # V features  
    'V183', 'V184', 'V185', 'V186', 'V190', 'V191',  # V features  
    'V192', 'V193', 'V194', 'V195', 'V196', 'V197',  # V features  
    'V198', 'V199', 'V214', 'V216', 'V220', 'V225',  # V features 
    'V226', 'V227', 'V230', 'V233', 'V235', 'V236',  # V features  
    'V237', 'V238', 'V239', 'V240', 'V241', 'V242',  # V features 
    'V244', 'V246', 'V247', 'V248', 'V249', 'V250',  # V features 
    'V252', 'V254', 'V255', 'V269', 'V276', 'V297',  # V features 
    'V300', 'V302', 'V304', 'V305', 'V325', 'V327',  # V features  
    'V328', 'V329', 'V330', 'V334', 'V335', 'V336',  # V features 
    'V337', 'V338', 'V339',  # V features 
]


## load training data

In [4]:
transaction_dataframe = pd.read_csv('data/train_transaction.csv')
transaction_dataframe = reduce_mem_usage(transaction_dataframe)
transaction_dataframe.head()

Mem. usage decreased to 542.35 Mb (69.4% reduction)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
identity_dataframe = pd.read_csv('data/train_identity.csv')
identity_dataframe = reduce_mem_usage(identity_dataframe)
identity_dataframe.head()

Mem. usage decreased to 25.86 Mb (42.7% reduction)


Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [6]:
dataframe = transaction_dataframe.merge(identity_dataframe, how='outer')
dataframe.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## drop useless features

In [7]:
dataframe = dataframe.drop(useless_features, axis=1)
dataframe.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## preprocessing - disregard OS versions

In [8]:
for index, operating_system in dataframe.id_30.iteritems():
    if not isinstance(operating_system, str):
        operating_system = 'other'

    elif 'Android' in operating_system:
        operating_system = 'Android'
    
    elif 'iOS' in operating_system:
        operating_system = 'iOS'

    elif 'Mac OS X' in operating_system:
        operating_system = 'Mac OS X'

    elif 'Windows' in operating_system:
        operating_system = 'Windows'

    else:
        operating_system = 'other'

    dataframe.at[index, 'id_30'] = operating_system

print(dataframe.id_30.unique())

['other' 'Android' 'iOS' 'Mac OS X' 'Windows']


## preprocessing - disregard browser versions

In [9]:
browser_list = [
    'aol',
    'chrome',
    'chromium',
    'comodo',
    'cyberfox',
    'edge',
    'firefox',
    'icedragon',
    'ie',
    'iron',
    'maxthon',
    'opera',
    'palemoon',
    'puffin',
    'safari',
    'samsung',
    'seamonkey',
    'silk',
    'waterfox',
]

for index, browser in dataframe.id_31.iteritems():
    new_browser = 'other'

    if isinstance(browser, str):
        for known_browser in browser_list:
            if known_browser in browser:
                new_browser = known_browser
                break

    dataframe.at[index, 'id_31'] = new_browser

print(dataframe.id_31.unique())

['other' 'samsung' 'safari' 'chrome' 'edge' 'firefox' 'ie' 'opera' 'aol'
 'silk' 'waterfox' 'puffin' 'cyberfox' 'palemoon' 'maxthon' 'iron'
 'seamonkey' 'comodo' 'chromium' 'icedragon']


## do rest of preprocessing

- convert object columns to string columns
- imputation (for numbers, fill with interquartile mean)
- do label encoding for non-numeric values
- reduce memory usage again

In [10]:
le = LabelEncoder()

for column in dataframe.columns:
    if dataframe[column].dtype == 'object':
        dataframe[column]= dataframe[column].astype(str)
        dataframe[column] = le.fit_transform(dataframe[column])
    else:
        dataframe[column] = dataframe[column].fillna(dataframe[column].quantile().mean())

dataframe = reduce_mem_usage(dataframe)

Mem. usage decreased to 357.35 Mb (22.1% reduction)


## separate data into features and target variable

In [11]:
features_dataframe = dataframe.drop('isFraud', axis=1)
is_fraud_data = dataframe['isFraud']

## split into training and validation sets

In [12]:
train_features, val_features, train_target, val_target = train_test_split(
    features_dataframe, 
    is_fraud_data, 
    test_size=0.2,
)

## train LightGBM

- using (almost) default parameters

In [13]:
%%time

lgbm_classifier = LGBMClassifier(n_jobs=2)
lgbm = lgbm_classifier.fit(train_features, train_target, eval_metric='auc')

CPU times: user 1min 12s, sys: 314 ms, total: 1min 12s
Wall time: 37.4 s


## evaluate model

In [14]:
prediction = lgbm.predict(val_features)
roc_auc_score(val_target, prediction)

0.7219467870786597

## try following [reference notebook](https://www.kaggle.com/nroman/lgb-single-model-lb-0-9419)

In [15]:
params = {
    'num_leaves': 491,
    'min_child_weight': 0.03454472573214212,
    'feature_fraction': 0.3797454081646243,
    'bagging_fraction': 0.4181193142567742,
    'min_data_in_leaf': 106,
    'objective': 'binary',
    'max_depth': -1,
    'learning_rate': 0.006883242363721497,
    'boosting_type': 'gbdt',
    'bagging_seed': 11,
    'metric': 'auc',
    'verbosity': -1,
    'reg_alpha': 0.3899927210061127,
    'reg_lambda': 0.6485237330340494,
    'random_state': 47,
}

In [16]:
%%time

train_data = lgb.Dataset(train_features, train_target)
val_data = lgb.Dataset(val_features, val_target)

lgbm = lgb.train(
    params, 
    train_data, 
    10000, 
    valid_sets = [train_data, val_data],
    verbose_eval=1000,
    early_stopping_rounds=500,
)

prediction = lgbm.predict(val_features)
roc_auc_score(val_target, prediction)

Training until validation scores don't improve for 500 rounds
[1000]	training's auc: 0.993359	valid_1's auc: 0.96634
[2000]	training's auc: 0.999168	valid_1's auc: 0.972949
[3000]	training's auc: 0.999886	valid_1's auc: 0.974516
[4000]	training's auc: 0.99999	valid_1's auc: 0.97489
Early stopping, best iteration is:
[4310]	training's auc: 0.999996	valid_1's auc: 0.974925
CPU times: user 1h 13min 29s, sys: 6.58 s, total: 1h 13min 36s
Wall time: 36min 52s


0.9749247375395002

## save model

In [18]:
joblib.dump(lgbm, 'models/lgbm.joblib')

['models/lgbm.joblib']

## load test data

In [25]:
transaction_dataframe = pd.read_csv('data/test_transaction.csv')
transaction_dataframe = reduce_mem_usage(transaction_dataframe)
transaction_dataframe.head()

Mem. usage decreased to 472.59 Mb (68.9% reduction)


Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3663549,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


In [15]:
identity_dataframe = pd.read_csv('data/test_identity.csv')
identity_dataframe = reduce_mem_usage(identity_dataframe)

identity_dataframe = identity_dataframe.rename(
    columns={
        column: column.replace('-', '_')
        for column in identity_dataframe.columns
    }
)

identity_dataframe.head()

Mem. usage decreased to 25.44 Mb (42.7% reduction)


Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663586,-45.0,280290.0,,,0.0,0.0,,,,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
1,3663588,0.0,3579.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 67.0 for android,24.0,1280x720,match_status:2,T,F,T,T,mobile,LGLS676 Build/MXB48T
2,3663597,-5.0,185210.0,,,1.0,0.0,,,,...,ie 11.0 for tablet,,,,F,T,T,F,desktop,Trident/7.0
3,3663601,-45.0,252944.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
4,3663602,-95.0,328680.0,,,7.0,-33.0,,,,...,chrome 67.0 for android,,,,F,F,T,F,mobile,SM-G9650 Build/R16NW


In [16]:
dataframe = transaction_dataframe.merge(identity_dataframe, how='outer')
dataframe.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663549,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


## drop useless features

In [17]:
dataframe = dataframe.drop(useless_features, axis=1)
dataframe.head()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_30,id_31,id_32,id_33,id_34,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,


## preprocessing - disregard OS versions

In [18]:
for index, operating_system in dataframe.id_30.iteritems():
    if not isinstance(operating_system, str):
        operating_system = 'other'

    elif 'Android' in operating_system:
        operating_system = 'Android'
    
    elif 'iOS' in operating_system:
        operating_system = 'iOS'

    elif 'Mac OS X' in operating_system:
        operating_system = 'Mac OS X'

    elif 'Windows' in operating_system:
        operating_system = 'Windows'

    else:
        operating_system = 'other'

    dataframe.at[index, 'id_30'] = operating_system

print(dataframe.id_30.unique())

['other' 'Android' 'iOS' 'Windows' 'Mac OS X']


## preprocessing - disregard browser versions

In [19]:
browser_list = [
    'aol',
    'chrome',
    'chromium',
    'comodo',
    'cyberfox',
    'edge',
    'firefox',
    'icedragon',
    'ie',
    'iron',
    'maxthon',
    'opera',
    'palemoon',
    'puffin',
    'safari',
    'samsung',
    'seamonkey',
    'silk',
    'waterfox',
]

for index, browser in dataframe.id_31.iteritems():
    new_browser = 'other'

    if isinstance(browser, str):
        for known_browser in browser_list:
            if known_browser in browser:
                new_browser = known_browser
                break

    dataframe.at[index, 'id_31'] = new_browser

print(dataframe.id_31.unique())

['other' 'chrome' 'ie' 'safari' 'edge' 'firefox' 'samsung' 'opera'
 'palemoon']


## do rest of preprocessing

- convert object columns to string columns
- imputation (for numbers, fill with interquartile mean)
- do label encoding for non-numeric values
- reduce memory usage again

In [20]:
le = LabelEncoder()

for column in dataframe.columns:
    if dataframe[column].dtype == 'object':
        dataframe[column]= dataframe[column].astype(str)
        dataframe[column] = le.fit_transform(dataframe[column])
    else:
        dataframe[column] = dataframe[column].fillna(dataframe[column].quantile().mean())

dataframe = reduce_mem_usage(dataframe)

Mem. usage decreased to 315.73 Mb (21.6% reduction)


## load model

- if model is no longer in memory (e.g. due to restarting of notebook)

In [21]:
lgbm = joblib.load('models/lgbm.joblib')

## do inference on test data

In [22]:
predictions = lgbm.predict(dataframe)

## create output dataframe

In [37]:
output_dataframe = pd.DataFrame({
    'TransactionID': transaction_dataframe.TransactionID,
    'isFraud': pd.Series(predictions),
})

output_dataframe.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.00044
1,3663550,0.000164
2,3663551,0.000798
3,3663552,0.000867
4,3663553,0.000134


## save output as csv

In [38]:
output_dataframe.to_csv('output/lgbm_predictions.csv', index=False)

## Kaggle result

- score: 0.940812
- private leaderboard rank: 18
- private leaderboard ranking percentile: 99.717912552