# Load raw data and train LGBM model

* Load the original / raw Ember feature dataset.

* Split the data for multi-fold cross-validation.

* Train an LGBM model on each fold and predict label probabilities for its validation data.

In [1]:
# coding: utf-8

import os
import sys

# The parent of the current working directory is put at the front of the
# module search path so that imports in later cells are resolved there first.
ROOT_PATH = os.path.split(os.getcwd())[0]
sys.path.insert(0, ROOT_PATH)

In [2]:
import numpy as np
from numpy.random import random,randint

from inspect import currentframe, getframeinfo
from utils import pickle_store, pickle_load, debug_print_tensor

import ember

from ijcai import EMBER_LGBM

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

import timeit
from tqdm import tqdm



In [3]:
# configuration parameters

# --- data size -------------------------------------------------------------
# Number of training samples to keep; a falsy value keeps every sample.
N_TRAINING_SAMPLES_TOTAL = 3000
# Number of raw features to keep; a falsy value keeps every feature.
N_RAW_FEATURES = None

# --- model -----------------------------------------------------------------
# MODULE_NAME is suffixed with '_LGBM' when the model wrapper is created.
MODULE_NAME = "EMBER"
CONFIG_FILE = "prepare_data.toml"

# --- storage ---------------------------------------------------------------
# DATA_DIR receives the pickled folds and results; EMBER_DIR holds the
# vectorized X_*.dat / y_*.dat files.
DATA_DIR = '/media/data/ijcai_2021/'
EMBER_DIR = '/media/data/ember/ember_2018/'

# Forwarded to pickle_store / pickle_load.
verbose = True

## Step #1: load the raw data

We first load the original data in the EMBER feature format.

In [4]:
feature_version = 2
# None loads both splits; "train" or "test" loads only that split.
subset = None


def _load_ember_split(split, n_features):
    """Load one vectorized EMBER split and drop the samples labeled -1.

    Parameters
    ----------
    split : str
        Split name used in the file names, i.e. "train" or "test"
        (reads ``X_<split>.dat`` and ``y_<split>.dat`` from ``EMBER_DIR``).
    n_features : int
        Number of float32 columns per sample in ``X_<split>.dat``.

    Returns
    -------
    (X, y, n_raw) : tuple
        ``X`` and ``y`` restricted to samples whose label is not -1, and the
        number of samples in the file before that filtering.
    """
    X_path = os.path.join(EMBER_DIR, "X_{}.dat".format(split))
    y_path = os.path.join(EMBER_DIR, "y_{}.dat".format(split))

    # The label file fixes the sample count; the feature file is then mapped
    # as an (n_raw, n_features) matrix.
    y_raw = np.memmap(y_path, dtype=np.float32, mode="r")
    n_raw = y_raw.shape[0]
    X_raw = np.memmap(X_path, dtype=np.float32, mode="r", shape=(n_raw, n_features))

    # Keep only samples with a label other than -1
    # (presumably EMBER's marker for unlabeled samples -- confirm against the EMBER docs).
    labeled = (y_raw != -1)
    X = X_raw[labeled, :]
    y = y_raw[labeled]
    assert X.shape == (y.shape[0], n_features)
    return X, y, n_raw


extractor = ember.PEFeatureExtractor(feature_version)
n_features = extractor.dim

# Everything stays None for a split that is not requested via `subset`.
X_train = y_train = n_train = n_train_y = None
X_test = y_test = n_test = n_test_y = None

if subset is None or subset == "train":
    X_train, y_train, n_train_y = _load_ember_split("train", n_features)
    n_train = X_train.shape[0]

if subset is None or subset == "test":
    X_test, y_test, n_test_y = _load_ember_split("test", n_features)
    n_test = X_test.shape[0]

# Report only the splits that were actually loaded, so the cell also runs
# when `subset` restricts loading to a single split.
if X_train is not None:
    print("X_train (type {:20}): {} samples, {} features".format(str(type(X_train)),X_train.shape[0],X_train.shape[1]))
    print("y_train (type {:20}): {} samples".format(str(type(y_train)),y_train.shape[0]))
if X_test is not None:
    print("X_test  (type {:20}): {} samples, {} features".format(str(type(X_test)),X_test.shape[0],X_test.shape[1]))
    # Report the type of y_test itself (the original printed type(X_test) here).
    print("y_test  (type {:20}): {} samples".format(str(type(y_test)),y_test.shape[0]))

print("# of features        : {}".format(n_features))
if n_train is not None:
    print("# of training samples: {}, down from {}".format(n_train,n_train_y))
if n_test is not None:
    print("# of test samples    : {}, down from {}".format(n_test,n_test_y))


X_train (type <class 'numpy.ndarray'>): 600000 samples, 2381 features
y_train (type <class 'numpy.ndarray'>): 600000 samples
X_test  (type <class 'numpy.ndarray'>): 200000 samples, 2381 features
y_test  (type <class 'numpy.ndarray'>): 200000 samples
# of features        : 2381
# of training samples: 600000, down from 800000
# of test samples    : 200000, down from 200000


## Step #2: trim the raw data

Downsize the original data.

In [5]:
# Downsize the training-validation data: optionally keep only a random subset
# of the samples and/or of the features.
# A falsy configuration value means "keep everything".
n_train_samples  = N_TRAINING_SAMPLES_TOTAL if N_TRAINING_SAMPLES_TOTAL else n_train
n_train_features = N_RAW_FEATURES if N_RAW_FEATURES else n_features

# NOTE(review): the random draws below are unseeded, so the selected subset
# differs between runs.

# Pick samples randomly (without replacement) unless all of them are kept.
if n_train_samples == n_train:
    idx_samples = np.arange(n_train)
else:
    idx_samples = np.random.choice(n_train, n_train_samples, replace=False)

# Pick features randomly (without replacement) unless all of them are kept.
# The decision must depend on the *feature* count: testing the sample count
# here (as the original did) randomly permuted the feature columns whenever
# the samples were subsampled but all features were requested.
if n_train_features == n_features:
    idx_features = np.arange(n_features)
else:
    idx_features = np.random.choice(n_features, n_train_features, replace=False)

X_ = X_train[idx_samples,:][:,idx_features]
y_ = y_train[idx_samples]
print("X_.shape = {}".format(X_.shape))
print("y_.shape = {}".format(y_.shape))

X_.shape = (3000, 2381)
y_.shape = (3000,)


## Step #3: split the data for cross validation

Split the downsized data into stratified folds for multi-fold cross-validation.

In [6]:
# Number of cross-validation folds.
n_splits = 5

# Generator yielding one (train indices, validation indices) pair per fold;
# the indices refer to rows of X_ / y_.
skf = StratifiedKFold(n_splits=n_splits).split(X_,y_)

# Consume the generator once and regroup the pairs into two parallel lists,
# so that entry k of each list belongs to fold k.
train_parts, validation_parts = zip(*skf)
split_index = {'train': list(train_parts), 'validation': list(validation_parts)}

In [7]:
# Materialize the per-fold arrays.
# data[part][field][k] is the array for fold k, with part in
# {'train', 'validation'} and field in {'X', 'y'}.
data = {'train':{'X':[],'y':[]},
        'validation' :{'X':[],'y':[]}}

# Fold-local names are used on purpose: rebinding X_train / y_train here would
# overwrite the full arrays loaded in Step #1 and break a re-run of Step #2.
for fold_train_idx, fold_validation_idx in zip(split_index['train'], split_index['validation']):
    data['train']['X'].append(X_[fold_train_idx,:])
    data['train']['y'].append(y_[fold_train_idx])
    data['validation']['X'].append(X_[fold_validation_idx,:])
    data['validation']['y'].append(y_[fold_validation_idx])

# Summarize the sizes using the last fold, addressed explicitly instead of
# through a loop index left over from the loop above.
N_train = data['train']['X'][-1].shape[0]
N_test  = data['validation']['X'][-1].shape[0]
N       = N_train + N_test
K       = data['train']['X'][-1].shape[1]
print("converted data:")
print("    N_train = {}".format(N_train))
print("    N_test  = {}".format(N_test))
print("    N       = {}".format(N))
print("    K       = {}".format(K))

# Store the folds; `prefix` is reused by the next step to load them again.
prefix = 'raw_data_N={}_K={}'.format(N,K)
if pickle_store(data,directory=DATA_DIR,prefix=prefix,verbose=verbose):
    print('successfully stored data.')

converted data:
    N_train = 2400
    N_test  = 600
    N       = 3000
    K       = 2381
save pickle file to /media/data/ijcai_2021//raw_data_N=3000_K=2381_2021-01-18_22:48:46.pkl
successfully stored data.


## Step #4: train LGBM and store booster models

Train one LGBM model ("booster") per cross-validation fold and store the boosters together with the label probabilities predicted for each validation set.

In [8]:
# Read the fold dictionary back from DATA_DIR using the prefix it was stored under.
# NOTE(review): how pickle_load chooses among several files sharing this
# prefix is not visible here -- confirm in utils.pickle_load.
data = pickle_load(
    directory=DATA_DIR,
    prefix=prefix,
    verbose=verbose,
)

data_dir = /media/data/ijcai_2021/
Load pickle file from /media/data/ijcai_2021/raw_data_N=3000_K=2381_2021-01-18_22:48:46.pkl


In [9]:
# Per-fold outputs; entry k of every list belongs to cross-validation fold k.
#   'booster'   : trained booster of the fold's model
#   'y_prob'    : predicted label probabilities for the fold's validation data
#   'y_test'    : validation labels as two columns (1 - y, y)
#   'time(sec)' : wall-clock seconds spent on model creation, fit and prediction
results = {'booster':[],'y_prob':[],'y_test':[],'time(sec)':[]}

# Learn one model per fold and derive the label probabilities for its
# validation data. Fold-local names are used so that the arrays loaded in
# Step #1 (X_train, y_train, X_test, y_test) are not overwritten.
for i in tqdm(range(n_splits)):
    # load data
    X_fold_train      = data['train']['X'][i]
    y_fold_train      = data['train']['y'][i]
    X_fold_validation = data['validation']['X'][i]
    y_fold_validation = data['validation']['y'][i]

    # time only model creation, training and prediction
    starttime = timeit.default_timer()
    model  = EMBER_LGBM(module_name=MODULE_NAME+'_LGBM', config_file=CONFIG_FILE)
    model  = model.fit(X_fold_train,y_fold_train)
    y_prob = model.predict_proba(X_fold_validation)
    elapsed = timeit.default_timer() - starttime

    # Reshape the labels to (n_validation, 2), i.e. one column per category:
    # column 0 is (1 - y), column 1 is y.
    y_column = y_fold_validation.reshape([y_fold_validation.shape[0],1])
    y_two_columns = np.concatenate(((1.0-y_column),y_column),axis=1)

    # store the results
    results['booster'].append(model.booster)
    results['y_prob'].append(y_prob)
    results['y_test'].append(y_two_columns)
    results['time(sec)'].append(elapsed)


# Parameters used to name the result file.
# Train + validation size is the same for every fold, so fold 0 is used.
N_total      = data['train']['X'][0].shape[0] + data['validation']['X'][0].shape[0]
# The model parameters are read from the model of the last fold.
num_leaves   = model.params['num_leaves']['args']
n_estimators = model.params['n_estimators']['args']

# store the trace file
if pickle_store(results,directory=DATA_DIR,module_name='ijcai_2021',prefix='LGBM_N={}_E={}_L={}'.format(N_total,n_estimators,num_leaves),verbose=verbose):
    print('experiment results stored successfully')



{'ember_datadir': '/media/data/ember/ember_2018/', 'featureversion': 2, 'force_create_vec': False, 'optimize': False, 'modelpath': 'model.txt', 'evals_result': 'None', 'boosting': 'gbdt', 'objective': 'binary', 'learning_rate': 0.05, 'num_threads': 0, 'seed': 'None', 'num_leaves': 30, 'max_depth': -1, 'n_estimators': 10, 'min_data_in_leaf': 20, 'feature_fraction': 0.5}
[LightGBM] [Info] Number of positive: 1219, number of negative: 1181
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 169452
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 1976
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.507917 -> initscore=0.031669
[LightGBM] [Info] Start training from score 0.031669


 20%|██        | 1/5 [00:00<00:03,  1.06it/s]

{'ember_datadir': '/media/data/ember/ember_2018/', 'featureversion': 2, 'force_create_vec': False, 'optimize': False, 'modelpath': 'model.txt', 'evals_result': 'None', 'boosting': 'gbdt', 'objective': 'binary', 'learning_rate': 0.05, 'num_threads': 0, 'seed': 'None', 'num_leaves': 30, 'max_depth': -1, 'n_estimators': 10, 'min_data_in_leaf': 20, 'feature_fraction': 0.5}
[LightGBM] [Info] Number of positive: 1219, number of negative: 1181
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 169545
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 1984
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.507917 -> initscore=0.031669
[LightGBM] [Info] Start training from score 0.031669


 40%|████      | 2/5 [00:01<00:02,  1.32it/s]

{'ember_datadir': '/media/data/ember/ember_2018/', 'featureversion': 2, 'force_create_vec': False, 'optimize': False, 'modelpath': 'model.txt', 'evals_result': 'None', 'boosting': 'gbdt', 'objective': 'binary', 'learning_rate': 0.05, 'num_threads': 0, 'seed': 'None', 'num_leaves': 30, 'max_depth': -1, 'n_estimators': 10, 'min_data_in_leaf': 20, 'feature_fraction': 0.5}
[LightGBM] [Info] Number of positive: 1219, number of negative: 1181
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 169876
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 1986
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.507917 -> initscore=0.031669
[LightGBM] [Info] Start training from score 0.031669


 60%|██████    | 3/5 [00:02<00:01,  1.44it/s]

{'ember_datadir': '/media/data/ember/ember_2018/', 'featureversion': 2, 'force_create_vec': False, 'optimize': False, 'modelpath': 'model.txt', 'evals_result': 'None', 'boosting': 'gbdt', 'objective': 'binary', 'learning_rate': 0.05, 'num_threads': 0, 'seed': 'None', 'num_leaves': 30, 'max_depth': -1, 'n_estimators': 10, 'min_data_in_leaf': 20, 'feature_fraction': 0.5}
[LightGBM] [Info] Number of positive: 1219, number of negative: 1181
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 169937
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 1990
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.507917 -> initscore=0.031669
[LightGBM] [Info] Start training from score 0.031669


 80%|████████  | 4/5 [00:02<00:00,  1.55it/s]

{'ember_datadir': '/media/data/ember/ember_2018/', 'featureversion': 2, 'force_create_vec': False, 'optimize': False, 'modelpath': 'model.txt', 'evals_result': 'None', 'boosting': 'gbdt', 'objective': 'binary', 'learning_rate': 0.05, 'num_threads': 0, 'seed': 'None', 'num_leaves': 30, 'max_depth': -1, 'n_estimators': 10, 'min_data_in_leaf': 20, 'feature_fraction': 0.5}
[LightGBM] [Info] Number of positive: 1220, number of negative: 1180
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 169922
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 1996
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.508333 -> initscore=0.033336
[LightGBM] [Info] Start training from score 0.033336


100%|██████████| 5/5 [00:03<00:00,  1.51it/s]

save pickle file to /media/data/ijcai_2021//LGBM_N=3000_E=10_L=30_2021-01-18_22:48:49.pkl
experiment results stored successfully



