In [1]:
import pickle
import lightgbm as lgb

from functions.cleaning import cleaning_rf
from functions.preprocessor import preprocessor

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Import of clean data

In [2]:
df = cleaning_rf()

In [3]:
X = df.drop(columns=['EVENT_LABEL'])
y = df['EVENT_LABEL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

In [4]:
df.head()

Unnamed: 0,transaction_amt,transaction_adj_amt,historic_velocity,currency,cvv,signature_image,transaction_type,transaction_env,tranaction_initiate,inital_amount,EVENT_LABEL,day,month,browser,os,acc_age,d_last_logon
0,2167.0,56.0,2572.0,cad,D,F,U,X,O,13646.0,0,Tue,Dec,Opera,Windows,4,3
1,2045.0,48.0,4517.0,cad,X,X,H,W,J,11930.0,0,Tue,Jun,Opera,Linux,3,1
2,2892.0,61.0,5007.0,cad,X,Q,X,X,T,7412.0,0,Mon,May,Mozilla,Linux,7,5
3,3040.0,28.0,6022.0,usd,G,G,C,N,M,4623.0,0,Thu,Mar,Mozilla,Macintosh,3,2
4,2976.0,66.0,2600.0,cad,X,F,F,G,K,1905.0,0,Sat,Mar,Mozilla,Linux,4,2


## Preprocessor

In [None]:
preprocessor = preprocessor(X_train)

## Model

In [None]:
model_lgb = lgb.LGBMClassifier(n_jobs=-1, random_state=42)

# LightGBM

## Undersampling

### Under sampler

In [5]:
under_sampler = RandomUnderSampler()

### Pipeline

In [7]:
u_lgb = Pipeline([('under_sampler', under_sampler),
                  ('preprocessor', preprocessor),
                  ('LGB', model_lgb)])

In [8]:
results = cross_validate(u_lgb, X_train, y_train, return_train_score = True, scoring='recall_macro')
print(f'Average Train Score: {results['train_score'].mean()}')
print(f'Average Test Score: {results['test_score'].mean()}')

[LightGBM] [Info] Number of positive: 4536, number of negative: 4536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1106
[LightGBM] [Info] Number of data points in the train set: 9072, number of used features: 130
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 4537, number of negative: 4537
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000543 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1101
[LightGBM] [Info] Number of data points in the train set: 9074, number of used features: 129
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 4537, number of negative: 4537
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000597 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1104
[LightGBM] [Info] Number of data points in the train set: 9074, number of used features: 130
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 4537, number of negative: 4537
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1104
[LightGBM] [Info] Number of data points in the train set: 9074, number of used features: 130
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 4537, number of negative: 4537
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000589 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1108
[LightGBM] [Info] Number of data points in the train set: 9074, number of used features: 131
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




Average Train Score: 0.9179560861243587
Average Test Score: 0.898260555073319


In [9]:
u_grid = {
    'LGB__learning_rate': [0.005, 0.01],
    'LGB__n_estimators': [8,16,24],
    'LGB__num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
    'LGB__objective' : ['binary'],
    'LGB__max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'LGB__random_state' : [500],
    'LGB__colsample_bytree' : [0.64, 0.65, 0.66],
    'LGB__subsample' : [0.7,0.75],
    'LGB__verbose': [-1]
    }

In [10]:
u_grid_search = GridSearchCV(estimator=u_lgb, param_grid=u_grid, cv=5, scoring='recall_macro', return_train_score=True)
u_grid_search.fit(X_train, y_train)
print(f'Average Train Score: {u_grid_search.cv_results_['mean_train_score'].mean()}')
print(f'Average Test Score: {u_grid_search.cv_results_['mean_test_score'].mean()}')



Average Train Score: 0.8589954162307742
Average Test Score: 0.8572559210360366


## Oversampling

### Over sampler

In [11]:
under_sampler = RandomOverSampler()

### Pipeline

In [13]:
o_lgb = Pipeline([('under_sampler', under_sampler),
                  ('preprocessor', preprocessor),
                  ('LGB', model_lgb)])

In [14]:
results = cross_validate(o_lgb, X_train, y_train, return_train_score = True, scoring='recall_macro')
print(f'Average Train Score: {results['train_score'].mean()}')
print(f'Average Test Score: {results['test_score'].mean()}')

[LightGBM] [Info] Number of positive: 78922, number of negative: 78922
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003808 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1162
[LightGBM] [Info] Number of data points in the train set: 157844, number of used features: 152
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 78921, number of negative: 78921
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003099 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1164
[LightGBM] [Info] Number of data points in the train set: 157842, number of used features: 154
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 78921, number of negative: 78921
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003700 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1166
[LightGBM] [Info] Number of data points in the train set: 157842, number of used features: 154
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore

In [17]:

o_grid = {
    'LGB__learning_rate': [0.005, 0.01],
    'LGB__n_estimators': [8,16,24],
    'LGB__num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
    'LGB__objective' : ['binary'],
    'LGB__max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'LGB__random_state' : [500],
    'LGB__subsample' : [0.7,0.75],
    'LGB__verbose':[-1]
    }

In [18]:
o_grid_search = GridSearchCV(estimator=u_lgb, param_grid=o_grid, cv=5, scoring='recall_macro', return_train_score=True)
o_grid_search.fit(X_train, y_train)
print(f'Average Train Score: {o_grid_search.cv_results_['mean_train_score'].mean()}')
print(f'Average Test Score: {o_grid_search.cv_results_['mean_test_score'].mean()}')



Average Train Score: 0.8488078924558358
Average Test Score: 0.8467472699393911


# Export

In [19]:
with open('models/under_lgb.pkl', 'wb') as f:
    pickle.dump(u_grid_search, f)
f.close()

In [20]:
with open('models/over_lgb.pkl', 'wb') as f:
    pickle.dump(o_grid_search, f)
f.close()