In [1]:
import pickle
import os
import xgboost as xgb

from functions.cleaning import cleaning_rf

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Import of clean data

In [2]:
df = cleaning_rf()

In [3]:
X = df.drop(columns=["EVENT_LABEL"])
y = df['EVENT_LABEL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3 ,stratify=y, random_state=0)

In [4]:
df.head()

Unnamed: 0,transaction_amt,transaction_adj_amt,historic_velocity,currency,cvv,signature_image,transaction_type,transaction_env,tranaction_initiate,inital_amount,EVENT_LABEL,day,month,browser,os,acc_age,d_last_logon
0,2167.0,56.0,2572.0,cad,D,F,U,X,O,13646.0,0,Tue,Dec,Opera,Windows,4,3
1,2045.0,48.0,4517.0,cad,X,X,H,W,J,11930.0,0,Tue,Jun,Opera,Linux,3,1
2,2892.0,61.0,5007.0,cad,X,Q,X,X,T,7412.0,0,Mon,May,Mozilla,Linux,7,5
3,3040.0,28.0,6022.0,usd,G,G,C,N,M,4623.0,0,Thu,Mar,Mozilla,Macintosh,3,2
4,2976.0,66.0,2600.0,cad,X,F,F,G,K,1905.0,0,Sat,Mar,Mozilla,Linux,4,2


# XGBoost

## Undersampling

### Preprocess

In [5]:
cat_labels_u_xgb = X_train.select_dtypes('object').columns
num_labels_u_xgb = X_train.select_dtypes('number').columns

under_sampler_u_xgb = RandomUnderSampler()
cat_preprocessor_u_xgb = OneHotEncoder(drop='first', handle_unknown='ignore')
num_preprocessor_u_xgb = StandardScaler()

preprocessor_u_xgb = ColumnTransformer([('cat', cat_preprocessor_u_xgb, cat_labels_u_xgb),
                                        ('num', num_preprocessor_u_xgb, num_labels_u_xgb)])

### Model

In [6]:
model_u_xgb = xgb.XGBClassifier(n_jobs=-1, random_state = 42)

### Pipeline

In [7]:
u_xgb = Pipeline([('under_sampler', under_sampler_u_xgb),
                  ('preprocessor', preprocessor_u_xgb),
                  ('XGB', model_u_xgb)])

In [8]:
results = cross_validate(u_xgb, X_train, y_train, return_train_score = True, scoring='recall_macro')
print(f'Average Train Score: {results['train_score'].mean()}')
print(f'Average Test Score: {results['test_score'].mean()}')



Average Train Score: 0.9314369922985406
Average Test Score: 0.8940328155848579


In [9]:
u_grid = {
        'XGB__min_child_weight': [1, 5, 10],
        'XGB__gamma': [0.5, 1, 1.5, 2, 5],
        'XGB__subsample': [0.6, 0.8, 1.0],
        'XGB__colsample_bytree': [0.6, 0.8, 1.0],
        'XGB__max_depth': [3, 4, 5]
        }

In [10]:
u_grid_search = GridSearchCV(estimator=u_xgb, param_grid=u_grid, cv=5, scoring='recall_macro', return_train_score=True)
u_grid_search.fit(X_train, y_train)
print(f'Average Train Score: {u_grid_search.cv_results_['mean_train_score'].mean()}')
print(f'Average Test Score: {u_grid_search.cv_results_['mean_test_score'].mean()}')



Average Train Score: 0.9056274453513057
Average Test Score: 0.8943987596991461


## Oversampling

### Preprocess

In [11]:
cat_labels_o_xgb = X_train.select_dtypes('object').columns
num_labels_o_xgb = X_train.select_dtypes('number').columns

under_sampler_o_xgb = RandomUnderSampler()
cat_preprocessor_o_xgb = OneHotEncoder(drop='first', handle_unknown='ignore')
num_preprocessor_o_xgb = StandardScaler()

preprocessor_o_xgb = ColumnTransformer([('cat', cat_preprocessor_o_xgb, cat_labels_o_xgb),
                                        ('num', num_preprocessor_o_xgb, num_labels_o_xgb)])

### Model

In [12]:
model_o_xgb = xgb.XGBClassifier(n_jobs=-1, random_state = 42)

### Pipeline

In [13]:
o_xgb = Pipeline([('under_sampler', under_sampler_o_xgb),
                  ('preprocessor', preprocessor_o_xgb),
                  ('XGB', model_o_xgb)])

In [14]:
results = cross_validate(o_xgb, X_train, y_train, return_train_score = True, scoring='recall_macro')
print(f'Average Train Score: {results['train_score'].mean()}')
print(f'Average Test Score: {results['test_score'].mean()}')



Average Train Score: 0.9309090274167489
Average Test Score: 0.8961932176889432


In [15]:
o_grid = u_grid = {
    'XGB__min_child_weight': [1, 5, 10],
    'XGB__gamma': [0.5, 1, 1.5, 2, 5],
    'XGB__subsample': [0.6, 0.8, 1.0],        
    'XGB__colsample_bytree': [0.6, 0.8, 1.0],
    'XGB__max_depth': [3, 4, 5]
        }

In [16]:
o_grid_search = GridSearchCV(estimator=o_xgb, param_grid=o_grid, cv=5, scoring='recall_macro', return_train_score=True)
o_grid_search.fit(X_train, y_train)
print(f'Average Train Score: {o_grid_search.cv_results_['mean_train_score'].mean()}')
print(f'Average Test Score: {o_grid_search.cv_results_['mean_test_score'].mean()}')



Average Train Score: 0.9056018000284207
Average Test Score: 0.8943313689067824


# Export

In [17]:
with open('models/under_xgb.pkl', 'wb') as f:
    pickle.dump(u_grid_search, f)
f.close()

In [18]:
with open('models/over_xgb.pkl', 'wb') as f:
    pickle.dump(o_grid_search, f)
f.close()