In [18]:
import pickle

from functions.cleaning import cleaning_rf
from functions.preprocessor import preprocessor
from functions.preprocessor_t import preprocessor_t


import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Import of clean data

In [19]:
df = cleaning_rf()

In [20]:
X = df.drop(columns=['EVENT_LABEL'])
y = df['EVENT_LABEL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0, stratify=y)

In [4]:
df.head()

Unnamed: 0,transaction_amt,transaction_adj_amt,historic_velocity,currency,cvv,signature_image,transaction_type,transaction_env,tranaction_initiate,inital_amount,EVENT_LABEL,day,month,browser,os,acc_age,d_last_logon
0,2167.0,56.0,2572.0,cad,D,F,U,X,O,13646.0,0,Tue,Dec,Opera,Windows,4,3
1,2045.0,48.0,4517.0,cad,X,X,H,W,J,11930.0,0,Tue,Jun,Opera,Linux,3,1
2,2892.0,61.0,5007.0,cad,X,Q,X,X,T,7412.0,0,Mon,May,Mozilla,Linux,7,5
3,3040.0,28.0,6022.0,usd,G,G,C,N,M,4623.0,0,Thu,Mar,Mozilla,Macintosh,3,2
4,2976.0,66.0,2600.0,cad,X,F,F,G,K,1905.0,0,Sat,Mar,Mozilla,Linux,4,2


## Preprocessor

In [21]:
preprocessor = preprocessor(X_train)
preprocessor_t = preprocessor_t(X_train)

## Model

In [22]:
RandomForest = RandomForestClassifier(n_jobs=-1, random_state=42)

# Random forest classifier

## Undersampling

### Under sampler

In [23]:
under_sampler = RandomUnderSampler()

### Pipeline

In [24]:
u_rf = Pipeline([('under_sampler', under_sampler),
                 ('preprocessor', preprocessor_t),
                 ('RandomForest', RandomForest)])

In [9]:
results = cross_validate(u_rf, X_train, y_train,cv=5, return_train_score = True, scoring='recall_macro')
print(f'Average Train Score: {results['train_score'].mean()}')
print(f'Average Test Score: {results['test_score'].mean()}')



Average Train Score: 0.963662173936559
Average Test Score: 0.8964692155168958


In [25]:
u_grid = {'RandomForest__max_depth': [5, 8, 10, 15],
          'RandomForest__max_features': [3, 6, 9, 13],
          'RandomForest__min_samples_split': [10, 20, 25],
          'RandomForest__min_samples_leaf': [10, 15]}

In [26]:
u_grid_search = GridSearchCV(estimator=u_rf, param_grid=u_grid, cv=5, scoring='recall_macro', return_train_score=True)
u_grid_search.fit(X_train, y_train)
print(f'Average Train Score: {u_grid_search.cv_results_['mean_train_score'].mean()}')
print(f'Average Test Score: {u_grid_search.cv_results_['mean_test_score'].mean()}')

Average Train Score: 0.8610666499645032
Average Test Score: 0.8524938119208914


In [27]:
u_grid_search.best_estimator_

## Oversampling

### Over sampler

In [13]:
over_sampler = SMOTE()

### Pipeline

In [14]:
o_rf = Pipeline([('preprocessor', preprocessor),
                 ('over_sampler', over_sampler),
                 ('RandomForest', RandomForest)])

In [15]:
results = cross_validate(o_rf, X_train, y_train, return_train_score = True, scoring='recall_macro')
print(f'Average Train Score: {results['train_score'].mean()}')
print(f'Average Test Score: {results['test_score'].mean()}')



Average Train Score: 1.0
Average Test Score: 0.7981878284095776


In [16]:
o_grid = {'RandomForest__max_depth': [5, 8, 12],
          'RandomForest__max_features': [6, 9, 13],
          'RandomForest__min_samples_split': [100, 200],
          'RandomForest__min_samples_leaf': [50, 100]}

In [None]:
o_grid_search = GridSearchCV(estimator=o_rf, param_grid=o_grid, cv=3, scoring='recall_macro', return_train_score=True)
o_grid_search.fit(X_train, y_train)
print(f'Average Train Score: {o_grid_search.cv_results_['mean_train_score'].mean()}')
print(f'Average Test Score: {o_grid_search.cv_results_['mean_test_score'].mean()}')

In [1]:
o_grid_search.best_estimator_

NameError: name 'o_grid_search' is not defined

# Export

In [28]:
with open('models/under_rf.pkl', 'wb') as f:
    pickle.dump(u_grid_search, f)
f.close()

In [None]:
with open('models/over_rf.pkl', 'wb') as f:
    pickle.dump(o_grid_search, f)
f.close()