In [None]:
import pickle
import os
import lightgbm as lgb

from functions.cleaning import cleaning_rf

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Import of clean data

In [None]:
# Load the dataset through the project helper. The cells below treat the result
# as a DataFrame that contains the target column 'EVENT_LABEL'.
df = cleaning_rf()

In [None]:
# Separate the target column from the feature matrix.
y = df['EVENT_LABEL']
X = df.drop(columns=['EVENT_LABEL'])

# Hold out 30% of the rows for testing. The split is stratified on the target
# and seeded so that it is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
# Preview the first rows of the loaded data as a quick sanity check.
df.head()

# LightGBM

## Undersampling

### Preprocess

In [None]:
# Column groups: object-dtype columns are treated as categorical features,
# numeric-dtype columns as numerical features.
cat_labels_u_lgb = X_train.select_dtypes('object').columns
num_labels_u_lgb = X_train.select_dtypes('number').columns

# Seed the sampler like every other stochastic component in this notebook;
# without a seed the resampled rows (and therefore the scores) change per run.
under_sampler_u_lgb = RandomUnderSampler(random_state=42)

# One-hot encode categoricals (dropping the first level of each feature and
# ignoring categories unseen during fit) and standardise the numeric columns.
cat_preprocessor_u_lgb = OneHotEncoder(drop='first', handle_unknown='ignore')
num_preprocessor_u_lgb = StandardScaler()

preprocessor_u_lgb = ColumnTransformer([('cat', cat_preprocessor_u_lgb, cat_labels_u_lgb),
                                        ('num', num_preprocessor_u_lgb, num_labels_u_lgb)])

### Model

In [None]:
# LightGBM classifier for the undersampling pipeline: fixed seed, all CPU cores.
model_u_lgb = lgb.LGBMClassifier(
    random_state=42,
    n_jobs=-1,
)

### Pipeline

In [None]:
# Undersampling pipeline: resample -> preprocess -> classify.
# The step labels double as prefixes for parameter-grid keys (e.g. 'LGB__...').
u_lgb = Pipeline(steps=[
    ('under_sampler', under_sampler_u_lgb),
    ('preprocessor', preprocessor_u_lgb),
    ('LGB', model_u_lgb),
])

In [None]:
# Baseline for the undersampling pipeline: cross-validated macro recall on the
# training split, reported for both the train and the validation folds.
results = cross_validate(u_lgb, X_train, y_train, return_train_score=True, scoring='recall_macro')

# Double-quoted f-strings: reusing the enclosing quote character inside the
# replacement field is a SyntaxError on Python versions before 3.12.
print(f"Average Train Score: {results['train_score'].mean()}")
print(f"Average Test Score: {results['test_score'].mean()}")

In [None]:
# Hyper-parameter grid for the undersampling pipeline. It is currently empty,
# so no parameter values are varied by the search that consumes it.
# TODO: add entries keyed by '<step>__<param>' once candidates are chosen.
u_grid = {}

In [None]:
# Grid search over `u_grid` for the undersampling pipeline, scored by macro
# recall with 5-fold cross-validation.
u_grid_search = GridSearchCV(estimator=u_lgb, param_grid=u_grid, cv=5, scoring='recall_macro', return_train_score=True)
u_grid_search.fit(X_train, y_train)

# cv_results_['mean_*_score'] holds one fold-averaged score per parameter
# candidate; .mean() therefore averages across all candidates.
# Double-quoted f-strings: reusing the enclosing quote character inside the
# replacement field is a SyntaxError on Python versions before 3.12.
print(f"Average Train Score: {u_grid_search.cv_results_['mean_train_score'].mean()}")
print(f"Average Test Score: {u_grid_search.cv_results_['mean_test_score'].mean()}")

## Oversampling

### Preprocess

In [None]:
# Column groups: object-dtype columns are treated as categorical features,
# numeric-dtype columns as numerical features.
cat_labels_o_lgb = X_train.select_dtypes('object').columns
num_labels_o_lgb = X_train.select_dtypes('number').columns

# Seed the sampler like every other stochastic component in this notebook;
# without a seed the resampled rows (and therefore the scores) change per run.
over_sampler_o_lgb = RandomOverSampler(random_state=42)
# Legacy alias: the original (misleading) name is still referenced by the
# pipeline cell, so it stays bound to the same over-sampler object.
under_sampler_o_lgb = over_sampler_o_lgb

# One-hot encode categoricals (dropping the first level of each feature and
# ignoring categories unseen during fit) and standardise the numeric columns.
cat_preprocessor_o_lgb = OneHotEncoder(drop='first', handle_unknown='ignore')
num_preprocessor_o_lgb = StandardScaler()

preprocessor_o_lgb = ColumnTransformer([('cat', cat_preprocessor_o_lgb, cat_labels_o_lgb),
                                        ('num', num_preprocessor_o_lgb, num_labels_o_lgb)])

### Model

In [None]:
# LightGBM classifier for the oversampling pipeline: fixed seed, all CPU cores.
model_o_lgb = lgb.LGBMClassifier(
    random_state=42,
    n_jobs=-1,
)

### Pipeline

In [None]:
# Oversampling pipeline: resample -> preprocess -> classify.
# NOTE(review): the first step is labelled 'under_sampler' although the object
# it holds is created with RandomOverSampler. The label is left untouched here
# because step labels form the prefixes of parameter-grid keys.
o_lgb = Pipeline(steps=[
    ('under_sampler', under_sampler_o_lgb),
    ('preprocessor', preprocessor_o_lgb),
    ('LGB', model_o_lgb),
])

In [None]:
# Baseline for the oversampling pipeline: cross-validated macro recall on the
# training split, reported for both the train and the validation folds.
results = cross_validate(o_lgb, X_train, y_train, return_train_score=True, scoring='recall_macro')

# Double-quoted f-strings: reusing the enclosing quote character inside the
# replacement field is a SyntaxError on Python versions before 3.12.
print(f"Average Train Score: {results['train_score'].mean()}")
print(f"Average Test Score: {results['test_score'].mean()}")

In [None]:
# Hyper-parameter grid for the oversampling pipeline. It is currently empty,
# so no parameter values are varied by the search that consumes it.
# TODO: add entries keyed by '<step>__<param>' once candidates are chosen.
o_grid = {}

In [None]:
# Grid search over `o_grid` for the oversampling pipeline, scored by macro
# recall with 5-fold cross-validation.
# The estimator must be `o_lgb`: passing `u_lgb` here would silently tune the
# undersampling pipeline a second time.
o_grid_search = GridSearchCV(estimator=o_lgb, param_grid=o_grid, cv=5, scoring='recall_macro', return_train_score=True)
o_grid_search.fit(X_train, y_train)

# cv_results_['mean_*_score'] holds one fold-averaged score per parameter
# candidate; .mean() therefore averages across all candidates.
# Double-quoted f-strings: reusing the enclosing quote character inside the
# replacement field is a SyntaxError on Python versions before 3.12.
print(f"Average Train Score: {o_grid_search.cv_results_['mean_train_score'].mean()}")
print(f"Average Test Score: {o_grid_search.cv_results_['mean_test_score'].mean()}")

# Export

In [None]:
# Persist the fitted undersampling grid search to disk.
# open() does not create missing parent directories, so make sure 'models/'
# exists first instead of failing with FileNotFoundError.
os.makedirs('models', exist_ok=True)
with open('models/under_lgb.pkl', 'wb') as f:
    # The with-block closes the file on exit; no explicit close() is needed.
    pickle.dump(u_grid_search, f)

In [None]:
# Persist the fitted oversampling grid search to disk.
# open() does not create missing parent directories, so make sure 'models/'
# exists first instead of failing with FileNotFoundError.
os.makedirs('models', exist_ok=True)
with open('models/over_lgb.pkl', 'wb') as f:
    # The with-block closes the file on exit; no explicit close() is needed.
    pickle.dump(o_grid_search, f)