In [6]:
import pandas as pd
import numpy as np
import pickle
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [7]:
# Load data
# Read the training and validation frames from their pickle files.
# NOTE(review): the file name 'smote_val.pkl' suggests the validation set may have been
# SMOTE-resampled; if so, validation scores will not reflect the real class balance — confirm.
train, validation = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('data/sampled_train.pkl', 'data/smote_val.pkl')
)

In [8]:
# Split X and y
# 'fraud_bool' is the label column; every other column is kept as a feature.
X_train, y_train = train.drop(columns='fraud_bool'), train['fraud_bool']
X_validation, y_validation = validation.drop(columns='fraud_bool'), validation['fraud_bool']

In [9]:
# Create lgbm datasets
train_data = lgb.Dataset(X_train, label=y_train)
# LightGBM's Dataset docs state that a validation Dataset should use the training
# Dataset as its reference, so pass it explicitly rather than relying on the trainer.
validation_data = lgb.Dataset(X_validation, label=y_validation, reference=train_data)

In [None]:
# Optuna hyperparameter tuning
# The search runs 200 training trials, so it is opt-in: flip this flag to re-run it.
RUN_OPTUNA_SEARCH = False


def objective(trial):
    """Train one LightGBM booster with trial-sampled hyperparameters.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the sampled values for num_leaves, max_depth, learning_rate
        and feature_fraction.

    Returns
    -------
    float
        ROC AUC of the booster's predictions on the validation set.
    """
    param = {
        'objective': 'binary',
        # Log the same metric the study maximises.
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0, log=True),
    }

    gbm = lgb.train(param, train_data, valid_sets=[validation_data])
    preds = gbm.predict(X_validation)
    # Score the raw predictions: rounding them to 0/1 first would reduce the ROC
    # curve to a single threshold and discard the ranking information AUC measures.
    return roc_auc_score(y_validation, preds)


if RUN_OPTUNA_SEARCH:
    # Imported lazily so the rest of the notebook runs without optuna installed.
    import optuna

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=200)

    best_params = study.best_params
    print(best_params)

In [None]:
# Train model
# Hyperparameters for the final classifier, one per line so each value is easy to audit.
params = {
    "objective": "binary",
    "metric": "auc",  # spelled as listed in the LightGBM parameter docs
    'num_leaves': 230,
    'max_depth': 25,
    'learning_rate': 0.07101316862507606,
    'feature_fraction': 0.25424329182152305,
}
model = lgb.LGBMClassifier(**params)
# eval_set is documented as a list of (X, y) pairs, so wrap the validation pair in a list.
# No early stopping is configured here; the validation pair is only passed for evaluation.
model.fit(X_train, y_train, eval_set=[(X_validation, y_validation)])

In [None]:
# Save model
# Use a context manager so the file handle is flushed and closed even if pickling fails.
with open('lgbm_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [14]:
# Get feature importances
# Pair each training column with the fitted model's importance score,
# then list the features from most to least important.
feature_names = X_train.columns
feature_importances = model.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

print(importance_df)

                             Feature  Importance
8                        velocity_4w        1937
22                days_since_request        1919
23                      zip_count_4w        1638
7                       velocity_24h        1542
6                        velocity_6h        1531
1              name_email_similarity        1491
5             intended_balcon_amount        1343
18         session_length_in_minutes        1332
11                 credit_risk_score        1300
9               bank_branch_count_8w        1174
10  date_of_birth_distinct_emails_4w        1021
3       current_address_months_count         912
15                 bank_months_count         901
21            income_to_credit_limit         864
0                             income         426
4                       customer_age         394
24             proposed_credit_limit         313
48                   device_os_other         205
46                   device_os_linux         198
20         device_di