# **Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import ArbitraryNumberImputer

from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler

import functions
import importlib
importlib.reload(functions)

import warnings

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")

warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

# **Load Data**

In [3]:
pd.set_option('use_inf_as_na', True)

data = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\Data\data 27.csv",
    index_col=False
)

## **Random Sampling**

In [4]:
data = data.sample(frac=0.2, random_state=42)

## **Reduce Memory Usage**

In [5]:
data = functions.reduce_memory_usage(data)

Memory usage of dataframe is 19.35 MB
Memory usage after optimization is: 6.69 MB
Decreased by 65.4%


## **Variables**

In [6]:
random_state = 101
target = 'TARGET'

## **Imputation**

In [7]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(data)
data = ani.transform(data)

## **Train Test Split**

In [8]:
X = data.drop(target, axis=1)
y = data[target]

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

### **LGBM**

In [9]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.74


## **Feature Importance**

In [None]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

importance_df

## **Recursive Feature Elimination**

In [None]:
lgb_clf = LGBMClassifier(verbosity=-1)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rfecv = RFECV(estimator=lgb_clf, step=1, cv=5, scoring='roc_auc')

rfecv.fit(X_train_scaled, y_train)

print("Optimal number of features (LightGBM): ", rfecv.n_features_)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (roc_auc)")

if hasattr(rfecv, 'grid_scores_'):
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
else:
    plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), rfecv.cv_results_['mean_test_score'])

plt.show()

## **Optuna**   

In [11]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)  # Validation data for early stopping

   
    gbm = lgb.train(
        param,
        train_data,
        valid_sets=[valid_data],  
        num_boost_round=100,
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
    )

   
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    roc_auc = roc_auc_score(y_test, y_pred)
    
    return roc_auc


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)


print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[I 2024-10-15 16:43:11,871] A new study created in memory with name: no-name-66d52709-46ac-4f45-a0fa-73c1a78265da
[I 2024-10-15 16:43:15,855] Trial 0 finished with value: 0.718619020207611 and parameters: {'lambda_l1': 0.058569250566311695, 'lambda_l2': 0.021372900849422196, 'num_leaves': 151, 'feature_fraction': 0.70685573199695, 'bagging_fraction': 0.5132852451138763, 'bagging_freq': 5, 'min_child_samples': 33}. Best is trial 0 with value: 0.718619020207611.
[I 2024-10-15 16:43:20,320] Trial 1 finished with value: 0.727229341915777 and parameters: {'lambda_l1': 0.0004901202960646275, 'lambda_l2': 0.025285362182444863, 'num_leaves': 292, 'feature_fraction': 0.5970276174404913, 'bagging_fraction': 0.5812791546512959, 'bagging_freq': 4, 'min_child_samples': 57}. Best is trial 1 with value: 0.727229341915777.
[I 2024-10-15 16:43:29,324] Trial 2 finished with value: 0.7178546001657912 and parameters: {'lambda_l1': 1.5867137160199242e-08, 'lambda_l2': 2.068378927361673e-05, 'num_leaves': 1

Best trial:
  Value: 0.7413994048215757
  Params: 
    lambda_l1: 6.882925520391644e-08
    lambda_l2: 0.0715625443249791
    num_leaves: 60
    feature_fraction: 0.6242172466372449
    bagging_fraction: 0.9388421997233861
    bagging_freq: 7
    min_child_samples: 52
