# **Libraries**

In [2]:
import importlib
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from lightgbm import LGBMClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import shuffle

import functions

importlib.reload(functions)

# **Display**

In [3]:
%matplotlib inline

# Pandas display limits for inspecting wide frames in cell outputs.
# max_rows is set once: the earlier 300000 value was immediately overridden
# by the 200 set at the end of this cell, so only 200 ever took effect.
pd.set_option('display.max_rows', 200)
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

# Silence every warning for the rest of the session. This blanket filter
# already covers FutureWarning, so no category-specific filter is needed.
# NOTE(review): this also hides deprecation notices from pandas/sklearn;
# consider narrowing it once the notebook is stable.
warnings.filterwarnings("ignore")

# **Load Data**

In [4]:
# Location of the raw CSV.
# TODO: this is a machine-specific absolute path; point it at a configurable
# data directory so the notebook runs on other machines.
DATA_PATH = r"C:\Users\Dell\Documents\AI\Risk\Data\Data\Data\data.csv"

# index_col=False keeps pandas from using the first column as the index.
data = pd.read_csv(DATA_PATH, index_col=False)

# Treat +/-inf as missing so the imputation cells below fill them too.
# This replaces the global `use_inf_as_na` option, which pandas deprecated in
# 2.1 and later removed; the deprecation notice recommends converting inf to
# NaN explicitly instead.
# NOTE(review): the old option was session-wide; this converts only the infs
# present at load time. Re-apply it if a later step can introduce inf values.
data = data.replace([np.inf, -np.inf], np.nan)

## **Reduce Memory Usage**

In [4]:
# Shrink the frame's memory footprint with the helper from the local
# `functions` module; it prints the before/after usage shown in the output.
# NOTE(review): presumably works by downcasting column dtypes -- confirm in
# functions.py, since aggressive float downcasting can lose precision.
data = functions.reduce_memory_usage(data)

Memory usage of dataframe is 296.02 MB
Memory usage after optimization is: 104.37 MB
Decreased by 64.7%


## **Variables**

In [5]:
# Seed passed to the shuffle and the train/test split so they are repeatable.
random_state: int = 101
# Name of the label column in `data`.
target: str = 'TARGET'

## **Imputation**

In [6]:
# Fill missing values with the sentinel -99999 so they stay distinguishable
# from real values. No variable list is given, so the imputer chooses the
# columns itself.
# NOTE(review): presumably that is every numeric column, including the
# target -- confirm against the feature_engine docs.
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
data = ani.fit_transform(data)

In [7]:
# Give missing categorical values their own explicit 'UNKNOWN' label instead
# of dropping rows or guessing a category.
ci = CategoricalImputer(imputation_method='missing', fill_value='UNKNOWN')
data = ci.fit_transform(data)

## **WoE Encoder**

In [8]:
woe = WoEEncoder(fill_value=0.0001)
woe.fit(data, data[target])
data = woe.transform(data)

## **Train Test Split**

In [9]:
X = data.drop(target, axis=1)
y = data[target]

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

### **LGBM**

In [18]:
# Baseline gradient-boosted model; hyperparameters are spelled out in one
# place so they are easy to tune.
lgbm_params = {
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'max_depth': -1,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'verbose': -1,
}

model = LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

# Score the held-out split on the probability of the positive class.
y_prob = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_prob)

print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.77


## **Feature Importance**

In [None]:
# Pair each column with the importance the fitted LightGBM model assigned to
# it, most important first.
feature_names = X.columns
feature_importance = model.feature_importances_

importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

importance_df

## **Recursive Feature Elimination**

In [None]:
# Standardise into *new* variables. Overwriting X_train / X_test (as this cell
# used to) made it non-idempotent and replaced the DataFrames with bare NumPy
# arrays, losing the column names for every later cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# A linear kernel exposes `coef_`, which RFECV uses to rank features;
# probability=True is needed for the predict_proba call below.
# SVC comes from sklearn.svm and is imported in the top import cell.
# NOTE(review): RFECV with step=1 and cv=5 refits the SVC many times; on a
# large training set this may be impractically slow -- consider a subsample
# or a larger step.
svc = SVC(kernel="linear", probability=True)

rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='roc_auc')
rfecv.fit(X_train_scaled, y_train)

print(f"Optimal number of features: {rfecv.n_features_}")
print(f"Selected features: {rfecv.support_}")
print(f"Feature ranking: {rfecv.ranking_}")

# Evaluate the estimator refitted on the selected features on held-out data.
y_prob = rfecv.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_test, y_prob)

print(f"Test set AUC: {auc:.4f}")

# Mean cross-validated AUC per candidate feature count (step=1, so the i-th
# entry corresponds to i features).
mean_cv_auc = rfecv.cv_results_['mean_test_score']

plt.figure()
plt.title("RFECV: Recursive Feature Elimination with Cross-Validation")
plt.xlabel("Number of Features Selected")
plt.ylabel("Cross Validation Score (AUC)")
plt.plot(range(1, len(mean_cv_auc) + 1), mean_cv_auc, marker='o')
plt.axvline(rfecv.n_features_, linestyle='--', color='red')  # chosen count
plt.show()


In [None]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np

# Column names in the same order as the columns of X_train; used at the end
# of the cell to turn RFECV's boolean support mask into feature names.
# This replaces a `[...]` placeholder (a one-element list holding Ellipsis),
# which was truthy and made the mask indexing below raise IndexError.
feature_names = list(X.columns)

# Standardise the features (kept from the original cell), fitting the scaler
# on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The support mask has one entry per training column, so the name list must
# line up with it exactly.
assert len(feature_names) == X_train_scaled.shape[1], "feature_names does not match X_train columns"

# Recursive feature elimination with 5-fold CV, dropping one feature per step
# and scoring each candidate subset by ROC AUC.
lgb_clf = LGBMClassifier()
rfecv = RFECV(estimator=lgb_clf, step=1, cv=5, scoring='roc_auc')
rfecv.fit(X_train_scaled, y_train)

print("Optimal number of features (LightGBM): ", rfecv.n_features_)

# Plot the cross-validation score against the number of features kept.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (roc_auc)")

# Older scikit-learn exposes grid_scores_; newer releases only have
# cv_results_['mean_test_score'].
if hasattr(rfecv, 'grid_scores_'):
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
else:
    plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), rfecv.cv_results_['mean_test_score'])

plt.show()

# Report the surviving features by name, or by position if no names exist.
if feature_names:
    selected_features_lgb = np.array(feature_names)[rfecv.support_]
    print("Selected features: ", selected_features_lgb)
else:
    print("Selected feature indices: ", np.where(rfecv.support_)[0])


In [None]:
# Same RFECV run as the LightGBM cell before it, with LightGBM logging
# silenced.
# NOTE(review): the two cells are near-duplicates; consider keeping only one.

# Column names in the same order as the columns of X_train; used at the end
# of the cell to turn RFECV's boolean support mask into feature names.
# This replaces a `[...]` placeholder (a one-element list holding Ellipsis),
# which was truthy and made the mask indexing below raise IndexError.
feature_names = list(X.columns)

lgb_clf = LGBMClassifier(verbosity=-1)

# Standardise the features (kept from the original cell), fitting the scaler
# on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The support mask has one entry per training column, so the name list must
# line up with it exactly.
assert len(feature_names) == X_train_scaled.shape[1], "feature_names does not match X_train columns"

# Recursive feature elimination with 5-fold CV, dropping one feature per step
# and scoring each candidate subset by ROC AUC.
rfecv = RFECV(estimator=lgb_clf, step=1, cv=5, scoring='roc_auc')
rfecv.fit(X_train_scaled, y_train)

print("Optimal number of features (LightGBM): ", rfecv.n_features_)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (roc_auc)")

# Older scikit-learn exposes grid_scores_; newer releases only have
# cv_results_['mean_test_score'].
if hasattr(rfecv, 'grid_scores_'):
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
else:
    plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), rfecv.cv_results_['mean_test_score'])

plt.show()

# Report the surviving features by name, or by position if no names exist.
if feature_names:
    selected_features_lgb = np.array(feature_names)[rfecv.support_]
    print("Selected features: ", selected_features_lgb)
else:
    print("Selected feature indices: ", np.where(rfecv.support_)[0])
