# **Libraries**

In [1]:
import pandas as pd
import numpy as np

from feature_engine.selection import DropCorrelatedFeatures
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

from feature_engine.encoding import WoEEncoder

import functions
import importlib
importlib.reload(functions)

import time

import warnings

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Data**

## **Load Data**

In [3]:
app_train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_train.csv",
    index_col=False
)

install_pay = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\installments_payments.csv",
    index_col=False
)

app_test = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_test.csv",
    index_col=False
)

# **Variables**

In [4]:
random_state = 101
target = 'TARGET'

## **Remove Empty Features**

In [5]:
import pandas as pd
from feature_engine.selection import DropConstantFeatures
X = pd.DataFrame(dict(x1 = [1,1,1,1],
                    x2 = ["a", "a", "b", "c"],
                    x3 = [True, False, False, True]))
dcf = DropConstantFeatures()
dcf.fit_transform(X)

Unnamed: 0,x2,x3
0,a,True
1,a,False
2,b,False
3,c,True


## **Drop Collinear Features**

In [None]:
dcf = DropCorrelatedFeatures(threshold=0.7)
previous_apps = dcf.fit_transform(previous_apps)
app_train = dcf.fit_transform(app_train)

## **Reduce Memory Usage**

In [43]:
previous_apps = functions.reduce_memory_usage(previous_apps)

app_train = functions.reduce_memory_usage(app_train)

Memory usage of dataframe is 407.77 MB
Memory usage after optimization is: 277.15 MB
Decreased by 32.0%
Memory usage of dataframe is 187.69 MB
Memory usage after optimization is: 67.74 MB
Decreased by 63.9%


## **Missing Values**

In [34]:
functions.MissingValues(previous_apps)

Unnamed: 0,NumberMissing,PercentageMissing,DataType
RATE_INTEREST_PRIMARY,1664263,99.64,float16
RATE_INTEREST_PRIVILEGED,1664263,99.64,float16
RATE_DOWN_PAYMENT,895844,53.64,float16
AMT_DOWN_PAYMENT,895844,53.64,float32
NAME_TYPE_SUITE,820405,49.12,object
DAYS_LAST_DUE,673065,40.3,float32
DAYS_FIRST_DUE,673065,40.3,float32
DAYS_FIRST_DRAWING,673065,40.3,float32
NFLAG_INSURED_ON_APPROVAL,673065,40.3,float16
AMT_ANNUITY,372235,22.29,float32


## **Drop Features (More than 50% Missing)**

In [None]:
columns = ['RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'RATE_DOWN_PAYMENT', 'AMT_DOWN_PAYMENT']
previous_apps = previous_apps.drop(columns, axis=1)

In [13]:
num_value = -99999

for col in previous_apps.select_dtypes(include=['float16', 'float32', 'float64']).columns:
    previous_apps[col].fillna(num_value, inplace=True)

In [14]:
num_value = -99999

for col in app_train.select_dtypes(include=['float16', 'float32', 'float64']).columns:
    app_train[col].fillna(num_value, inplace=True)

In [15]:
cat_value = 'UNKNOWN'

for col in previous_apps.select_dtypes(include=['object']).columns:
    previous_apps[col].fillna(cat_value, inplace=True)

In [16]:
cat_value = 'UNKNOWN'

for col in app_train.select_dtypes(include=['object']).columns:
    app_train[col].fillna(cat_value, inplace=True)

## **Drop Constant Features**

In [None]:
dcf = DropConstantFeatures()
dcf.fit_transform(app_train)

## **Aggregation**

In [30]:
aggregated_previous_app = previous_apps.groupby('SK_ID_CURR').agg({
    'SK_ID_PREV': 'count',  
    'AMT_ANNUITY': 'mean',    
    'DAYS_DECISION': ['mean', 'max', 'min'],  
    'CNT_PAYMENT': ['sum'],
    'DAYS_FIRST_DUE': lambda x: x.max() - x.min(),
    'DAYS_LAST_DUE': lambda x: x.max() - x.min()
}).reset_index()

# Flatten multi-level columns generated by the aggregations
aggregated_previous_app.columns = ['_'.join(col).strip() if type(col) is tuple else col for col in aggregated_previous_app.columns]

aggregated_previous_app = aggregated_previous_app.rename(columns={
    'SK_ID_CURR_': 'SK_ID_CURR', 
    'SK_ID_PREV_count': 'NUM_PREVIOUS_APPLICATIONS',
    'AMT_ANNUITY_mean': 'AVG_ANNUITY_AMOUNT',
    'DAYS_DECISION_mean': 'AVG_DAYS_DECISION',
    'DAYS_DECISION_max': 'MAX_DAYS_DECISION',
    'DAYS_DECISION_min': 'MIN_DAYS_DECISION',
    'CNT_PAYMENT_sum': 'SUM_CNT_PAYMENT',
    'DAYS_FIRST_DUE_<lambda>': 'RANGE_DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_<lambda>': 'RANGE_DAYS_LAST_DUE',
})

aggregated_previous_app.head()

Unnamed: 0,SK_ID_CURR,NUM_PREVIOUS_APPLICATIONS,AVG_ANNUITY_AMOUNT,AVG_DAYS_DECISION,MAX_DAYS_DECISION,MIN_DAYS_DECISION,SUM_CNT_PAYMENT,RANGE_DAYS_FIRST_DUE,RANGE_DAYS_LAST_DUE
0,100001,1,3951.0,-1740.0,-1740,-1740,8.0,0.0,0.0
1,100002,1,9251.775,-606.0,-606,-606,24.0,0.0,0.0
2,100003,3,56553.99,-1305.0,-746,-2341,30.0,1594.0,1444.0
3,100004,1,5357.25,-815.0,-815,-815,4.0,0.0,0.0
4,100005,2,-47592.9,-536.0,-315,-757,-99987.0,99293.0,99533.0


## **Merge Application Train and Previous Application**

In [31]:
data = app_train.merge(aggregated_previous_app, on='SK_ID_CURR', how='left')

## **Drop Columns**

In [32]:
columns = ['FLAG_DOCUMENT_5', 
                    'FLAG_DOCUMENT_17',
                    'FLAG_DOCUMENT_15',
                    'FLAG_DOCUMENT_12',
                    'FLAG_DOCUMENT_14',
                    'FLAG_DOCUMENT_2',
                    'FLAG_DOCUMENT_10',
                    'FLAG_DOCUMENT_20',
                    'FLAG_DOCUMENT_19',
                   'LIVE_CITY_NOT_WORK_CITY', 
                   'AMT_REQ_CREDIT_BUREAU_DAY',
                   'FLAG_DOCUMENT_8',
                   'FLAG_MOBIL',
                   'FLAG_CONT_MOBILE',
                   'REG_REGION_NOT_LIVE_REGION',
                   'FLAG_DOCUMENT_9',
                   'FLAG_DOCUMENT_4',
                   'FLAG_DOCUMENT_7',
                   'FLAG_EMP_PHONE',
                   'REG_REGION_NOT_WORK_REGION',
                   'HOUSETYPE_MODE',
                   'FLOORSMIN_MODE',
                    'FLOORSMIN_MEDI',
                    'ENTRANCES_MEDI',
                    'FLOORSMAX_MODE'
                   ] 
data = data.drop(columns=columns)

## **WoE Encoder**

In [33]:
woe = WoEEncoder(fill_value=0.0001)
woe.fit(data, data[target])
data = woe.transform(data)

## **Train Test Split**

In [34]:
X = data.drop('TARGET', axis=1)
y = data['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

### **LGBM**

In [35]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [36]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
36,EXT_SOURCE_3,191
35,EXT_SOURCE_2,180
34,EXT_SOURCE_1,166
16,DAYS_BIRTH,159
7,AMT_CREDIT,136
8,AMT_ANNUITY,124
101,SUM_CNT_PAYMENT,112
9,AMT_GOODS_PRICE,104
17,DAYS_EMPLOYED,99
100,MIN_DAYS_DECISION,89


## **Drop Columns**

In [None]:
columns = ['FLAG_DOCUMENT_5', 
                    'FLAG_DOCUMENT_17',
                    'FLAG_DOCUMENT_15',
                    'FLAG_DOCUMENT_12',
                    'FLAG_DOCUMENT_14',
                    'FLAG_DOCUMENT_2',
                    'FLAG_DOCUMENT_10',
                    'FLAG_DOCUMENT_20',
                    'FLAG_DOCUMENT_19',
                   'LIVE_CITY_NOT_WORK_CITY', 
                   'AMT_REQ_CREDIT_BUREAU_DAY',
                   'FLAG_DOCUMENT_8',
                   'FLAG_MOBIL',
                   'FLAG_CONT_MOBILE',
                   'REG_REGION_NOT_LIVE_REGION',
                   'FLAG_DOCUMENT_9',
                   'FLAG_DOCUMENT_4',
                   'FLAG_DOCUMENT_7',
                   'FLAG_EMP_PHONE',
                   'REG_REGION_NOT_WORK_REGION',
                   'HOUSETYPE_MODE',
                   'FLOORSMIN_MODE',
                    'FLOORSMIN_MEDI',
                    'ENTRANCES_MEDI',
                    'FLOORSMAX_MODE'
                   ] 
data = data.drop(columns=columns)

## **Recurssive Feature Elimination**

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from feature_engine.selection import RecursiveFeatureElimination
X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000],
                    x2 = [2,4,3,1,2,2],
                    x3 = [1,1,1,0,0,0],
                    x4 = [1,2,1,1,0,1],
                    x5 = [1,1,1,1,1,1]))
y = pd.Series([1,0,0,1,1,0])
rfe = RecursiveFeatureElimination(RandomForestClassifier(random_state=2), cv=2)
rfe.fit_transform(X, y)

Unnamed: 0,x2
0,2
1,4
2,3
3,1
4,2
5,2


In [None]:
# Import necessary libraries
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from feature_engine.selection import RecursiveFeatureElimination
from sklearn.metrics import make_scorer, roc_auc_score

# Load a sample dataset (Breast cancer dataset)
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the model for feature selection (Logistic Regression)
model = LogisticRegression(max_iter=1000, random_state=42)

# Define a custom scorer using AUC
auc_scorer = make_scorer(roc_auc_score, needs_proba=True)

# Initialize Recursive Feature Elimination with AUC as scoring
rfe = RecursiveFeatureElimination(
    estimator=model,
    variables=None,  # If None, RFE will evaluate all numerical features
    scoring=auc_scorer,  # Use AUC for scoring
    threshold=0.01,  # Feature importance threshold to drop variables
    cv=3,  # Number of cross-validation folds
    verbose=1,  # Display the process
)

# Fit the RFE
rfe.fit(X_train, y_train)

# Transform the datasets to retain only selected features
X_train_selected = rfe.transform(X_train)
X_test_selected = rfe.transform(X_test)

# Display results
print("Original number of features:", X_train.shape[1])
print("Selected number of features:", X_train_selected.shape[1])


In [6]:
# Import necessary libraries
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from feature_engine.selection import RecursiveFeatureElimination
from sklearn.metrics import make_scorer, roc_auc_score

# Load a sample dataset (Breast cancer dataset)
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the model for feature selection (Logistic Regression)
model = LogisticRegression(max_iter=1000, random_state=42)

# Define a custom scorer using AUC with response_method='predict_proba' (for ROC AUC scoring)
auc_scorer = make_scorer(roc_auc_score, response_method='predict_proba')

# Initialize Recursive Feature Elimination with AUC as scoring
rfe = RecursiveFeatureElimination(
    estimator=model,
    variables=None,  # If None, RFE will evaluate all numerical features
    scoring=auc_scorer,  # Use AUC for scoring
    threshold=0.01,  # Feature importance threshold to drop variables
    cv=3  # Number of cross-validation folds
    n_features_to_select=3
)

# Fit the RFE
rfe.fit(X_train, y_train)

# Transform the datasets to retain only selected features
X_train_selected = rfe.transform(X_train)
X_test_selected = rfe.transform(X_test)

# Display results
print("Original number of features:", X_train.shape[1])
print("Selected number of features:", X_train_selected.shape[1])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Original number of features: 30
Selected number of features: 1
